In [1]:
import pandas as pd
from collections import defaultdict
import networkx as nx
import linkpred
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA


# Load the movie table; the loop below reads its 'Title', 'Year' and 'Actors' columns.
df = pd.read_csv('imdb.csv')

data_sample1 = defaultdict(list)  # actor -> titles of movies with Year < 2016
data_sample2 = defaultdict(list)  # actor -> titles of all remaining movies (2016 sample)

for title, year, cast in zip(df['Title'], df['Year'], df['Actors']):
    sample = data_sample1 if year < 2016 else data_sample2
    for actor in cast.split(','):
        # 'Actors' is a comma-separated string, so every name after the first
        # carries leading whitespace. Normalise here so one actor maps to exactly
        # one key; otherwise "X" and " X" become two entries whose film lists
        # later overwrite each other when the graph nodes are created.
        actor = actor.strip()
        if actor:  # ignore empty fragments such as those left by a trailing comma
            sample[actor].append(title)
            
            
G_coeffs = nx.Graph(name="Graph for coefficients")
G_y = nx.Graph(name="Graph for Y")

# generating coefficients graph
# Keys of data_sample1 may differ only by surrounding whitespace. Adding a node
# under the stripped name once per variant would replace its 'actor_attributes'
# with the latest film list and lose the earlier one, so merge the lists first.
merged_sample1 = defaultdict(list)
for raw_name, titles in data_sample1.items():
    merged_sample1[raw_name.strip()].extend(titles)

for actor, movies in merged_sample1.items():
    G_coeffs.add_node(actor, actor_attributes=movies)
    own_films = set(movies)

    # Connect the new actor to every actor already in the graph that shares a film.
    # Each actor is inserted exactly once, so no edge to it can exist beforehand.
    for node, attrs in G_coeffs.nodes(data=True):
        if node == actor:
            continue
        # The edge is labelled with the first shared film in the other actor's list.
        shared = next((a for a in attrs['actor_attributes'] if a in own_films), None)
        if shared is not None:
            G_coeffs.add_edge(actor, node, film=shared)
                
# generate Y graph
# Keys of data_sample2 may differ only by surrounding whitespace. Adding a node
# under the stripped name once per variant would replace its 'actor_attributes'
# with the latest film list and lose the earlier one, so merge the lists first.
merged_sample2 = defaultdict(list)
for raw_name, titles in data_sample2.items():
    merged_sample2[raw_name.strip()].extend(titles)

for actor, movies in merged_sample2.items():
    G_y.add_node(actor, actor_attributes=movies)
    own_films = set(movies)

    # Connect the new actor to every actor already in the graph that shares a film.
    # Each actor is inserted exactly once, so no edge to it can exist beforehand.
    for node, attrs in G_y.nodes(data=True):
        if node == actor:
            continue
        # The edge is labelled with the first shared film in the other actor's list.
        shared = next((a for a in attrs['actor_attributes'] if a in own_films), None)
        if shared is not None:
            G_y.add_edge(actor, node, film=shared)

In [None]:
# whole dataset 710758 rows in train data and 304609 rows in test data, all features
# Node-level centralities are computed once for the whole graph and then looked up
# per pair. Calling nx.closeness_centrality(G, u) inside the pair loop repeats a
# full traversal for every candidate pair.
d = nx.betweenness_centrality(G_coeffs, normalized=True)
closeness = nx.closeness_centrality(G_coeffs)

# With no ebunch, jaccard_coefficient yields (u, v, score) for every pair of nodes
# that is NOT connected in G_coeffs; these pairs are the rows of the feature table.
indexes = []
jaccard_scores = []
for u, v, p in nx.jaccard_coefficient(G_coeffs):
    indexes.append((u, v))
    jaccard_scores.append(p)

degree_src = [G_coeffs.degree(u) for u, _ in indexes]  # degree centrality
degree_dst = [G_coeffs.degree(v) for _, v in indexes]
clos_src = [closeness[u] for u, _ in indexes]  # closeness centrality
clos_dst = [closeness[v] for _, v in indexes]
betw_src = [d[u] for u, _ in indexes]  # betweenness centrality
betw_dst = [d[v] for _, v in indexes]

edge_df = pd.DataFrame(index=indexes)
edge_df['Jaccard'] = jaccard_scores
edge_df['Adamic-Adar'] = [score for _, _, score in nx.adamic_adar_index(G_coeffs, indexes)]
edge_df['Pref-Attach'] = [score for _, _, score in nx.preferential_attachment(G_coeffs, indexes)]
edge_df['Res-Alloc'] = [score for _, _, score in nx.resource_allocation_index(G_coeffs, indexes)]
# Each pair gets the smaller of its two endpoint values. 'Degree_centrality' holds
# the raw node degree, not a normalised centrality.
edge_df['Degree_centrality'] = [min(i, j) for i, j in zip(degree_src, degree_dst)]
edge_df['Closeness_centrality'] = [min(i, j) for i, j in zip(clos_src, clos_dst)]
edge_df['Betweenness_ centrality'] = [min(i, j) for i, j in zip(betw_src, betw_dst)]
# Target: 1 when the pair is connected in the later-period graph G_y.
edge_df['Y'] = [1 if G_y.has_edge(u, v) else 0 for u, v in indexes]

# Fixed seed so the split, and every metric computed from it, is reproducible.
train, test = train_test_split(edge_df, test_size=0.3, random_state=0)
features = ['Jaccard', 'Adamic-Adar', 'Pref-Attach', 'Res-Alloc', 'Degree_centrality', 'Closeness_centrality', 'Betweenness_ centrality']

In [None]:
# sample: 1000 from the whole train dataset, all features
# Reduced training set: the first 1000 class-0 rows of `train` plus every class-1 row.
data_class_0 = train[train['Y'] == 0][:1000]
data_class_1 = train[train['Y'] == 1]
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
df_1000 = pd.concat([data_class_0, data_class_1])
X_train = df_1000.loc[:, features].values
y_train = df_1000.loc[:, ['Y']].values
X_test = test.loc[:, features].values
y_test = test.loc[:, ['Y']].values

# Standardise with statistics from the training sample only, then apply to both sets.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# y_* are already NumPy arrays of shape (n, 1), so flatten them directly.
# The previous `.values.ravel()` raised AttributeError because ndarrays have no `.values`.
y_train = y_train.ravel()
y_test = y_test.ravel()

In [None]:
# knn for the 1000 dataset
# Fit a 5-nearest-neighbours classifier, then score it on the test split.
knn = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

y_pred_knn_1000 = knn.predict(X_test)  # predicted class labels
y_pred_proba_knn_1000 = knn.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score

# Metrics derived from the predicted labels.
cm_knn_1000 = confusion_matrix(y_test, y_pred_knn_1000)
prec_knn_1000 = precision_score(y_test, y_pred_knn_1000)
recall_knn_1000 = recall_score(y_test, y_pred_knn_1000)
accuracy_knn_1000 = accuracy_score(y_test, y_pred_knn_1000)
# Metric derived from the probability scores.
auc_knn_1000 = roc_auc_score(y_test, y_pred_proba_knn_1000)

In [None]:
# svm for the 1000 dataset
# Linear-kernel support vector classifier, scored on the test split.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred_svm_1000 = svc.predict(X_test)
# SVC has no predict_proba unless probability=True. The signed decision value
# is a valid ranking score, so AUC is reported like for the other models.
y_score_svm_1000 = svc.decision_function(X_test)
accuracy_svm_1000 = accuracy_score(y_test, y_pred_svm_1000)
recall_svm_1000 = recall_score(y_test, y_pred_svm_1000)
prec_svm_1000 = precision_score(y_test, y_pred_svm_1000)
cm_svm_1000 = confusion_matrix(y_test, y_pred_svm_1000)
auc_svm_1000 = roc_auc_score(y_test, y_score_svm_1000)  # AUC

In [None]:
# random forest for the 1000 dataset
# Forest of depth-2 trees with a fixed seed, scored on the test split.
rf = RandomForestClassifier(random_state=0, max_depth=2)
rf.fit(X_train, y_train)

y_pred_proba_rf_1000 = rf.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score
y_pred_rf_1000 = rf.predict(X_test)  # predicted class labels

# Metrics derived from the predicted labels.
cm_rf_1000 = confusion_matrix(y_test, y_pred_rf_1000)
accuracy_rf_1000 = accuracy_score(y_test, y_pred_rf_1000)
prec_rf_1000 = precision_score(y_test, y_pred_rf_1000)
recall_rf_1000 = recall_score(y_test, y_pred_rf_1000)
# Metric derived from the probability scores.
auc_rf_1000 = roc_auc_score(y_test, y_pred_proba_rf_1000)

In [None]:
# logistic regr for the 1000 dataset
# Logistic regression with scikit-learn's default settings, scored on the test split.
lr = LogisticRegression().fit(X_train, y_train)

y_pred_lr_1000 = lr.predict(X_test)  # predicted class labels
y_pred_proba_lr_1000 = lr.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score

# Metrics derived from the predicted labels.
prec_lr_1000 = precision_score(y_test, y_pred_lr_1000)
recall_lr_1000 = recall_score(y_test, y_pred_lr_1000)
accuracy_lr_1000 = accuracy_score(y_test, y_pred_lr_1000)
cm_lr_1000 = confusion_matrix(y_test, y_pred_lr_1000)
# Metric derived from the probability scores.
auc_lr_1000 = roc_auc_score(y_test, y_pred_proba_lr_1000)

In [None]:
# decision trees for the 1000 dataset
# Seeded so the tree is reproducible: scikit-learn permutes candidate features at
# every split, so an unseeded tree can differ between runs on identical data.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
y_pred_dt_1000 = dt.predict(X_test)  # predicted class labels
y_pred_proba_dt_1000 = dt.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score
accuracy_dt_1000 = accuracy_score(y_test, y_pred_dt_1000)
recall_dt_1000 = recall_score(y_test, y_pred_dt_1000)
prec_dt_1000 = precision_score(y_test, y_pred_dt_1000)
cm_dt_1000 = confusion_matrix(y_test, y_pred_dt_1000)
auc_dt_1000 = roc_auc_score(y_test, y_pred_proba_dt_1000)  # AUC

In [None]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [None]:
# sample: 1000 from the whole train dataset, n = 6 features
# Reduced training set: the first 1000 class-0 rows of `train` plus every class-1 row.
data_class_0 = train[train['Y'] == 0][:1000]
data_class_1 = train[train['Y'] == 1]
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
df_1000 = pd.concat([data_class_0, data_class_1])
X_train = df_1000.loc[:, features].values
y_train = df_1000.loc[:, ['Y']].values
X_test = test.loc[:, features].values
y_test = test.loc[:, ['Y']].values

# NOTE(review): unlike the all-features run, the inputs are not standardised before
# PCA, so large-valued columns dominate the components - confirm this is intended.
pca = PCA(n_components=6)  # number of principal components to keep
pca.fit(X_train)  # fitted on the training sample only
ratio_1000 = pca.explained_variance_ratio_
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
y_train = y_train.ravel()  # flatten (n, 1) -> (n,)
y_test = y_test.ravel()

In [None]:
# knn for the 1000 dataset, 6 features
# Fit a 5-nearest-neighbours classifier, then score it on the test split.
knn = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

y_pred_knn_1000_6 = knn.predict(X_test)  # predicted class labels
y_pred_proba_knn_1000_6 = knn.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score

# Metrics derived from the predicted labels.
cm_knn_1000_6 = confusion_matrix(y_test, y_pred_knn_1000_6)
prec_knn_1000_6 = precision_score(y_test, y_pred_knn_1000_6)
recall_knn_1000_6 = recall_score(y_test, y_pred_knn_1000_6)
accuracy_knn_1000_6 = accuracy_score(y_test, y_pred_knn_1000_6)
# Metric derived from the probability scores.
auc_knn_1000_6 = roc_auc_score(y_test, y_pred_proba_knn_1000_6)

In [None]:
# svm for the 1000 dataset, 6 features
# Linear-kernel support vector classifier, scored on the test split.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred_svm_1000_6 = svc.predict(X_test)
# SVC has no predict_proba unless probability=True. The signed decision value
# is a valid ranking score, so AUC is reported like for the other models.
y_score_svm_1000_6 = svc.decision_function(X_test)
accuracy_svm_1000_6 = accuracy_score(y_test, y_pred_svm_1000_6)
recall_svm_1000_6 = recall_score(y_test, y_pred_svm_1000_6)
prec_svm_1000_6 = precision_score(y_test, y_pred_svm_1000_6)
cm_svm_1000_6 = confusion_matrix(y_test, y_pred_svm_1000_6)
auc_svm_1000_6 = roc_auc_score(y_test, y_score_svm_1000_6)  # AUC

In [None]:
# random forest for the 1000 dataset, 6 features
# Forest of depth-2 trees with a fixed seed, scored on the test split.
rf = RandomForestClassifier(random_state=0, max_depth=2)
rf.fit(X_train, y_train)

y_pred_proba_rf_1000_6 = rf.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score
y_pred_rf_1000_6 = rf.predict(X_test)  # predicted class labels

# Metrics derived from the predicted labels.
cm_rf_1000_6 = confusion_matrix(y_test, y_pred_rf_1000_6)
accuracy_rf_1000_6 = accuracy_score(y_test, y_pred_rf_1000_6)
prec_rf_1000_6 = precision_score(y_test, y_pred_rf_1000_6)
recall_rf_1000_6 = recall_score(y_test, y_pred_rf_1000_6)
# Metric derived from the probability scores.
auc_rf_1000_6 = roc_auc_score(y_test, y_pred_proba_rf_1000_6)

In [None]:
# logistic regr for the 1000 dataset, 6 features
# Logistic regression with scikit-learn's default settings, scored on the test split.
lr = LogisticRegression().fit(X_train, y_train)

y_pred_lr_1000_6 = lr.predict(X_test)  # predicted class labels
y_pred_proba_lr_1000_6 = lr.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score

# Metrics derived from the predicted labels.
prec_lr_1000_6 = precision_score(y_test, y_pred_lr_1000_6)
recall_lr_1000_6 = recall_score(y_test, y_pred_lr_1000_6)
accuracy_lr_1000_6 = accuracy_score(y_test, y_pred_lr_1000_6)
cm_lr_1000_6 = confusion_matrix(y_test, y_pred_lr_1000_6)
# Metric derived from the probability scores.
auc_lr_1000_6 = roc_auc_score(y_test, y_pred_proba_lr_1000_6)

In [None]:
# decision trees for the 1000 dataset, 6 features
# Seeded so the tree is reproducible: scikit-learn permutes candidate features at
# every split, so an unseeded tree can differ between runs on identical data.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
y_pred_dt_1000_6 = dt.predict(X_test)  # predicted class labels
y_pred_proba_dt_1000_6 = dt.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score
accuracy_dt_1000_6 = accuracy_score(y_test, y_pred_dt_1000_6)
recall_dt_1000_6 = recall_score(y_test, y_pred_dt_1000_6)
prec_dt_1000_6 = precision_score(y_test, y_pred_dt_1000_6)
cm_dt_1000_6 = confusion_matrix(y_test, y_pred_dt_1000_6)
auc_dt_1000_6 = roc_auc_score(y_test, y_pred_proba_dt_1000_6)  # AUC

In [None]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [None]:
# sample: 1000 from the whole train dataset, n = 5 features
# Reduced training set: the first 1000 class-0 rows of `train` plus every class-1 row.
data_class_0 = train[train['Y'] == 0][:1000]
data_class_1 = train[train['Y'] == 1]
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
df_1000 = pd.concat([data_class_0, data_class_1])
X_train = df_1000.loc[:, features].values
y_train = df_1000.loc[:, ['Y']].values
X_test = test.loc[:, features].values
y_test = test.loc[:, ['Y']].values

# NOTE(review): unlike the all-features run, the inputs are not standardised before
# PCA, so large-valued columns dominate the components - confirm this is intended.
pca = PCA(n_components=5)  # number of principal components to keep
pca.fit(X_train)  # fitted on the training sample only
ratio_1000 = pca.explained_variance_ratio_
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
y_train = y_train.ravel()  # flatten (n, 1) -> (n,)
y_test = y_test.ravel()

In [None]:
# knn for the 1000 dataset, 5 features
# Fit a 5-nearest-neighbours classifier, then score it on the test split.
knn = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

y_pred_knn_1000_5 = knn.predict(X_test)  # predicted class labels
y_pred_proba_knn_1000_5 = knn.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score

# Metrics derived from the predicted labels.
cm_knn_1000_5 = confusion_matrix(y_test, y_pred_knn_1000_5)
prec_knn_1000_5 = precision_score(y_test, y_pred_knn_1000_5)
recall_knn_1000_5 = recall_score(y_test, y_pred_knn_1000_5)
accuracy_knn_1000_5 = accuracy_score(y_test, y_pred_knn_1000_5)
# Metric derived from the probability scores.
auc_knn_1000_5 = roc_auc_score(y_test, y_pred_proba_knn_1000_5)

In [None]:
# svm for the 1000 dataset, 5 features
# Linear-kernel support vector classifier, scored on the test split.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred_svm_1000_5 = svc.predict(X_test)
# SVC has no predict_proba unless probability=True. The signed decision value
# is a valid ranking score, so AUC is reported like for the other models.
y_score_svm_1000_5 = svc.decision_function(X_test)
accuracy_svm_1000_5 = accuracy_score(y_test, y_pred_svm_1000_5)
recall_svm_1000_5 = recall_score(y_test, y_pred_svm_1000_5)
prec_svm_1000_5 = precision_score(y_test, y_pred_svm_1000_5)
cm_svm_1000_5 = confusion_matrix(y_test, y_pred_svm_1000_5)
auc_svm_1000_5 = roc_auc_score(y_test, y_score_svm_1000_5)  # AUC

In [None]:
# random forest for the 1000 dataset, 5 features
# Forest of depth-2 trees with a fixed seed, scored on the test split.
rf = RandomForestClassifier(random_state=0, max_depth=2)
rf.fit(X_train, y_train)

y_pred_proba_rf_1000_5 = rf.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score
y_pred_rf_1000_5 = rf.predict(X_test)  # predicted class labels

# Metrics derived from the predicted labels.
cm_rf_1000_5 = confusion_matrix(y_test, y_pred_rf_1000_5)
accuracy_rf_1000_5 = accuracy_score(y_test, y_pred_rf_1000_5)
prec_rf_1000_5 = precision_score(y_test, y_pred_rf_1000_5)
recall_rf_1000_5 = recall_score(y_test, y_pred_rf_1000_5)
# Metric derived from the probability scores.
auc_rf_1000_5 = roc_auc_score(y_test, y_pred_proba_rf_1000_5)

In [None]:
# logistic regr for the 1000 dataset, 5 features
# Logistic regression with scikit-learn's default settings, scored on the test split.
lr = LogisticRegression().fit(X_train, y_train)

y_pred_lr_1000_5 = lr.predict(X_test)  # predicted class labels
y_pred_proba_lr_1000_5 = lr.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score

# Metrics derived from the predicted labels.
prec_lr_1000_5 = precision_score(y_test, y_pred_lr_1000_5)
recall_lr_1000_5 = recall_score(y_test, y_pred_lr_1000_5)
accuracy_lr_1000_5 = accuracy_score(y_test, y_pred_lr_1000_5)
cm_lr_1000_5 = confusion_matrix(y_test, y_pred_lr_1000_5)
# Metric derived from the probability scores.
auc_lr_1000_5 = roc_auc_score(y_test, y_pred_proba_lr_1000_5)

In [None]:
# decision trees for the 1000 dataset, 5 features
# Seeded so the tree is reproducible: scikit-learn permutes candidate features at
# every split, so an unseeded tree can differ between runs on identical data.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
y_pred_dt_1000_5 = dt.predict(X_test)  # predicted class labels
y_pred_proba_dt_1000_5 = dt.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score
accuracy_dt_1000_5 = accuracy_score(y_test, y_pred_dt_1000_5)
recall_dt_1000_5 = recall_score(y_test, y_pred_dt_1000_5)
prec_dt_1000_5 = precision_score(y_test, y_pred_dt_1000_5)
cm_dt_1000_5 = confusion_matrix(y_test, y_pred_dt_1000_5)
auc_dt_1000_5 = roc_auc_score(y_test, y_pred_proba_dt_1000_5)  # AUC

In [None]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [None]:
# sample: 1000 from the whole train dataset, n = 4 features
# Reduced training set: the first 1000 class-0 rows of `train` plus every class-1 row.
data_class_0 = train[train['Y'] == 0][:1000]
data_class_1 = train[train['Y'] == 1]
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
df_1000 = pd.concat([data_class_0, data_class_1])
X_train = df_1000.loc[:, features].values
y_train = df_1000.loc[:, ['Y']].values
X_test = test.loc[:, features].values
y_test = test.loc[:, ['Y']].values

# NOTE(review): unlike the all-features run, the inputs are not standardised before
# PCA, so large-valued columns dominate the components - confirm this is intended.
pca = PCA(n_components=4)  # number of principal components to keep
pca.fit(X_train)  # fitted on the training sample only
ratio_1000 = pca.explained_variance_ratio_
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
y_train = y_train.ravel()  # flatten (n, 1) -> (n,)
y_test = y_test.ravel()

In [None]:
# knn for the 1000 dataset, 4 features
# Fit a 5-nearest-neighbours classifier, then score it on the test split.
knn = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

y_pred_knn_1000_4 = knn.predict(X_test)  # predicted class labels
y_pred_proba_knn_1000_4 = knn.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score

# Metrics derived from the predicted labels.
cm_knn_1000_4 = confusion_matrix(y_test, y_pred_knn_1000_4)
prec_knn_1000_4 = precision_score(y_test, y_pred_knn_1000_4)
recall_knn_1000_4 = recall_score(y_test, y_pred_knn_1000_4)
accuracy_knn_1000_4 = accuracy_score(y_test, y_pred_knn_1000_4)
# Metric derived from the probability scores.
auc_knn_1000_4 = roc_auc_score(y_test, y_pred_proba_knn_1000_4)

In [None]:
# svm for the 1000 dataset, 4 features
# Linear-kernel support vector classifier, scored on the test split.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred_svm_1000_4 = svc.predict(X_test)
# SVC has no predict_proba unless probability=True. The signed decision value
# is a valid ranking score, so AUC is reported like for the other models.
y_score_svm_1000_4 = svc.decision_function(X_test)
accuracy_svm_1000_4 = accuracy_score(y_test, y_pred_svm_1000_4)
recall_svm_1000_4 = recall_score(y_test, y_pred_svm_1000_4)
prec_svm_1000_4 = precision_score(y_test, y_pred_svm_1000_4)
cm_svm_1000_4 = confusion_matrix(y_test, y_pred_svm_1000_4)
auc_svm_1000_4 = roc_auc_score(y_test, y_score_svm_1000_4)  # AUC

In [None]:
# random forest for the 1000 dataset, 4 features
# Forest of depth-2 trees with a fixed seed, scored on the test split.
rf = RandomForestClassifier(random_state=0, max_depth=2)
rf.fit(X_train, y_train)

y_pred_proba_rf_1000_4 = rf.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score
y_pred_rf_1000_4 = rf.predict(X_test)  # predicted class labels

# Metrics derived from the predicted labels.
cm_rf_1000_4 = confusion_matrix(y_test, y_pred_rf_1000_4)
accuracy_rf_1000_4 = accuracy_score(y_test, y_pred_rf_1000_4)
prec_rf_1000_4 = precision_score(y_test, y_pred_rf_1000_4)
recall_rf_1000_4 = recall_score(y_test, y_pred_rf_1000_4)
# Metric derived from the probability scores.
auc_rf_1000_4 = roc_auc_score(y_test, y_pred_proba_rf_1000_4)

In [None]:
# logistic regr for the 1000 dataset, 4 features
# Logistic regression with scikit-learn's default settings, scored on the test split.
lr = LogisticRegression().fit(X_train, y_train)

y_pred_lr_1000_4 = lr.predict(X_test)  # predicted class labels
y_pred_proba_lr_1000_4 = lr.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score

# Metrics derived from the predicted labels.
prec_lr_1000_4 = precision_score(y_test, y_pred_lr_1000_4)
recall_lr_1000_4 = recall_score(y_test, y_pred_lr_1000_4)
accuracy_lr_1000_4 = accuracy_score(y_test, y_pred_lr_1000_4)
cm_lr_1000_4 = confusion_matrix(y_test, y_pred_lr_1000_4)
# Metric derived from the probability scores.
auc_lr_1000_4 = roc_auc_score(y_test, y_pred_proba_lr_1000_4)

In [None]:
# decision trees for the 1000 dataset, 4 features
# Seeded so the tree is reproducible: scikit-learn permutes candidate features at
# every split, so an unseeded tree can differ between runs on identical data.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
y_pred_dt_1000_4 = dt.predict(X_test)  # predicted class labels
y_pred_proba_dt_1000_4 = dt.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score
accuracy_dt_1000_4 = accuracy_score(y_test, y_pred_dt_1000_4)
recall_dt_1000_4 = recall_score(y_test, y_pred_dt_1000_4)
prec_dt_1000_4 = precision_score(y_test, y_pred_dt_1000_4)
cm_dt_1000_4 = confusion_matrix(y_test, y_pred_dt_1000_4)
auc_dt_1000_4 = roc_auc_score(y_test, y_pred_proba_dt_1000_4)  # AUC

In [None]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [None]:
# sample: 1000 from the whole train dataset, n = 3 features
# Reduced training set: the first 1000 class-0 rows of `train` plus every class-1 row.
data_class_0 = train[train['Y'] == 0][:1000]
data_class_1 = train[train['Y'] == 1]
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
df_1000 = pd.concat([data_class_0, data_class_1])
X_train = df_1000.loc[:, features].values
y_train = df_1000.loc[:, ['Y']].values
X_test = test.loc[:, features].values
y_test = test.loc[:, ['Y']].values

# NOTE(review): unlike the all-features run, the inputs are not standardised before
# PCA, so large-valued columns dominate the components - confirm this is intended.
pca = PCA(n_components=3)  # number of principal components to keep
pca.fit(X_train)  # fitted on the training sample only
ratio_1000 = pca.explained_variance_ratio_
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
y_train = y_train.ravel()  # flatten (n, 1) -> (n,)
y_test = y_test.ravel()

In [None]:
# knn for the 1000 dataset, 3 features
# Fit a 5-nearest-neighbours classifier, then score it on the test split.
knn = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

y_pred_knn_1000_3 = knn.predict(X_test)  # predicted class labels
y_pred_proba_knn_1000_3 = knn.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score

# Metrics derived from the predicted labels.
cm_knn_1000_3 = confusion_matrix(y_test, y_pred_knn_1000_3)
prec_knn_1000_3 = precision_score(y_test, y_pred_knn_1000_3)
recall_knn_1000_3 = recall_score(y_test, y_pred_knn_1000_3)
accuracy_knn_1000_3 = accuracy_score(y_test, y_pred_knn_1000_3)
# Metric derived from the probability scores.
auc_knn_1000_3 = roc_auc_score(y_test, y_pred_proba_knn_1000_3)

In [None]:
# svm for the 1000 dataset, 3 features
# Linear-kernel support vector classifier, scored on the test split.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred_svm_1000_3 = svc.predict(X_test)
# SVC has no predict_proba unless probability=True. The signed decision value
# is a valid ranking score, so AUC is reported like for the other models.
y_score_svm_1000_3 = svc.decision_function(X_test)
accuracy_svm_1000_3 = accuracy_score(y_test, y_pred_svm_1000_3)
recall_svm_1000_3 = recall_score(y_test, y_pred_svm_1000_3)
prec_svm_1000_3 = precision_score(y_test, y_pred_svm_1000_3)
cm_svm_1000_3 = confusion_matrix(y_test, y_pred_svm_1000_3)
auc_svm_1000_3 = roc_auc_score(y_test, y_score_svm_1000_3)  # AUC

In [None]:
# random forest for the 1000 dataset, 3 features
# Forest of depth-2 trees with a fixed seed, scored on the test split.
rf = RandomForestClassifier(random_state=0, max_depth=2)
rf.fit(X_train, y_train)

y_pred_proba_rf_1000_3 = rf.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score
y_pred_rf_1000_3 = rf.predict(X_test)  # predicted class labels

# Metrics derived from the predicted labels.
cm_rf_1000_3 = confusion_matrix(y_test, y_pred_rf_1000_3)
accuracy_rf_1000_3 = accuracy_score(y_test, y_pred_rf_1000_3)
prec_rf_1000_3 = precision_score(y_test, y_pred_rf_1000_3)
recall_rf_1000_3 = recall_score(y_test, y_pred_rf_1000_3)
# Metric derived from the probability scores.
auc_rf_1000_3 = roc_auc_score(y_test, y_pred_proba_rf_1000_3)

In [None]:
# logistic regr for the 1000 dataset, 3 features
# Logistic regression with scikit-learn's default settings, scored on the test split.
lr = LogisticRegression().fit(X_train, y_train)

y_pred_lr_1000_3 = lr.predict(X_test)  # predicted class labels
y_pred_proba_lr_1000_3 = lr.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score

# Metrics derived from the predicted labels.
prec_lr_1000_3 = precision_score(y_test, y_pred_lr_1000_3)
recall_lr_1000_3 = recall_score(y_test, y_pred_lr_1000_3)
accuracy_lr_1000_3 = accuracy_score(y_test, y_pred_lr_1000_3)
cm_lr_1000_3 = confusion_matrix(y_test, y_pred_lr_1000_3)
# Metric derived from the probability scores.
auc_lr_1000_3 = roc_auc_score(y_test, y_pred_proba_lr_1000_3)

In [None]:
# decision trees for the 1000 dataset, 3 features
# Seeded so the tree is reproducible: scikit-learn permutes candidate features at
# every split, so an unseeded tree can differ between runs on identical data.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
y_pred_dt_1000_3 = dt.predict(X_test)  # predicted class labels
y_pred_proba_dt_1000_3 = dt.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score
accuracy_dt_1000_3 = accuracy_score(y_test, y_pred_dt_1000_3)
recall_dt_1000_3 = recall_score(y_test, y_pred_dt_1000_3)
prec_dt_1000_3 = precision_score(y_test, y_pred_dt_1000_3)
cm_dt_1000_3 = confusion_matrix(y_test, y_pred_dt_1000_3)
auc_dt_1000_3 = roc_auc_score(y_test, y_pred_proba_dt_1000_3)  # AUC

In [None]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [None]:
# sample: 1000 from the whole train dataset, n = 2 features
# Reduced training set: the first 1000 class-0 rows of `train` plus every class-1 row.
data_class_0 = train[train['Y'] == 0][:1000]
data_class_1 = train[train['Y'] == 1]
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
df_1000 = pd.concat([data_class_0, data_class_1])
X_train = df_1000.loc[:, features].values
y_train = df_1000.loc[:, ['Y']].values
X_test = test.loc[:, features].values
y_test = test.loc[:, ['Y']].values

# NOTE(review): unlike the all-features run, the inputs are not standardised before
# PCA, so large-valued columns dominate the components - confirm this is intended.
pca = PCA(n_components=2)  # number of principal components to keep
pca.fit(X_train)  # fitted on the training sample only
ratio_1000 = pca.explained_variance_ratio_
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
y_train = y_train.ravel()  # flatten (n, 1) -> (n,)
y_test = y_test.ravel()

In [None]:
# knn for the 1000 dataset, 2 features
# Fit a 5-nearest-neighbours classifier, then score it on the test split.
knn = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

y_pred_knn_1000_2 = knn.predict(X_test)  # predicted class labels
y_pred_proba_knn_1000_2 = knn.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score

# Metrics derived from the predicted labels.
cm_knn_1000_2 = confusion_matrix(y_test, y_pred_knn_1000_2)
prec_knn_1000_2 = precision_score(y_test, y_pred_knn_1000_2)
recall_knn_1000_2 = recall_score(y_test, y_pred_knn_1000_2)
accuracy_knn_1000_2 = accuracy_score(y_test, y_pred_knn_1000_2)
# Metric derived from the probability scores.
auc_knn_1000_2 = roc_auc_score(y_test, y_pred_proba_knn_1000_2)

In [None]:
# svm for the 1000 dataset, 2 features
# Linear-kernel support vector classifier, scored on the test split.
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred_svm_1000_2 = svc.predict(X_test)
# SVC has no predict_proba unless probability=True. The signed decision value
# is a valid ranking score, so AUC is reported like for the other models.
y_score_svm_1000_2 = svc.decision_function(X_test)
accuracy_svm_1000_2 = accuracy_score(y_test, y_pred_svm_1000_2)
recall_svm_1000_2 = recall_score(y_test, y_pred_svm_1000_2)
prec_svm_1000_2 = precision_score(y_test, y_pred_svm_1000_2)
cm_svm_1000_2 = confusion_matrix(y_test, y_pred_svm_1000_2)
auc_svm_1000_2 = roc_auc_score(y_test, y_score_svm_1000_2)  # AUC

In [None]:
# random forest for the 1000 dataset, 2 features
# Forest of depth-2 trees with a fixed seed, scored on the test split.
rf = RandomForestClassifier(random_state=0, max_depth=2)
rf.fit(X_train, y_train)

y_pred_proba_rf_1000_2 = rf.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score
y_pred_rf_1000_2 = rf.predict(X_test)  # predicted class labels

# Metrics derived from the predicted labels.
cm_rf_1000_2 = confusion_matrix(y_test, y_pred_rf_1000_2)
accuracy_rf_1000_2 = accuracy_score(y_test, y_pred_rf_1000_2)
prec_rf_1000_2 = precision_score(y_test, y_pred_rf_1000_2)
recall_rf_1000_2 = recall_score(y_test, y_pred_rf_1000_2)
# Metric derived from the probability scores.
auc_rf_1000_2 = roc_auc_score(y_test, y_pred_proba_rf_1000_2)

In [None]:
# logistic regr for the 1000 dataset, 2 features
# Logistic regression with scikit-learn's default settings, scored on the test split.
lr = LogisticRegression().fit(X_train, y_train)

y_pred_lr_1000_2 = lr.predict(X_test)  # predicted class labels
y_pred_proba_lr_1000_2 = lr.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score

# Metrics derived from the predicted labels.
prec_lr_1000_2 = precision_score(y_test, y_pred_lr_1000_2)
recall_lr_1000_2 = recall_score(y_test, y_pred_lr_1000_2)
accuracy_lr_1000_2 = accuracy_score(y_test, y_pred_lr_1000_2)
cm_lr_1000_2 = confusion_matrix(y_test, y_pred_lr_1000_2)
# Metric derived from the probability scores.
auc_lr_1000_2 = roc_auc_score(y_test, y_pred_proba_lr_1000_2)

In [None]:
# decision trees for the 1000 dataset, 2 features
# Seeded so the tree is reproducible: scikit-learn permutes candidate features at
# every split, so an unseeded tree can differ between runs on identical data.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
y_pred_dt_1000_2 = dt.predict(X_test)  # predicted class labels
y_pred_proba_dt_1000_2 = dt.predict_proba(X_test)[:, 1]  # second probability column, used as the AUC score
accuracy_dt_1000_2 = accuracy_score(y_test, y_pred_dt_1000_2)
recall_dt_1000_2 = recall_score(y_test, y_pred_dt_1000_2)
prec_dt_1000_2 = precision_score(y_test, y_pred_dt_1000_2)
cm_dt_1000_2 = confusion_matrix(y_test, y_pred_dt_1000_2)
auc_dt_1000_2 = roc_auc_score(y_test, y_pred_proba_dt_1000_2)  # AUC