In [None]:
import pandas as pd
from collections import defaultdict
import networkx as nx
import linkpred
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.svm import SVC


df = pd.read_csv('imdb.csv')

In [None]:
# samples: 2006-2015, 2016; connected by movies
data_sample1 = defaultdict(list)
data_sample2 = defaultdict(list)  # data sample for 2016 movies 

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2016:
            data_sample1[actor].append(row['Title'])
        else:
            data_sample2[actor].append(row['Title'])
            
            
G_coeffs = nx.Graph(name="Graph for coefficients")
G_y = nx.Graph(name="Graph for Y")

# generating coefficients graph
for actor in data_sample1:
    movies = data_sample1[actor]
    actor = actor.strip()
    G_coeffs.add_node(actor, actor_attributes=movies)

    for node, attrs in G_coeffs.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_coeffs.has_edge(actor, node):
                    G_coeffs.add_edge(actor, node, film=a)
                
# generate Y graph
for actor in data_sample2:
    movies = data_sample2[actor]
    actor = actor.strip()
    G_y.add_node(actor, actor_attributes=movies)

    for node, attrs in G_y.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_y.has_edge(actor, node):
                    G_y.add_edge(actor, node, film=a)
                    
indexes = []
for u, v, p in nx.jaccard_coefficient(G_coeffs):
    indexes.append(tuple((u, v)))
                    
edge_df = pd.DataFrame(index=indexes)
edge_df['Jaccard'] = [i[2] for i in nx.jaccard_coefficient(G_coeffs, edge_df.index)]
edge_df['Adamic-Adar'] = [i[2] for i in nx.adamic_adar_index(G_coeffs, edge_df.index)]
edge_df['Pref-Attach'] = [i[2] for i in nx.preferential_attachment(G_coeffs, edge_df.index)]
edge_df['Res-Alloc'] = [i[2] for i in nx.resource_allocation_index(G_coeffs, edge_df.index)]
edge_df['Y'] = [1 if G_y.has_edge(u, v) else 0 for u, v in indexes]

In [14]:
edge_df.shape

(1015367, 5)

In [105]:
train, test = train_test_split(edge_df, test_size=0.3)
train_0 = train[train['Y'] == 0]
train_1 = train[train['Y'] == 1]
test_0 = test[test['Y'] == 0]
test_1 = test[test['Y'] == 1]

In [106]:
X_train = train[['Jaccard', 'Adamic-Adar', 'Pref-Attach', 'Res-Alloc']]
y_train = train[['Y']]
X_test = test[['Jaccard', 'Adamic-Adar', 'Pref-Attach', 'Res-Alloc']]
y_test = test[['Y']]

In [107]:
scaler = preprocessing.StandardScaler().fit(X_train)  # standartize data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
y_train = y_train.values.ravel() # transform to numpy array
y_test = y_test.values.ravel()

In [None]:
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred)
recall_knn = recall_score(y_test, y_pred)
prec_knn = precision_score(y_test, y_pred)

In [None]:
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred)
recall_svm = recall_score(y_test, y_pred)

In [108]:
# train on subsample
data_class_0 = train[train['Y'] == 0][:1000]
data_class_1 = train[train['Y'] == 1]
merged_data = data_class_0.append(data_class_1)
X_train = merged_data[['Jaccard', 'Adamic-Adar', 'Pref-Attach', 'Res-Alloc']]
y_train = merged_data[['Y']]
X_test = test[['Jaccard', 'Adamic-Adar', 'Pref-Attach', 'Res-Alloc']]
y_test = test[['Y']]

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
recall_svm = recall_score(y_test, y_pred)

In [None]:
# with added centralities features
indexes = []
degree_src = []  # degree centrality
degree_dst = []
clos_src = []  # closenness centrality
clos_dst = []
betw_src = []
betw_dst = []
# edge_centr = nx.edge_betweenness_centrality(G_coeffs, normalized=True)
for u, v, p in nx.jaccard_coefficient(G_coeffs):
    indexes.append(tuple((u, v)))
    degree_src.append(nx.degree(G_coeffs, u))
    degree_dst.append(nx.degree(G_coeffs, v))
    clos_src.append(nx.closeness_centrality(G_coeffs, u))
    clos_dst.append(nx.closeness_centrality(G_coeffs, v))
    betw_src.append(nx.closeness_centrality(G_coeffs, u))
    betw_dst.append(nx.closeness_centrality(G_coeffs, v))
                    
edge_df = pd.DataFrame(index=indexes)
edge_df['Jaccard'] = [i[2] for i in nx.jaccard_coefficient(G_coeffs, edge_df.index)]
edge_df['Adamic-Adar'] = [i[2] for i in nx.adamic_adar_index(G_coeffs, edge_df.index)]
edge_df['Pref-Attach'] = [i[2] for i in nx.preferential_attachment(G_coeffs, edge_df.index)]
edge_df['Res-Alloc'] = [i[2] for i in nx.resource_allocation_index(G_coeffs, edge_df.index)]
edge_df['Degree_centrality'] = [min(i, j) for i, j in zip(degree_src, degree_dst)]
edge_df['Closeness_centrality'] = [min(i, j) for i, j in zip(clos_src, clos_dst)]
edge_df['Betweenness_ centrality'] = [min(i, j) for i, j in zip(betw_src, betw_dst)]
edge_df['Y'] = [1 if G_y.has_edge(u, v) else 0 for u, v in indexes]

train, test = train_test_split(edge_df, test_size=0.3)
X_train = train[['Jaccard', 'Adamic-Adar', 'Pref-Attach', 'Res-Alloc']]
y_train = train[['Y']]
X_test = test[['Jaccard', 'Adamic-Adar', 'Pref-Attach', 'Res-Alloc', 'Degree_centrality', 'Closeness_centrality', 'Betweenness_ centrality']]
y_test = test[['Y']]

scaler = preprocessing.StandardScaler().fit(X_train) 
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
y_train = y_train.values.ravel() 
y_test = y_test.values.ravel()

knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred)
recall_knn = recall_score(y_test, y_pred)
prec_knn = precision_score(y_test, y_pred)

In [112]:
edge_df.describe()

Unnamed: 0,Jaccard,Adamic-Adar,Pref-Attach,Res-Alloc,Deg_centrality,Y
count,1015367.0,1015367.0,1015367.0,1015367.0,1015367.0,1015367.0
mean,0.001431494,0.007189461,24.00484,0.001489223,3.147642,0.0005229636
std,0.01309157,0.05700029,31.13075,0.01285396,1.603312,0.02286243
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,9.0,0.0,2.0,0.0
50%,0.0,0.0,15.0,0.0,3.0,0.0
75%,0.0,0.0,27.0,0.0,3.0,0.0
max,0.6666667,2.164043,870.0,0.75,29.0,1.0
