In [3]:
import pandas as pd
from collections import defaultdict
import networkx as nx
import linkpred
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA


df = pd.read_csv('imdb.csv')

In [5]:
# samples: 2006-2015, 2016; connected by movies
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2016:
            train_data[actor].append(row['Title'])
        else:
            test_data[actor].append(row['Title'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
                    
# indexes = []
# for u, v, p in nx.jaccard_coefficient(G_train):
#     indexes.append(tuple((u, v)))
                    
# edge_df = pd.DataFrame(index=indexes)
# edge_df['Jaccard'] = [i[2] for i in nx.jaccard_coefficient(G_train, edge_df.index)]
# edge_df['Adamic-Adar'] = [i[2] for i in nx.adamic_adar_index(G_train, edge_df.index)]
# edge_df['Pref-Attach'] = [i[2] for i in nx.preferential_attachment(G_train, edge_df.index)]
# edge_df['Res-Alloc'] = [i[2] for i in nx.resource_allocation_index(G_train, edge_df.index)]
# edge_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

# edge_df = edge_df[(edge_df['Jaccard'] > 0.001) & (edge_df['Adamic-Adar'] > 0.007) & (edge_df['Pref-Attach'] > 0.001) & (edge_df['Res-Alloc'] > 0.001)]
# i = 0
# for index, row in edge_df.iterrows():
#     if row['Y'] == 1:
#         i += 1
# print("Intersection: {}\nDataset shape: {}".format(i, edge_df.shape[0]))
# edge_df.describe()

indexes = []
betw_src = []  # betweenness centrality
betw_dst = []

d = nx.betweenness_centrality(G_train, normalized=True)

for u, v, p in nx.jaccard_coefficient(G_train):
    indexes.append(tuple((u, v)))
    betw_src.append(d[u])
    betw_dst.append(d[v])
                    
edge_df = pd.DataFrame(index=indexes)
edge_df['Jaccard'] = [i[2] for i in nx.jaccard_coefficient(G_train, edge_df.index)]
edge_df['Betweenness_ centrality'] = [min(i, j) for i, j in zip(betw_src, betw_dst)]
edge_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]
train, test = train_test_split(edge_df, test_size=0.3) 

features = ['Jaccard', 'Betweenness_ centrality']

X_train = train.loc[:, features].values
y_train = train.loc[:,['Y']].values
X_test = test.loc[:, features].values
y_test = test.loc[:,['Y']].values
# scaler = preprocessing.StandardScaler().fit(X_train)  # standartize data
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)
# y_train = y_train.values.ravel() # transform to numpy array
# y_test = y_test.values.ravel()

In [10]:
scaler = preprocessing.StandardScaler().fit(X_train)  # standartize data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
y_train = y_train.ravel() # transform to numpy array
y_test = y_test.ravel()

In [None]:
# knn for the whole dataset
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn_all = knn.predict(X_test)
y_pred_proba_knn_all = knn.predict_proba(X_test)[:,1]  # compute probabilities
accuracy_knn_all = accuracy_score(y_test, y_pred_knn_all)  # accuracy
recall_knn_all = recall_score(y_test, y_pred_knn_all)  # recall
prec_knn_all = precision_score(y_test, y_pred_knn_all)  # precision
cm_knn_all = confusion_matrix(y_test, y_pred_knn_all)  # confusion matrix
auc_knn_all = roc_auc_score(y_test, y_pred_proba_knn_all)  # AUC 

In [22]:
auc_knn_all = roc_auc_score(y_test, y_pred_proba_knn_all)
auc_knn_all

0.5054790843158582

In [56]:
# samples: 2006-2010, 2011-2016; connected by movies
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2011:
            train_data[actor].append(row['Title'])
        else:
            test_data[actor].append(row['Title'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
                    
indexes = []
for u, v, p in nx.jaccard_coefficient(G_train):
    indexes.append(tuple((u, v)))
                    
edge_df = pd.DataFrame(index=indexes)
edge_df['Jaccard'] = [i[2] for i in nx.jaccard_coefficient(G_train, edge_df.index)]
edge_df['Adamic-Adar'] = [i[2] for i in nx.adamic_adar_index(G_train, edge_df.index)]
edge_df['Pref-Attach'] = [i[2] for i in nx.preferential_attachment(G_train, edge_df.index)]
edge_df['Res-Alloc'] = [i[2] for i in nx.resource_allocation_index(G_train, edge_df.index)]
edge_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

edge_df = edge_df[(edge_df['Jaccard'] > 0.001) & (edge_df['Adamic-Adar'] > 0.007) & (edge_df['Pref-Attach'] > 0.001) & (edge_df['Res-Alloc'] > 0.001)]
i = 0
for index, row in edge_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, edge_df.shape[0]))
edge_df.describe()

Intersection: 21
Dataset shape: 3023


Unnamed: 0,Jaccard,Adamic-Adar,Pref-Attach,Res-Alloc,Y
count,3023.0,3023.0,3023.0,3023.0,3023.0
mean,0.126471,0.482874,29.485941,0.124042,0.006947
std,0.058564,0.116487,26.61882,0.049561,0.083071
min,0.03125,0.345976,4.0,0.055556,0.0
25%,0.083333,0.417032,12.0,0.090909,0.0
50%,0.125,0.45512,18.0,0.111111,0.0
75%,0.2,0.558111,36.0,0.166667,0.0
max,0.5,1.864005,270.0,0.6,1.0


In [57]:
# samples: 2006-2008, 2009-2016; connected by movies
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2009:
            train_data[actor].append(row['Title'])
        else:
            test_data[actor].append(row['Title'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
                    
indexes = []
for u, v, p in nx.jaccard_coefficient(G_train):
    indexes.append(tuple((u, v)))
                    
edge_df = pd.DataFrame(index=indexes)
edge_df['Jaccard'] = [i[2] for i in nx.jaccard_coefficient(G_train, edge_df.index)]
edge_df['Adamic-Adar'] = [i[2] for i in nx.adamic_adar_index(G_train, edge_df.index)]
edge_df['Pref-Attach'] = [i[2] for i in nx.preferential_attachment(G_train, edge_df.index)]
edge_df['Res-Alloc'] = [i[2] for i in nx.resource_allocation_index(G_train, edge_df.index)]
edge_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

edge_df = edge_df[(edge_df['Jaccard'] > 0.001) & (edge_df['Adamic-Adar'] > 0.007) & (edge_df['Pref-Attach'] > 0.001) & (edge_df['Res-Alloc'] > 0.001)]
i = 0
for index, row in edge_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, edge_df.shape[0]))
edge_df.describe()

Intersection: 11
Dataset shape: 1097


Unnamed: 0,Jaccard,Adamic-Adar,Pref-Attach,Res-Alloc,Y
count,1097.0,1097.0,1097.0,1097.0,1097.0
mean,0.153495,0.506733,19.480401,0.135358,0.010027
std,0.067325,0.132387,15.679262,0.053259,0.099679
min,0.043478,0.369269,6.0,0.066667,0.0
25%,0.111111,0.434294,9.0,0.1,0.0
50%,0.142857,0.480898,16.0,0.125,0.0
75%,0.2,0.558111,24.0,0.166667,0.0
max,1.0,1.749144,144.0,0.535714,1.0


In [58]:
# samples: 2006-2014, 2015-2016; connected by movies
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2015:
            train_data[actor].append(row['Title'])
        else:
            test_data[actor].append(row['Title'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
                    
indexes = []
for u, v, p in nx.jaccard_coefficient(G_train):
    indexes.append(tuple((u, v)))
                    
edge_df = pd.DataFrame(index=indexes)
edge_df['Jaccard'] = [i[2] for i in nx.jaccard_coefficient(G_train, edge_df.index)]
edge_df['Adamic-Adar'] = [i[2] for i in nx.adamic_adar_index(G_train, edge_df.index)]
edge_df['Pref-Attach'] = [i[2] for i in nx.preferential_attachment(G_train, edge_df.index)]
edge_df['Res-Alloc'] = [i[2] for i in nx.resource_allocation_index(G_train, edge_df.index)]
edge_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

edge_df = edge_df[(edge_df['Jaccard'] > 0.001) & (edge_df['Adamic-Adar'] > 0.007) & (edge_df['Pref-Attach'] > 0.001) & (edge_df['Res-Alloc'] > 0.001)]
i = 0
for index, row in edge_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, edge_df.shape[0]))
edge_df.describe()

Intersection: 56
Dataset shape: 13128


Unnamed: 0,Jaccard,Adamic-Adar,Pref-Attach,Res-Alloc,Y
count,13128.0,13128.0,13128.0,13128.0,13128.0
mean,0.089081,0.426715,69.929007,0.090966,0.004266
std,0.056827,0.128257,71.529312,0.047101,0.065175
min,0.021739,0.303413,2.0,0.037037,0.0
25%,0.05,0.345976,21.0,0.055556,0.0
50%,0.071429,0.389871,45.0,0.076923,0.0
75%,0.111111,0.45512,96.0,0.111111,0.0
max,1.0,2.164043,648.0,0.75,1.0


In [59]:
# samples: 2006-2011, 2012-2016; connected by movies
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2012:
            train_data[actor].append(row['Title'])
        else:
            test_data[actor].append(row['Title'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
                    
indexes = []
for u, v, p in nx.jaccard_coefficient(G_train):
    indexes.append(tuple((u, v)))
                    
edge_df = pd.DataFrame(index=indexes)
edge_df['Jaccard'] = [i[2] for i in nx.jaccard_coefficient(G_train, edge_df.index)]
edge_df['Adamic-Adar'] = [i[2] for i in nx.adamic_adar_index(G_train, edge_df.index)]
edge_df['Pref-Attach'] = [i[2] for i in nx.preferential_attachment(G_train, edge_df.index)]
edge_df['Res-Alloc'] = [i[2] for i in nx.resource_allocation_index(G_train, edge_df.index)]
edge_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

edge_df = edge_df[(edge_df['Jaccard'] > 0.001) & (edge_df['Adamic-Adar'] > 0.007) & (edge_df['Pref-Attach'] > 0.001) & (edge_df['Res-Alloc'] > 0.001)]
i = 0
for index, row in edge_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, edge_df.shape[0]))
edge_df.describe()

Intersection: 31
Dataset shape: 4544


Unnamed: 0,Jaccard,Adamic-Adar,Pref-Attach,Res-Alloc,Y
count,4544.0,4544.0,4544.0,4544.0,4544.0
mean,0.116464,0.470214,36.21985,0.116031,0.006822
std,0.061368,0.127615,32.839759,0.051654,0.082323
min,0.03125,0.345976,4.0,0.055556,0.0
25%,0.071429,0.40243,15.0,0.083333,0.0
50%,0.1,0.45512,27.0,0.111111,0.0
75%,0.142857,0.558111,45.0,0.142857,0.0
max,1.0,2.164043,270.0,0.75,1.0


In [60]:
# samples: 2006-2010, 2011-2016; connected by genres
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2011:
            train_data[actor].append(row['Genre'])
        else:
            test_data[actor].append(row['Genre'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
           

indexes = []
for u, v, p in nx.jaccard_coefficient(G_train):
    indexes.append(tuple((u, v)))
                    
edge_df = pd.DataFrame(index=indexes)
edge_df['Jaccard'] = [i[2] for i in nx.jaccard_coefficient(G_train, edge_df.index)]
edge_df['Adamic-Adar'] = [i[2] for i in nx.adamic_adar_index(G_train, edge_df.index)]
edge_df['Pref-Attach'] = [i[2] for i in nx.preferential_attachment(G_train, edge_df.index)]
edge_df['Res-Alloc'] = [i[2] for i in nx.resource_allocation_index(G_train, edge_df.index)]
edge_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

edge_df = edge_df[(edge_df['Jaccard'] > 0.001) & (edge_df['Adamic-Adar'] > 0.007) & (edge_df['Pref-Attach'] > 0.001) & (edge_df['Res-Alloc'] > 0.001)]
i = 0
for index, row in edge_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, edge_df.shape[0]))
edge_df.describe()

Intersection: 2507
Dataset shape: 54230


Unnamed: 0,Jaccard,Adamic-Adar,Pref-Attach,Res-Alloc,Y
count,54230.0,54230.0,54230.0,54230.0,54230.0
mean,0.028507,0.388769,891.250175,0.03,0.046229
std,0.031219,0.423652,784.373979,0.039395,0.209983
min,0.005882,0.211932,6.0,0.008929,0.0
25%,0.016393,0.232338,361.0,0.013514,0.0
50%,0.021739,0.255622,675.0,0.02,0.0
75%,0.032258,0.464148,1188.0,0.034483,0.0
max,0.803922,10.302478,10192.0,0.775456,1.0


In [62]:
# samples: 2006-2015, 2016; connected by genres
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2016:
            train_data[actor].append(row['Genre'])
        else:
            test_data[actor].append(row['Genre'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
           

indexes = []
for u, v, p in nx.jaccard_coefficient(G_train):
    indexes.append(tuple((u, v)))
                    
edge_df = pd.DataFrame(index=indexes)
edge_df['Jaccard'] = [i[2] for i in nx.jaccard_coefficient(G_train, edge_df.index)]
edge_df['Adamic-Adar'] = [i[2] for i in nx.adamic_adar_index(G_train, edge_df.index)]
edge_df['Pref-Attach'] = [i[2] for i in nx.preferential_attachment(G_train, edge_df.index)]
edge_df['Res-Alloc'] = [i[2] for i in nx.resource_allocation_index(G_train, edge_df.index)]
edge_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

edge_df = edge_df[(edge_df['Jaccard'] > 0.001) & (edge_df['Adamic-Adar'] > 0.007) & (edge_df['Pref-Attach'] > 0.001) & (edge_df['Res-Alloc'] > 0.001)]
i = 0
for index, row in edge_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, edge_df.shape[0]))
edge_df.describe()

Intersection: 2175
Dataset shape: 466090


Unnamed: 0,Jaccard,Adamic-Adar,Pref-Attach,Res-Alloc,Y
count,466090.0,466090.0,466090.0,466090.0,466090.0
mean,0.0282,0.932169,5235.840666,0.030105,0.004666
std,0.031863,1.335164,5979.26545,0.045663,0.068152
min,0.002778,0.172261,3.0,0.003012,0.0
25%,0.011494,0.210367,1414.0,0.008333,0.0
50%,0.020408,0.440089,3168.0,0.016856,0.0
75%,0.035294,1.098563,6844.0,0.034303,0.0
max,0.886364,27.36859,96019.0,1.132536,1.0


In [63]:
# samples: 2006-2008, 2009-2016; connected by genres
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2009:
            train_data[actor].append(row['Genre'])
        else:
            test_data[actor].append(row['Genre'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
           

indexes = []
for u, v, p in nx.jaccard_coefficient(G_train):
    indexes.append(tuple((u, v)))
                    
edge_df = pd.DataFrame(index=indexes)
edge_df['Jaccard'] = [i[2] for i in nx.jaccard_coefficient(G_train, edge_df.index)]
edge_df['Adamic-Adar'] = [i[2] for i in nx.adamic_adar_index(G_train, edge_df.index)]
edge_df['Pref-Attach'] = [i[2] for i in nx.preferential_attachment(G_train, edge_df.index)]
edge_df['Res-Alloc'] = [i[2] for i in nx.resource_allocation_index(G_train, edge_df.index)]
edge_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

edge_df = edge_df[(edge_df['Jaccard'] > 0.001) & (edge_df['Adamic-Adar'] > 0.007) & (edge_df['Pref-Attach'] > 0.001) & (edge_df['Res-Alloc'] > 0.001)]
i = 0
for index, row in edge_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, edge_df.shape[0]))
edge_df.describe()

Intersection: 625
Dataset shape: 8298


Unnamed: 0,Jaccard,Adamic-Adar,Pref-Attach,Res-Alloc,Y
count,8298.0,8298.0,8298.0,8298.0,8298.0
mean,0.046229,0.362734,232.83731,0.045691,0.075319
std,0.031699,0.228234,198.71257,0.038472,0.263922
min,0.010989,0.253085,6.0,0.019231,0.0
25%,0.029412,0.271085,105.0,0.025,0.0
50%,0.038462,0.306928,182.0,0.038462,0.0
75%,0.05,0.328459,297.0,0.047619,0.0
max,0.708333,5.451449,2080.0,0.756181,1.0


In [64]:
# samples: 2006-2013, 2014-2016; connected by genres
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2014:
            train_data[actor].append(row['Genre'])
        else:
            test_data[actor].append(row['Genre'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
           

indexes = []
for u, v, p in nx.jaccard_coefficient(G_train):
    indexes.append(tuple((u, v)))
                    
edge_df = pd.DataFrame(index=indexes)
edge_df['Jaccard'] = [i[2] for i in nx.jaccard_coefficient(G_train, edge_df.index)]
edge_df['Adamic-Adar'] = [i[2] for i in nx.adamic_adar_index(G_train, edge_df.index)]
edge_df['Pref-Attach'] = [i[2] for i in nx.preferential_attachment(G_train, edge_df.index)]
edge_df['Res-Alloc'] = [i[2] for i in nx.resource_allocation_index(G_train, edge_df.index)]
edge_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

edge_df = edge_df[(edge_df['Jaccard'] > 0.001) & (edge_df['Adamic-Adar'] > 0.007) & (edge_df['Pref-Attach'] > 0.001) & (edge_df['Res-Alloc'] > 0.001)]
i = 0
for index, row in edge_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, edge_df.shape[0]))
edge_df.describe()

Intersection: 3703
Dataset shape: 206533


Unnamed: 0,Jaccard,Adamic-Adar,Pref-Attach,Res-Alloc,Y
count,206533.0,206533.0,206533.0,206533.0,206533.0
mean,0.027627,0.65852,2782.627963,0.030343,0.017929
std,0.030866,0.852166,2773.357931,0.043102,0.132695
min,0.003367,0.185093,4.0,0.004505,0.0
25%,0.013072,0.223347,867.0,0.011245,0.0
50%,0.020979,0.4154,1890.0,0.019361,0.0
75%,0.034188,0.795255,3792.0,0.034868,0.0
max,0.870968,20.181216,41292.0,1.137035,1.0
