In [2]:
import pandas as pd
from collections import defaultdict
import networkx as nx
import linkpred


df = pd.read_csv('imdb.csv')

In [10]:
# samples: 2006-2011, 2012-2016; connected by movies
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2012:
            train_data[actor].append(row['Title'])
        else:
            test_data[actor].append(row['Title'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
                   
                
com_neigh_pred = linkpred.predictors.CommonNeighbours(G_train)
aa_results = com_neigh_pred.predict()

as_pred = linkpred.predictors.AssociationStrength(G_train)
as_results = as_pred.predict()

cos_pred = linkpred.predictors.Cosine(G_train)
cos_results = cos_pred.predict()

indexes = []
for pair, value in aa_results.items():
    indexes.append(tuple(pair))
    
lp_df = pd.DataFrame(index=indexes)
lp_df['Common_Neighbor'] = [i for i in list(aa_results.values())]
lp_df['Assoc_Strength'] = [i for i in list(as_results.values())]
lp_df['Cosine'] = [i for i in list(cos_results.values())]
lp_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

lp_df = lp_df[(lp_df['Common_Neighbor'] > 1.5) & (lp_df['Assoc_Strength'] > 0.1) & (lp_df['Cosine'] > 0.3)]
i = 0
for index, row in lp_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, lp_df.shape[0]))
lp_df.describe()

Intersection: 14
Dataset shape: 830


Unnamed: 0,Common_Neighbor,Assoc_Strength,Cosine,Y
count,830.0,830.0,830.0,830.0
mean,2.03253,0.177448,0.592492,0.016867
std,0.214441,0.053432,0.095304,0.128852
min,2.0,0.107143,0.471405,0.0
25%,2.0,0.111111,0.471405,0.0
50%,2.0,0.222222,0.666667,0.0
75%,2.0,0.222222,0.666667,0.0
max,4.0,0.5,1.0,1.0


In [11]:
# samples: 2006-2008, 2009-2016; connected by movies
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2009:
            train_data[actor].append(row['Title'])
        else:
            test_data[actor].append(row['Title'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
                   
                
com_neigh_pred = linkpred.predictors.CommonNeighbours(G_train)
aa_results = com_neigh_pred.predict()

as_pred = linkpred.predictors.AssociationStrength(G_train)
as_results = as_pred.predict()

cos_pred = linkpred.predictors.Cosine(G_train)
cos_results = cos_pred.predict()

indexes = []
for pair, value in aa_results.items():
    indexes.append(tuple(pair))
    
lp_df = pd.DataFrame(index=indexes)
lp_df['Common_Neighbor'] = [i for i in list(aa_results.values())]
lp_df['Assoc_Strength'] = [i for i in list(as_results.values())]
lp_df['Cosine'] = [i for i in list(cos_results.values())]
lp_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

lp_df = lp_df[(lp_df['Common_Neighbor'] > 1.5) & (lp_df['Assoc_Strength'] > 0.1) & (lp_df['Cosine'] > 0.3)]
i = 0
for index, row in lp_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, lp_df.shape[0]))
lp_df.describe()

Intersection: 31
Dataset shape: 643


Unnamed: 0,Common_Neighbor,Assoc_Strength,Cosine,Y
count,643.0,643.0,643.0,643.0
mean,2.013997,0.18335,0.600315,0.048212
std,0.152211,0.052338,0.093174,0.21438
min,2.0,0.107143,0.471405,0.0
25%,2.0,0.111111,0.471405,0.0
50%,2.0,0.222222,0.666667,0.0
75%,2.0,0.222222,0.666667,0.0
max,4.0,0.333333,1.0,1.0


In [13]:
# samples: 2006-2015, 2016; connected by movies
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2016:
            train_data[actor].append(row['Title'])
        else:
            test_data[actor].append(row['Title'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
                   
                
com_neigh_pred = linkpred.predictors.CommonNeighbours(G_train)
aa_results = com_neigh_pred.predict()

as_pred = linkpred.predictors.AssociationStrength(G_train)
as_results = as_pred.predict()

cos_pred = linkpred.predictors.Cosine(G_train)
cos_results = cos_pred.predict()

indexes = []
for pair, value in aa_results.items():
    indexes.append(tuple(pair))
    
lp_df = pd.DataFrame(index=indexes)
lp_df['Common_Neighbor'] = [i for i in list(aa_results.values())]
lp_df['Assoc_Strength'] = [i for i in list(as_results.values())]
lp_df['Cosine'] = [i for i in list(cos_results.values())]
lp_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

lp_df = lp_df[(lp_df['Common_Neighbor'] > 1.5) & (lp_df['Assoc_Strength'] > 0.1) & (lp_df['Cosine'] > 0.3)]
i = 0
for index, row in lp_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, lp_df.shape[0]))
lp_df.describe()

Intersection: 3
Dataset shape: 991


Unnamed: 0,Common_Neighbor,Assoc_Strength,Cosine,Y
count,991.0,991.0,991.0,991.0
mean,2.060545,0.182565,0.604062,0.003027
std,0.394875,0.050781,0.0918,0.054965
min,2.0,0.107143,0.471405,0.0
25%,2.0,0.111111,0.5,0.0
50%,2.0,0.222222,0.666667,0.0
75%,2.0,0.222222,0.666667,0.0
max,6.0,0.333333,0.857143,1.0


In [14]:
# samples: 2006-2011, 2012-2016; connected by genres
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2012:
            train_data[actor].append(row['Genre'])
        else:
            test_data[actor].append(row['Genre'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
                   
                
com_neigh_pred = linkpred.predictors.CommonNeighbours(G_train)
aa_results = com_neigh_pred.predict()

as_pred = linkpred.predictors.AssociationStrength(G_train)
as_results = as_pred.predict()

cos_pred = linkpred.predictors.Cosine(G_train)
cos_results = cos_pred.predict()

indexes = []
for pair, value in aa_results.items():
    indexes.append(tuple(pair))
    
lp_df = pd.DataFrame(index=indexes)
lp_df['Common_Neighbor'] = [i for i in list(aa_results.values())]
lp_df['Assoc_Strength'] = [i for i in list(as_results.values())]
lp_df['Cosine'] = [i for i in list(cos_results.values())]
lp_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

lp_df = lp_df[(lp_df['Common_Neighbor'] > 1.5) & (lp_df['Assoc_Strength'] > 0.1) & (lp_df['Cosine'] > 0.3)]
i = 0
for index, row in lp_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, lp_df.shape[0]))
lp_df.describe()

Intersection: 10
Dataset shape: 304


Unnamed: 0,Common_Neighbor,Assoc_Strength,Cosine,Y
count,304.0,304.0,304.0,304.0
mean,3.733553,0.171597,0.737537,0.032895
std,1.97126,0.049472,0.111795,0.178655
min,2.0,0.107143,0.471405,0.0
25%,2.0,0.122449,0.666667,0.0
50%,2.0,0.16,0.666667,0.0
75%,6.0,0.222222,0.857143,0.0
max,7.0,0.222222,0.875,1.0


In [16]:
# samples: 2006-2008, 2009-2016; connected by genres
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2009:
            train_data[actor].append(row['Genre'])
        else:
            test_data[actor].append(row['Genre'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    G_train.add_edge(actor, node, film=a)
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    G_test.add_edge(actor, node, film=a)
                   
                
com_neigh_pred = linkpred.predictors.CommonNeighbours(G_train)
aa_results = com_neigh_pred.predict()

as_pred = linkpred.predictors.AssociationStrength(G_train)
as_results = as_pred.predict()

cos_pred = linkpred.predictors.Cosine(G_train)
cos_results = cos_pred.predict()

indexes = []
for pair, value in aa_results.items():
    indexes.append(tuple(pair))
    
lp_df = pd.DataFrame(index=indexes)
lp_df['Common_Neighbor'] = [i for i in list(aa_results.values())]
lp_df['Assoc_Strength'] = [i for i in list(as_results.values())]
lp_df['Cosine'] = [i for i in list(cos_results.values())]
lp_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

lp_df = lp_df[(lp_df['Common_Neighbor'] > 1.5) & (lp_df['Assoc_Strength'] > 0.1) & (lp_df['Cosine'] > 0.3)]
i = 0
for index, row in lp_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, lp_df.shape[0]))
lp_df.describe()

Intersection: 16
Dataset shape: 382


Unnamed: 0,Common_Neighbor,Assoc_Strength,Cosine,Y
count,382.0,382.0,382.0,382.0
mean,3.664921,0.167248,0.725736,0.041885
std,1.902554,0.049692,0.125197,0.200589
min,2.0,0.111111,0.471405,0.0
25%,2.0,0.122449,0.666667,0.0
50%,2.0,0.138889,0.666667,0.0
75%,6.0,0.222222,0.857143,0.0
max,6.0,0.222222,0.857143,1.0


In [4]:
# weighted graph, connected by movies
train_data = defaultdict(list)
test_data = defaultdict(list)

for index, row in df.iterrows():
    actors = row['Actors'].split(',')
    year = row['Year']
    for actor in actors:
        if year < 2009:
            train_data[actor].append(row['Title'])
        else:
            test_data[actor].append(row['Title'])
            
            
G_train = nx.Graph(name="TrainData")
G_test = nx.Graph(name="TestData")

# generating train graph
for actor in train_data:
    movies = train_data[actor]
    actor = actor.strip()
    G_train.add_node(actor, actor_attributes=movies)

    for node, attrs in G_train.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_train.has_edge(actor, node):
                    set_movies_actor = set(nx.get_node_attributes(G_train,'actor_attributes')[actor])
                    set_movies_node = set(nx.get_node_attributes(G_train,'actor_attributes')[node])
                    G_train.add_edge(actor, node, film=a, num_common_movies=len(set_movies_actor.intersection(set_movies_node)))
                
# generate test graph
for actor in test_data:
    movies = test_data[actor]
    actor = actor.strip()
    G_test.add_node(actor, actor_attributes=movies)

    for node, attrs in G_test.nodes(data=True):
        if node != actor:
            for a in attrs['actor_attributes']:
                if a in movies and not G_test.has_edge(actor, node):
                    set_movies_actor = set(nx.get_node_attributes(G_test,'actor_attributes')[actor])
                    set_movies_node = set(nx.get_node_attributes(G_test,'actor_attributes')[node])
                    G_test.add_edge(actor, node, film=a, num_common_movies=len(set_movies_actor.intersection(set_movies_node)))
                   
                
com_neigh_pred = linkpred.predictors.CommonNeighbours(G_train)
aa_results = com_neigh_pred.predict()

as_pred = linkpred.predictors.AssociationStrength(G_train)
as_results = as_pred.predict()

cos_pred = linkpred.predictors.Cosine(G_train)
cos_results = cos_pred.predict()

indexes = []
for pair, value in aa_results.items():
    indexes.append(tuple(pair))
    
lp_df = pd.DataFrame(index=indexes)
lp_df['Common_Neighbor'] = [i for i in list(aa_results.values())]
lp_df['Assoc_Strength'] = [i for i in list(as_results.values())]
lp_df['Cosine'] = [i for i in list(cos_results.values())]
lp_df['Y'] = [1 if G_test.has_edge(u, v) else 0 for u, v in indexes]

lp_df = lp_df[(lp_df['Common_Neighbor'] > 1.5) & (lp_df['Assoc_Strength'] > 0.1) & (lp_df['Cosine'] > 0.3)]
i = 0
for index, row in lp_df.iterrows():
    if row['Y'] == 1:
        i += 1
print("Intersection: {}\nDataset shape: {}".format(i, lp_df.shape[0]))
lp_df.describe()

Intersection: 31
Dataset shape: 643


Unnamed: 0,Common_Neighbor,Assoc_Strength,Cosine,Y
count,643.0,643.0,643.0,643.0
mean,2.013997,0.18335,0.600315,0.048212
std,0.152211,0.052338,0.093174,0.21438
min,2.0,0.107143,0.471405,0.0
25%,2.0,0.111111,0.471405,0.0
50%,2.0,0.222222,0.666667,0.0
75%,2.0,0.222222,0.666667,0.0
max,4.0,0.333333,1.0,1.0
