In [31]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import networkx as nx
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from networkx.algorithms import community
import networkx.algorithms.community as nxcom

from community import community_louvain


In [17]:
# Read the raw video metadata, remove the 'id.1' column, then keep only the
# first nine of the remaining columns.
nodes = (
    pd.read_csv('youtube_raw.csv')
    .drop(columns='id.1')
    .iloc[:, :9]
)
nodes.head()

Unnamed: 0,id,uploader,age,category,length,views,rate,ratings,comments
0,2rwktobtv9s,EA,742.0,Gadgets & Games,83.0,389536.0,2.65,2294.0,268.0
1,h6Ghupxbj9g,KB42PAH,742.0,Sports,28.0,276207.0,4.57,297.0,424.0
2,mfeZibn3vmU,Gromek66,742.0,Comedy,278.0,151693.0,4.68,228.0,96.0
3,86Fe6LICKKk,lonelygirl15,742.0,People & Blogs,148.0,125061.0,2.77,1343.0,1419.0
4,XbRkmBcVWlc,Htiwan,742.0,Film & Animation,79.0,108868.0,4.33,282.0,245.0


In [18]:
# Training pairs (Source, Target, Edge); the first CSV column is used as the
# row index rather than as data.
train_set = pd.read_csv('train_set.csv', index_col =0)
train_set.head()

Unnamed: 0,Source,Target,Edge
0,xuOtyEjQ-v4,WUvCeCflehk,1
1,-V1Nw7kIY7o,WNsrs_cjZH8,1
2,zhdUqcvcdDY,3xdO4n-Vk4Y,1
3,RVbDwqSq4Ec,J7X2ieNiixE,1
4,aKWP6f_Wvoc,EQbLNEiM17M,1


In [19]:
# One plain Python list per row of train_set, in column order.
train_edges = train_set.to_numpy().tolist()

In [20]:
# Test pairs (Source, Target, Edge); the first CSV column is used as the row
# index rather than as data.
test_set = pd.read_csv('test_set.csv', index_col =0)
test_set.head()

Unnamed: 0,Source,Target,Edge
0,rGBhluYrbzU,XZ8tdf1gyHI,1
1,ClQHYZqW_Ns,fvjGMoE7oSU,1
2,EfvQAu6NRrk,1Ex1ZcQJEv8,1
3,3jiTk_diJCw,PshwMfxvI-I,1
4,KTNXjOsXpUc,QsHbd8nPFbc,1


In [21]:
# One plain Python list per row of test_set, in column order.
test_edges = test_set.to_numpy().tolist()

In [22]:
# Build the train and test graphs from their comma-delimited edge-list files.
G_train, G_test = (
    nx.read_edgelist(edge_file, delimiter=',')
    for edge_file in ("train.edgelist", "test.edgelist")
)

In [23]:
# Publish every column of `nodes` as a module-level list named after the column.
# NOTE(review): one of the columns is called 'id', so this rebinds the global
# name `id` and hides the built-in id() for the rest of the notebook.
globals().update(
    {column: nodes[column].values.tolist() for column in nodes.columns}
)

In [24]:
# Show the column names of `nodes`; each of these names is also bound to a
# module-level list holding that column's values.
nodes.columns

Index(['id', 'uploader', 'age', 'category', 'length', 'views', 'rate',
       'ratings', 'comments'],
      dtype='object')

## Communities

In [27]:
# Per-node degree centrality and eigenvector centrality, computed separately
# for the training graph and the test graph.
deg_centrality_train, deg_centrality_test = (
    nx.degree_centrality(graph) for graph in (G_train, G_test)
)
eigen_cent_train, eigen_cent_test = (
    nx.eigenvector_centrality(graph) for graph in (G_train, G_test)
)

In [32]:
# Compute the best Louvain partition (node -> community id) for each graph.
# Louvain is a randomised algorithm: without a fixed seed the community
# assignments, and therefore the `same_partition` feature and the exported
# CSV files, differ from one kernel run to the next. Pin the seed so the
# notebook is reproducible under Restart & Run All.
LOUVAIN_SEED = 42
partition_train = community_louvain.best_partition(G_train, random_state=LOUVAIN_SEED)
partition_test = community_louvain.best_partition(G_test, random_state=LOUVAIN_SEED)

In [33]:
def social_features(source, target, type = "train"):
  """Compute graph-based features for one (source, target) node pair.

  Parameters
  ----------
  source, target
      Node identifiers looked up in the selected graph and in its
      precomputed centrality / partition dictionaries.
  type
      "train" selects G_train and the *_train dictionaries; any other
      value selects G_test and the *_test dictionaries.

  Returns
  -------
  list
      [number of common neighbours,
       1 if both nodes have the same partition id else 0,
       degree-centrality difference (source - target),
       eigenvector-centrality difference (source - target)]
  """
  use_train = type == "train"
  degree_scores = deg_centrality_train if use_train else deg_centrality_test
  eigen_scores = eigen_cent_train if use_train else eigen_cent_test
  communities = partition_train if use_train else partition_test
  network = G_train if use_train else G_test

  # Count the nodes adjacent to both endpoints.
  shared_count = sum(1 for _ in nx.common_neighbors(network, source, target))

  # Flag pairs whose endpoints were assigned the same community id.
  in_same_community = int(communities.get(source) == communities.get(target))

  # Signed centrality gaps, always source minus target.
  degree_gap = degree_scores[source] - degree_scores[target]
  eigen_gap = eigen_scores[source] - eigen_scores[target]

  return [shared_count, in_same_community, degree_gap, eigen_gap]

In [34]:
#compute features for training set
# Build one feature row and one integer label per training pair.
x_train, y_train = [], []
for src, dst, label in tqdm(train_edges):
    x_train.append(social_features(src, dst, "train"))
    y_train.append(int(label))

100%|██████████| 307274/307274 [00:13<00:00, 22051.15it/s]


In [35]:
# Convert the list of feature rows into a DataFrame and append the labels as
# the `y` column.
x_train = pd.DataFrame(
    x_train,
    columns=["neigbors", "same_partition", "diff_deg_cen", "diff_eigen_cen"],
).assign(y=y_train)

In [36]:
# Persist the training features (with the `y` column) without the row index.
x_train.to_csv("social_theory_train.csv", index=False)

In [37]:
# Build one feature row and one integer label per test pair.
x_test, y_test = [], []
for src, dst, label in tqdm(test_edges):
    x_test.append(social_features(src, dst, "test"))
    y_test.append(int(label))

100%|██████████| 170706/170706 [00:18<00:00, 9230.73it/s] 


In [38]:
# Convert the list of feature rows into a DataFrame and append the labels as
# the `y` column.
x_test = pd.DataFrame(
    x_test,
    columns=["neigbors", "same_partition", "diff_deg_cen", "diff_eigen_cen"],
).assign(y=y_test)

In [39]:
# Persist the test features (with the `y` column) without the row index.
x_test.to_csv("social_theory_test.csv", index=False)

In [42]:
# Mean of every feature column within each label value of `y`.
x_train.groupby("y").mean()

Unnamed: 0_level_0,neigbors,same_partition,diff_deg_cen,diff_eigen_cen
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.000378,0.006873,1.220336e-08,-7.876984e-07
1,4.298385,0.923117,2.929163e-05,0.0002502677


In [43]:
# Display the assembled training frame (features plus the `y` label column).
x_train

Unnamed: 0,neigbors,same_partition,diff_deg_cen,diff_eigen_cen,y
0,2,1,0.000097,6.199184e-09,1
1,3,1,0.000000,2.372001e-06,1
2,1,1,0.000737,9.013412e-05,1
3,11,1,-0.000013,-5.773385e-11,1
4,1,1,0.000071,5.126576e-09,1
...,...,...,...,...,...
307269,0,0,0.000026,-1.034136e-06,0
307270,0,0,-0.000019,6.091262e-08,0
307271,0,0,-0.000078,-1.618421e-07,0
307272,0,0,0.000006,-6.707615e-08,0
