# **Dependencies**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

In [3]:
import json
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import  LogisticRegression
from sklearn.metrics import roc_auc_score

In [5]:
df_friends_nodes = pd.read_csv('/content/drive/My Drive/Facebook_Data_Bruno_Leme/Facebook_Friends_Nodes.csv')
df_friends_edges = pd.read_csv('/content/drive/My Drive/Facebook_Data_Bruno_Leme/Facebook_Friends_Edges.csv')
df_friends_wedding_inviteds = pd.read_csv('/content/drive/My Drive/Facebook_Data_Bruno_Leme/Facebook_Friends_Wedding_Inviteds.csv', sep=';')

# **Data Prep**

In [6]:
df_friends_nodes

Unnamed: 0,Id,Label,timeset,sex,locale,agerank
0,530759938,Alexandre Kazuo Yassuda,,male,pt_BR,1202
1,531013568,Carol Senger,,female,pt_BR,1201
2,534275844,Ramon Prado,,male,en_US,1200
3,534300238,Sylvia Regina,,female,pt_BR,1199
4,537347192,Maria Aparecida Gameiro,,female,de_DE,1198
...,...,...,...,...,...,...
1197,100006665768869,Ernesto Sérgio,,male,pt_BR,5
1198,100006826684608,Samara Carneiro,,female,pt_BR,4
1199,100006879907526,Roberto Salles,,male,pt_BR,3
1200,100006924008870,Ricardo Barbosa,,male,pt_BR,2


In [7]:
dict_friends_nodes = {Id:Label for Id, Label in df_friends_nodes[['Id', 'Label']].values}

In [8]:
df_friends_edges['Source_Label'] = df_friends_edges.Source.apply(lambda x: dict_friends_nodes[x])
df_friends_edges['Target_Label'] = df_friends_edges.Target.apply(lambda x: dict_friends_nodes[x])

In [9]:
df_friends_edges

Unnamed: 0,Source,Target,Type,Id,Label,timeset,Weight,Source_Label,Target_Label
0,531013568,631274226,Undirected,0,,,1,Carol Senger,Breno Amaro
1,531013568,645684047,Undirected,1,,,1,Carol Senger,Fernanda Brum
2,531013568,657927805,Undirected,2,,,1,Carol Senger,Isaac David Stern
3,534275844,534300238,Undirected,3,,,1,Ramon Prado,Sylvia Regina
4,540657595,572592023,Undirected,4,,,1,Luciane Mastropaschoa,Jana Lima
...,...,...,...,...,...,...,...,...,...
25599,100005894858656,100006297810521,Undirected,25599,,,1,Solange Cruz III,Danilo Tadeu
25600,100005894858656,100006826684608,Undirected,25600,,,1,Solange Cruz III,Samara Carneiro
25601,100006036463865,100006172444215,Undirected,25601,,,1,Fernanda Oliveira de Souza,Ana Paula Batista Botelho
25602,100006040086262,100006665768869,Undirected,25602,,,1,Samuel Aguilar,Ernesto Sérgio


# **Graph Creation**

In [10]:
G = nx.Graph()

In [11]:
df_friends_nodes['Label'].values

array(['Alexandre Kazuo Yassuda', 'Carol Senger', 'Ramon Prado', ...,
       'Roberto Salles', 'Ricardo Barbosa', 'Janaina Lucena'],
      dtype=object)

In [12]:
df_friends_nodes['Id'].values

array([      530759938,       531013568,       534275844, ...,
       100006879907526, 100006924008870, 100007033297460])

In [13]:
df_friends_edges[['Source_Label','Target_Label']].values

array([['Carol Senger', 'Breno Amaro'],
       ['Carol Senger', 'Fernanda Brum'],
       ['Carol Senger', 'Isaac David Stern'],
       ...,
       ['Fernanda Oliveira de Souza', 'Ana Paula Batista Botelho'],
       ['Samuel Aguilar', 'Ernesto Sérgio'],
       ['Danilo Tadeu', 'Adriano Paz']], dtype=object)

In [14]:
df_friends_edges[['Source','Target']].values

array([[      531013568,       631274226],
       [      531013568,       645684047],
       [      531013568,       657927805],
       ...,
       [100006036463865, 100006172444215],
       [100006040086262, 100006665768869],
       [100006297810521, 100006384997471]])

In [15]:
G.add_nodes_from(df_friends_nodes['Id'].values)
G.add_edges_from(df_friends_edges[['Source','Target']].values)

# **Network Metrics Calculation (SNA Features)**

In [16]:
#metrics
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality(G)
subgraph_centrality = nx.subgraph_centrality(G)

In [17]:
communities = greedy_modularity_communities(G)

In [18]:
def get_cummunity_id(x):
  for id, c in enumerate(communities):
    if x in list(c):
      return id
  return -1

In [19]:
df_friends_nodes['degree_centrality'] = df_friends_nodes['Id'].apply(lambda x: degree_centrality[x])
df_friends_nodes['closeness_centrality'] = df_friends_nodes['Id'].apply(lambda x: closeness_centrality[x])
df_friends_nodes['betweenness_centrality'] = df_friends_nodes['Id'].apply(lambda x: betweenness_centrality[x])
df_friends_nodes['eigenvector_centrality'] = df_friends_nodes['Id'].apply(lambda x: eigenvector_centrality[x])
df_friends_nodes['subgraph_centrality'] = df_friends_nodes['Id'].apply(lambda x: subgraph_centrality[x])
df_friends_nodes['cummunity_id'] = df_friends_nodes['Id'].apply(get_cummunity_id)

In [20]:
df_friends_nodes

Unnamed: 0,Id,Label,timeset,sex,locale,agerank,degree_centrality,closeness_centrality,betweenness_centrality,eigenvector_centrality,subgraph_centrality,cummunity_id
0,530759938,Alexandre Kazuo Yassuda,,male,pt_BR,1202,0.011657,0.276928,0.000120,0.000001,5.658480e+42,0
1,531013568,Carol Senger,,female,pt_BR,1201,0.034971,0.253463,0.000430,0.000011,3.725105e+44,4
2,534275844,Ramon Prado,,male,en_US,1200,0.016653,0.295424,0.000835,0.002146,1.504496e+49,1
3,534300238,Sylvia Regina,,female,pt_BR,1199,0.009159,0.257219,0.000003,0.000095,2.944517e+46,1
4,537347192,Maria Aparecida Gameiro,,female,de_DE,1198,0.010824,0.324916,0.000064,0.000107,3.663454e+46,7
...,...,...,...,...,...,...,...,...,...,...,...,...
1197,100006665768869,Ernesto Sérgio,,male,pt_BR,5,0.038301,0.350567,0.003583,0.000295,2.763967e+47,0
1198,100006826684608,Samara Carneiro,,female,pt_BR,4,0.099084,0.344454,0.000882,0.067231,1.476331e+52,1
1199,100006879907526,Roberto Salles,,male,pt_BR,3,0.006661,0.290333,0.000005,0.000006,1.332682e+44,0
1200,100006924008870,Ricardo Barbosa,,male,pt_BR,2,0.004163,0.289745,0.000001,0.000008,1.958194e+44,0


# **SNA Features - Descriptive Analysis**

## **Calculated Metrics**

In [21]:
df_friends_nodes['invited'] = df_friends_nodes['Id'].apply(lambda x: 1 if x in df_friends_wedding_inviteds.Id.values else 0)

In [22]:
df_friends_nodes[['Label', 'degree_centrality']].sort_values('degree_centrality', ascending=False).head(10)

Unnamed: 0,Label,degree_centrality
339,Sidnei França,0.217319
720,Fabio Carromeu,0.207327
932,Aline Oliveira,0.203164
400,Jocimar Martins Martins,0.197336
817,Comissão da Morada,0.194838
915,Adriana Gomes,0.190674
481,Su Farias,0.183181
927,Erica Ferreira,0.183181
684,Priscila Leme,0.181515
83,Lika Murfhy,0.17985


In [23]:
df_friends_nodes[['Label', 'closeness_centrality']].sort_values('closeness_centrality', ascending=False).head(10)

Unnamed: 0,Label,closeness_centrality
684,Priscila Leme,0.470881
162,Rafaela Karoliny,0.42621
775,Bruno Zamboni,0.421041
880,Rafael Cruz,0.417664
126,Marcio Peres Pereira,0.415543
817,Comissão da Morada,0.414191
945,Guardioes da Morada,0.41107
378,Emerson Ramires,0.410775
777,Marília Palazini,0.410187
122,Vinicius Cruz,0.407706


In [24]:
df_friends_nodes[['Label', 'betweenness_centrality']].sort_values('betweenness_centrality', ascending=False).head(10)

Unnamed: 0,Label,betweenness_centrality
684,Priscila Leme,0.222842
551,Andre Dantas,0.071518
162,Rafaela Karoliny,0.055357
777,Marília Palazini,0.054202
248,Alexandre Leme Neto,0.042346
171,Isabela Roque,0.034248
702,Rodrigo Franco,0.031205
126,Marcio Peres Pereira,0.024375
227,Patricia Coelho,0.024159
761,Mariana De Faria,0.023611


In [25]:
df_friends_nodes[['Label', 'eigenvector_centrality']].sort_values('eigenvector_centrality', ascending=False).head(10)

Unnamed: 0,Label,eigenvector_centrality
720,Fabio Carromeu,0.111784
339,Sidnei França,0.11067
400,Jocimar Martins Martins,0.106912
932,Aline Oliveira,0.106681
927,Erica Ferreira,0.1054
481,Su Farias,0.104783
915,Adriana Gomes,0.103006
817,Comissão da Morada,0.102146
83,Lika Murfhy,0.101468
939,Carlos Ed Murph Farias,0.099515


In [26]:
df_friends_nodes[['Label', 'subgraph_centrality']].sort_values('subgraph_centrality', ascending=False).head(10)

Unnamed: 0,Label,subgraph_centrality
720,Fabio Carromeu,4.081325e+52
339,Sidnei França,4.000352e+52
400,Jocimar Martins Martins,3.733337e+52
932,Aline Oliveira,3.717217e+52
927,Erica Ferreira,3.628448e+52
481,Su Farias,3.58612e+52
915,Adriana Gomes,3.465527e+52
817,Comissão da Morada,3.407893e+52
83,Lika Murfhy,3.362833e+52
939,Carlos Ed Murph Farias,3.234626e+52


## **Detected Communities**

In [27]:
df_friends_nodes[['Label', 'cummunity_id']].groupby('cummunity_id').aggregate('count')

Unnamed: 0_level_0,Label
cummunity_id,Unnamed: 1_level_1
0,505
1,380
2,71
3,65
4,58
5,52
6,22
7,18
8,10
9,3


In [28]:
df_friends_nodes[df_friends_nodes['cummunity_id'] == 1]['Label'].sample(10, random_state=123) #Samba School (Mocidade Alegre) Friends

819                Patty Vieira
756         Marabel Nunes Pires
973            Eduardo Esposito
542          Luciana Nascimento
839              Jesse Teixeira
662                 Lucy Garcia
597       Renata Comunidade Csa
711                 Célia Prado
950     Felipe Pereira da Silva
661    Leticia Campos Marcolino
Name: Label, dtype: object

In [29]:
df_friends_nodes[df_friends_nodes['cummunity_id'] == 2]['Label'].sample(10, random_state=123) #My father's side family and my wife's family/friends

271                     Gi India
840                  Luiz Otavio
837                   Pri Maitan
707               Vanessa Santos
781    Camila Rodrigues Domingos
568                  Adriana Vaz
277                  Hugo Amaral
547              Marcos Ferreira
829                 Eduardo Leme
141    Elaine Cristina L Miranda
Name: Label, dtype: object

In [30]:
df_friends_nodes[df_friends_nodes['cummunity_id'] == 3]['Label'].sample(10, random_state=123) #Samba School (Mocidade Alegre) Friends - Founders Family

1058      Márcia Garcia Cruz
679           Patricia Sousa
627            Veronica Rosa
871               Clara Cruz
953                Tina Cruz
1172       Bruno Pombo Cunha
918         Bruninho Martins
350         Bruno Spiandorim
851           Elder Gravalos
575     Cláudia Cruz Zamboni
Name: Label, dtype: object

In [31]:
df_friends_nodes[df_friends_nodes['cummunity_id'] == 4]['Label'].sample(10, random_state=123) #Coworkers from DTM Ibope

727                    Josie Franco
485                   Weslley Moura
152                 Rodrigo Ribeiro
806                  Bernardo Brito
357                 Tarsila Tavares
351                 Gabriel Marquez
947                 Karla Guimarães
464                      Caio Serra
206                  Fran Rodrigues
303    Alexandre Pereira Cavalheiro
Name: Label, dtype: object

In [32]:
df_friends_nodes[df_friends_nodes['cummunity_id'] == 5]['Label'].sample(10, random_state=123) #Coworkers from BNP Paribas Cardif

1127            Bruna Bravin
607              Ivan Orosco
559     Fernanda Gomes Alves
511          Marcio Mainardi
418         Evaristo Moreira
316           Edson Teramoto
446      Pedro Vinicius Melo
348             Cintia Moura
890         Tarsila Viggiani
29            Ronaldo França
Name: Label, dtype: object

In [33]:
df_friends_nodes[df_friends_nodes['cummunity_id'] == 6]['Label'].sample(10, random_state=123) #Classmates from Post Graduation

148             Alex Ferreira
293           Keith Matsumoto
153     Tatiane Couto Martins
928              Gustavo Abud
107     Viviana Sayão Gabriel
788             André Kawauti
335         Paulo Bueno Bruno
1150          Renato Teixeira
702            Rodrigo Franco
638           Data Mining Fia
Name: Label, dtype: object

In [34]:
df_friends_nodes[df_friends_nodes['cummunity_id'] == 7]['Label'].sample(10, random_state=123) #My mother's side family

225            Edmilson Cardoso
605             Luciana Gameiro
872          Caio Batista Kempe
448              Raphael Piotto
289            Flavia Rodrigues
383             Viviane Machado
4       Maria Aparecida Gameiro
884                 Rafael Lino
1175         Compania Dos Cases
507            Rosemeire Piotto
Name: Label, dtype: object

In [35]:
#Export to generate graphs on Gephi
df_friends_nodes.to_csv('/content/drive/My Drive/Facebook_Data_Bruno_Leme/Facebook_Friends_Nodes_with_SNA_Metrics.csv', index=False)

# **Using SNA Features in a Classification Task**

## **Data Prep**

In [36]:
x_samples = df_friends_nodes[['sex','degree_centrality', 'closeness_centrality',
                             'betweenness_centrality', 'eigenvector_centrality',
                             'subgraph_centrality', 'cummunity_id']]

In [37]:
y_samples = df_friends_nodes[['invited']]

In [38]:
x_samples.dtypes

sex                        object
degree_centrality         float64
closeness_centrality      float64
betweenness_centrality    float64
eigenvector_centrality    float64
subgraph_centrality       float64
cummunity_id                int64
dtype: object

In [39]:
x_samples['cummunity_id'] = x_samples['cummunity_id'].astype(object)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [40]:
x_samples.dtypes

sex                        object
degree_centrality         float64
closeness_centrality      float64
betweenness_centrality    float64
eigenvector_centrality    float64
subgraph_centrality       float64
cummunity_id               object
dtype: object

In [41]:
x_samples.columns

Index(['sex', 'degree_centrality', 'closeness_centrality',
       'betweenness_centrality', 'eigenvector_centrality',
       'subgraph_centrality', 'cummunity_id'],
      dtype='object')

In [42]:
x_samples = pd.get_dummies(x_samples)

In [43]:
x_samples = x_samples.drop(['sex_female', 'cummunity_id_0'], axis=1)

## **Model Fitting**

In [44]:
scaler = StandardScaler()
clf = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)

In [45]:
clf.fit(scaler.fit_transform(x_samples), y_samples)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
print('AUC', roc_auc_score(y_samples, clf.predict_proba(scaler.transform(x_samples))[:,1]))

AUC 0.8587658802177858
