In [1]:
import pandas as pd
import os
from igraph import *
import numpy as np
from sklearn.model_selection import StratifiedKFold
from scipy import stats
from sklearn.semi_supervised import LabelPropagation
from sklearn import preprocessing
from collections import Counter
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from functools import reduce

In [2]:

def create_graph(unweighted_edges, boolean):
    g = Graph.TupleList(G, directed=boolean)
    g.es["weight"] = 1
    g.simplify(combine_edges={"weight": "sum"})
    return(g)

In [3]:
def descriptive_stats(g):
    #print("Indegreeg.indegree()[1])
    print("Maximum indegree for a node : ",max(g.indegree()))    
    print("Maximum outdegree for a node : ",max(g.outdegree()))
    print("Maximum retweeted handle : ",g.vs.select(_indegree = max(g.indegree()))["name"])
    print("Maximum retweeter handle : ",g.vs.select(_outdegree = max(g.outdegree()))["name"])
    print("Number of vertices in retweet network : ",g.vcount())        
    print("Number of edes in retweet network : ",g.ecount())
    print("Number of cliques in retweet network : ",g.clique_number())                                  
    #g.reciprocity(ignore_loops=False, mode="default"),  #reciprocity 
    #g.transitivity_undirected(mode="nan"),              #transitivity
    #g.density(loops=False)) 

In [4]:
def graph_nodes_df(g):
    nodes_list = [] 
    for i in range(len(g.vs)):
        nodes_list.append(g.vs[i].attributes())
    df =  pd.DataFrame(nodes_list)
    #print(df.loc[0])
    #print(df.loc[1])
    return df

In [5]:
rdf=pd.read_csv('/Volumes/Ashwin/rt-data-stateonly.csv')
rdf = rdf[rdf['rt_screen'].notna()]
rdf = rdf[rdf['screen_name'].notna()]
G = [tuple(x) for x in rdf[['screen_name', 'rt_screen']].values]
g_directed = create_graph(G, True)
g_undirected = create_graph(G, False)

In [None]:
descriptive_stats(g_directed)

In [6]:
nodes_df = graph_nodes_df(g_undirected)
nodes_df = nodes_df.rename(columns={'name': 'user'})
nodes_list = nodes_df['user'].values.tolist()

In [7]:
seeds_df=pd.read_csv('/Users/ashwinshreyasm/Desktop/polar_additional.csv')
len(seeds_df)

158

In [8]:
seeds_df['polarization'] = seeds_df['polarization'].map({'Pro-Science': 1, 'Conspiracy-Pseudoscience': 2})

In [9]:
def classify_label_propagation(seeds_df, nodes_df, split_num):
    t_ideo_X = np.array(seeds_df['user'])
    t_ideo_y = np.array(list(seeds_df['polarization']))
    t_ideo_skf = StratifiedKFold(n_splits= split_num)
    t_ideo_skf.get_n_splits(t_ideo_X, t_ideo_y)
    ct=0
    merged_df_list = []
    predicted_labels_list = []
    for train_index, test_index in t_ideo_skf.split(t_ideo_X, t_ideo_y):
        ct+=1
        t_ideo_equiv = { 1: True, 2: True, -1: False}
        equiv_t_ideo = {0: 1, 1: 2, -1: -1}
    
        df_train = pd.DataFrame({'user': t_ideo_X[train_index], 'label': t_ideo_y[train_index]})
        df_test = pd.DataFrame({'user': t_ideo_X[test_index], 'label': t_ideo_y[test_index]})
        print("Test-Set:",len(df_test))
        df_test.to_csv('round'+str(ct)+'.csv')
        print(len(nodes_df))
        df_train =  nodes_df.merge(df_train, how='left', on='user').fillna(-1)
        duplicate_bool = df_train.duplicated(subset=['user'], keep='first')
        duplicate = df_train.loc[duplicate_bool == True]
        df_train['fixed'] = df_train['label'].map(t_ideo_equiv)
        df_train=df_train.drop_duplicates(subset=['user'], keep='first').reset_index()
        print(len(df_train))
        label_prop = Graph.community_label_propagation(g_undirected , weights = 'weight',initial = df_train['label'],fixed = df_train['fixed'])
        
        for n in range(0,len(label_prop)):
            print('Community #', n, 'size:', len(label_prop[n]))
            
        df_train['predicted_label'] = label_prop.membership
        print(df_train['predicted_label'].unique())
        df_train['predicted_label'] = df_train['predicted_label'].map(equiv_t_ideo)
        predicted_labels_list.append(df_train)
        
        df =  pd.merge(left=df_test, right= df_train, how='left', 
                                     left_on='user', right_on='user').dropna(how='any')
        merged_df_list.append(df)
    return merged_df_list, predicted_labels_list
    

In [10]:
def getScore(df_list, measure):
    score_list = []
    ct=0
    for df in df_list: 
        ct+=1
        #print(df['label_x'][0])
        #print(df['label_x'].unique())
        #print(df['predicted_label'].unique())
        df.to_csv('t'+str(ct)+'.csv')
        score = measure(df['label_x'], df['predicted_label'], average='micro')
        print(score)
        score_list.append(score)
    return sum(score_list)/len(df_list)

In [11]:
def getAuc(df_list):
    score_list = []
    for df in df_list: 
        #print(df['label_x'][0])
        fpr, tpr, thresholds = roc_curve(df['label_x'],df['predicted_label'], pos_label=2)
        score = auc(fpr,tpr)
        print(score)
        score_list.append(score)
    return sum(score_list)/len(df_list)

In [12]:
def getCommunityMembership(community_list):
    for i in range(0,len(community_list)):
        print(Counter(community_list[i]['predicted_label']))

In [13]:
def getFinalLabelsForNodes(list_of_dfs):
    df_list = []
    for df in list_of_dfs:
        df_list.append(df[['user', 'predicted_label']]) 
    df = reduce(lambda left,right: pd.merge(left,right,on='user'), df_list)
    print(df.dtypes)
    df = df.replace(-1, 0)
    df['label'] = df.iloc[:,1:6].sum(axis = 1)/5 #col numbers between in the bracket and divided by the number of splits 
    df = df[['user', 'label']]
    df['label'] = df['label'].apply(lambda x: 0 if x == 0 else 1 if 0 < x <= 1.5 else 2)
    df['label'] = df['label'].map({0: 'no_label', 1: 'Pro-Science', 2: 'Conspiracy-Pseudoscience'})
    return df

In [14]:
label_prop_df_list = classify_label_propagation(seeds_df,nodes_df,5)

Test-Set: 32
1815920
1815920
Community # 0 size: 1474789
Community # 1 size: 198998
[ 0  1 -1]
Test-Set: 32
1815920
1815920
Community # 0 size: 1478381
Community # 1 size: 195406
[ 0  1 -1]
Test-Set: 32
1815920
1815920
Community # 0 size: 1476833
Community # 1 size: 196954
[ 0  1 -1]
Test-Set: 31
1815920
1815920
Community # 0 size: 1481870
Community # 1 size: 191917
[ 0  1 -1]
Test-Set: 31
1815920
1815920
Community # 0 size: 1478655
Community # 1 size: 195132
[ 0  1 -1]


In [15]:
getScore(label_prop_df_list[0], precision_score)

0.84375
0.8387096774193549
0.9259259259259259
0.967741935483871
1.0


0.9152255077658303

In [16]:
getAuc(label_prop_df_list[0])

0.84375
0.8333333333333333
0.9
0.9666666666666667
1.0


0.9087500000000001

In [17]:
getCommunityMembership(label_prop_df_list[1])

Counter({1: 1474789, 2: 198998, -1: 142133})
Counter({1: 1478381, 2: 195406, -1: 142133})
Counter({1: 1476833, 2: 196954, -1: 142133})
Counter({1: 1481870, 2: 191917, -1: 142133})
Counter({1: 1478655, 2: 195132, -1: 142133})


In [18]:
user_label_df = getFinalLabelsForNodes(label_prop_df_list[1])

user                 object
predicted_label_x     int64
predicted_label_y     int64
predicted_label_x     int64
predicted_label_y     int64
predicted_label       int64
dtype: object


In [19]:
user_label_df['label'].value_counts()

Pro-Science                 1482607
Conspiracy-Pseudoscience     191180
no_label                     142133
Name: label, dtype: int64

In [20]:
user_label_df.to_csv('user_label_df_additional.csv')

In [21]:
users_df=pd.read_csv('user_label_df_additional.csv')
outdegree=[]
print(len(users_df))
seeds=seeds_df['user'].tolist()
print(len(seeds))
for i in range(len(users_df)):
    if users_df['user'].iloc[i] not in seeds:
        outdegree.append(g_directed.degree(users_df['user'].iloc[i],mode='in'))
    else:
        outdegree.append(-1)
users_df['retweet_count']=outdegree
users_df=users_df[users_df.retweet_count != -1]
users_df=users_df[users_df.retweet_count != 0]
users_df=users_df.sort_values('retweet_count',ascending=False)
print(len(users_df))
#print(users_df['retweet_count'])
users_df.to_csv('user_label_degree_df_additional.csv',index=False)

1815920
158
452209
