In [None]:
import pandas as pd
#import json
import networkx as nx

In [None]:
import pysad
import pysad.utils
import pysad.collect
import pysad.synthesis

# Creating the synthetic graph and collecting data from it

In [None]:
# Re-execute the pysad.collect and pysad.synthesis modules so that edits made
# to their source are picked up without restarting the kernel.
import importlib
importlib.reload(pysad.collect)
importlib.reload(pysad.synthesis)

In [None]:
# Size of the synthetic random graph.
nb_nodes = 200
nb_edges = 500
# Fixed seed so that "Restart & Run All" regenerates the same graph;
# set it to None to draw a different graph on every run.
graph_seed = 42
# G(n, m) random graph with exactly nb_nodes nodes and nb_edges edges.
G = nx.gnm_random_graph(nb_nodes, nb_edges, seed=graph_seed)

In [None]:
# Wrap the synthetic graph in a pysad.synthesis handle; this handle is the
# second argument of the pysad.collect.spiky_ball call further down.
graph_handle = pysad.synthesis.graph(G)

In [None]:
# Forwarded to pysad.collect.spiky_ball as its `exploration_depth` keyword.
exploration_depth = 5 # mention of mention of mention of ... up to exploration depth

In [None]:
# Run the spiky-ball collection on the synthetic graph handle.
# [20] is presumably the list of initial node ids — TODO confirm against
# pysad.collect.spiky_ball.
total_user_list, total_nodes_df, total_edges_df = pysad.collect.spiky_ball(
    [20],
    graph_handle,
    exploration_depth=exploration_depth,
    random_subset_size=10,
)

In [None]:
# Report how many entries the spiky-ball collection returned in total_user_list.
print('Total number of users mentioned:',len(total_user_list))

In [None]:
# Display the full list returned by the collection as the cell output.
total_user_list

In [None]:
import pysad.graph
import pysad.clusters

In [None]:

# The edge table is used exactly as collected; the node table goes through
# pysad.graph.reshape_node_data before being attached to the graph.
edge_df = total_edges_df
node_df = pysad.graph.reshape_node_data(total_nodes_df)

## Creating the graph from the collected data

In [None]:
# Re-execute the pysad.graph and pysad.collect modules so that edits made to
# their source are picked up without restarting the kernel.
import importlib
importlib.reload(pysad.graph)
importlib.reload(pysad.collect)

In [None]:
# Graph-construction parameters.
MIN_WEIGHT = 2  # passed to graph_from_edgeslist; presumably the minimal edge weight kept
MIN_DEGREE = 2  # Minimal number of connections in the graph
# Label of this run: it is used in the graph name, in the output file names
# and as the 'Search topic' of the cluster reports. It is defined here because
# no earlier cell sets it (a fresh kernel would otherwise raise NameError).
category_name = 'synthetic'

# Build the graph from the collected edges, attach the node data, then reduce
# it using MIN_DEGREE.
G = pysad.graph.graph_from_edgeslist(edge_df, MIN_WEIGHT)
G = pysad.graph.add_node_attributes(G, node_df)
G = pysad.graph.reduce_graph(G, MIN_DEGREE)

# Warning, graph properties are not saved by networkx in gexf files except graph name
date_format = "%d/%m/%Y"
# Series.min()/max() skip missing dates, unlike the builtin min()/max().
G.graph['start_date'] = edge_df['date'].min().strftime(date_format)
G.graph['end_date'] = edge_df['date'].max().strftime(date_format)
G.graph['name'] = category_name + ' ' + G.graph['start_date'] + ' - ' + G.graph['end_date']
print('Period from {} to {}.'.format(G.graph['start_date'], G.graph['end_date']))

In [None]:
# Two ways of handling the nodes for which no info was collected:
#   1) remove them from the graph, or
#   2) rerun the collection to fetch the missing info.
option = 1

# A node counts as "missing info" when its attribute dict has no 'name' key.
nodes_missing_info = []
for node_id, attributes in G.nodes(data=True):
    if 'name' not in attributes:
        nodes_missing_info.append(node_id)
print('Number of nodes with missing info:', len(nodes_missing_info))

In [None]:
if option == 1:
    # Option 1: drop every node for which no info was collected.
    print('Removing node with missing info from the graph')
    G.remove_nodes_from(nodes_missing_info)
    print('Number of nodes after removal:',G.number_of_nodes())
else:
    # Option 2: collect the missing node data.
    # NOTE(review): python_tweets and max_day_old are not defined anywhere in
    # this notebook; they must be provided before this branch can run.
    nb_missing = len(nodes_missing_info)
    collected_frames = []
    # Count from 1 so the progress line ends at "nb_missing / nb_missing".
    for idx, node in enumerate(nodes_missing_info, start=1):
        print('collecting info for {} - {} / {} '.format(node,idx,nb_missing))
        edgeinfo,nodeinfo = pysad.collect.collect_user_data(node,python_tweets,max_day_old)
        collected_frames.append(nodeinfo)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame inside the loop. pd.concat rejects an empty list, so
    # fall back to an empty frame when there was nothing to collect.
    # Assumes collect_user_data returns the node info as a DataFrame — TODO confirm.
    nodeinfo_df = pd.concat(collected_frames) if collected_frames else pd.DataFrame()

    nodeinfo_df = pysad.graph.reshape_node_data(nodeinfo_df)
    G = pysad.graph.add_node_attributes(G,nodeinfo_df)
    print('Node info added to the graph.')

# Check integrity: report any node that still has no 'name' attribute.
for node,data in G.nodes(data=True):
    if 'name' not in data:
        print('Missing information for',node)

In [None]:
# Display the node count of the graph after the missing-info handling above.
G.number_of_nodes()

In [None]:
# Display the identifiers of all nodes currently in the graph.
list(G.nodes)

In [None]:
# Display the degree of every node as a {node: degree} dict.
dict(G.degree())

## Community detection to get the clusters

In [None]:
# Re-execute the pysad.clusters and pysad.graph modules so that edits made to
# their source are picked up without restarting the kernel.
import importlib
importlib.reload(pysad.clusters)
importlib.reload(pysad.graph)

In [None]:
# Detect communities. The call returns the graph together with `clusters`,
# which later cells index by cluster id and use as graphs
# (clusters[c_id].number_of_nodes()).
G,clusters = pysad.graph.detect_communities(G)
# NOTE(review): this sets a plain Python attribute on the graph object rather
# than an entry of G.graph, so it is presumably not part of the gexf/json
# exports — confirm whether G.graph['nb_communities'] was intended.
G.nb_communities = len(clusters)
#c_connectivity = pysad.clusters.cluster_connectivity(G)

In [None]:
# Filter the graph by community size with min_size=10. A later cell guards
# against clusters with zero nodes "in case a cluster has been removed", so
# this call presumably also empties the matching entries of `clusters` — TODO confirm.
G = pysad.graph.remove_small_communities(G,clusters,min_size=10)

In [None]:
# Save the graph
import networkx as nx
import json

# Output location for every file written by this notebook. File names are
# appended by plain string concatenation, so the value must end with a path
# separator. It is defined here because no earlier cell sets it (a fresh
# kernel would otherwise raise NameError).
results_data_path = './'

min_mentions = graph_handle.rules['min_mentions']
graphname = category_name
# Common prefix of both exports: <path><name>_t<min_mentions>_md<MIN_DEGREE>
file_stem = results_data_path + graphname + '_t' + str(min_mentions) + '_md' + str(MIN_DEGREE)
graphfilename = file_stem + '_graph.gexf'
jsongraphfilename = file_stem + '_graph.json'

# Save as gexf file
nx.write_gexf(G,graphfilename)
print('Wrote',graphfilename)

# Save as json file (node-link representation of the graph)
Gnld = nx.readwrite.json_graph.node_link_data(G)
with open(jsongraphfilename, 'w') as outfile:
    json.dump(Gnld, outfile)
print('Wrote',jsongraphfilename)

In [None]:
#c_connectivity

## Analyze clusters

In [None]:
# Re-execute the pysad.clusters and pysad.utils modules so that edits made to
# their source are picked up without restarting the kernel.
import importlib
importlib.reload(pysad.clusters)
importlib.reload(pysad.utils)

## Automatic processing of all clusters

In [None]:
# Build, for every cluster id, a record holding its info tables and the base
# file name used for its exports. Clusters without nodes get an empty record.
cluster_info_dic = {}
for c_id in clusters:
    cluster_graph = clusters[c_id]
    if cluster_graph.number_of_nodes() == 0:
        # in case a cluster has been removed
        cluster_info_dic[c_id] = {}
    else:
        cluster_graph = pysad.clusters.cluster_attributes(cluster_graph)
        cluster_info_dic[c_id] = {
            'info_table': pysad.clusters.cluster_tables(cluster_graph),
            'filename': results_data_path + 'cluster' + str(c_id),
        }

In [None]:
# Global information computed over all clusters at once: keywords.
corpus = pysad.clusters.get_corpus(cluster_info_dic)
keyword_dic = pysad.clusters.tfidf(corpus)

# Store each cluster's keywords next to its other tables; clusters without
# nodes have no info table and are skipped.
for c_id in clusters:
    if clusters[c_id].number_of_nodes() == 0:
        continue
    tables = cluster_info_dic[c_id]['info_table']
    tables['keywords'] = keyword_dic[c_id]

In [None]:
# For every non-empty cluster record: assemble the report sheets, write them
# to an excel file and save the cluster graph as gexf.
for c_id, cluster_info in cluster_info_dic.items():
    if not cluster_info:
        # Empty record: nothing to export for this cluster.
        continue

    cluster_graph = clusters[c_id]
    info_table = cluster_info['info_table']

    # One-row summary of the cluster.
    cluster_general_info = {
        'cluster id': c_id,
        'Nb users': cluster_graph.number_of_nodes(),
        'Nb of tweets': cluster_graph.size(weight='weight'),
        'Start date': str(G.graph['start_date']),
        'End date': str(G.graph['end_date']),
        'Search topic': category_name,
    }
    cluster_general_df = pd.DataFrame.from_dict([cluster_general_info])

    # First sheet: summary, hashtags and keywords side by side.
    sheet1 = pd.concat(
        [cluster_general_df, info_table['hashtags'], info_table['keywords']],
        axis=1,
    )
    tweet_table = info_table['text']
    cluster_indicators = pd.DataFrame(
        [pysad.clusters.compute_cluster_indicators(cluster_graph)]
    )

    excel_data = {
        'cluster': sheet1,
        'tweets': tweet_table,
        'indicators': cluster_indicators,
        'users': node_df,
    }
    base_filename = cluster_info['filename']
    pysad.clusters.save_excel(excel_data, base_filename + '_infos.xlsx', table_format='Fanny')
    pysad.graph.save_graph(cluster_graph, base_filename + 'graph.gexf')

## Saving clusters info to be displayed with the graph

In [None]:
# Writing clusters info to the graph; a later cell reads it back from
# G.graph['clusters'].
G = pysad.clusters.clutersprop2graph(G,cluster_info_dic,clusters)

In [None]:
# Export the graph (now carrying the clusters info) in node-link JSON form.
import networkx as nx

graphname = 'graph'
# <path><name>_t<min_mentions>_md<MIN_DEGREE>_graph.json
jsongraphfilename = ''.join([
    results_data_path,
    graphname,
    '_t', str(min_mentions),
    '_md', str(MIN_DEGREE),
    '_graph.json',
])

Gnld = nx.readwrite.json_graph.node_link_data(G)
with open(jsongraphfilename, 'w') as json_file:
    json.dump(Gnld, json_file)
print('Wrote', jsongraphfilename)

In [None]:
# Export the clusters info stored on the graph to its own JSON file.
clusterinfotoviz = G.graph['clusters']
# <path><name>_t<min_mentions>_md<MIN_DEGREE>_clusters.json
jsonfilename = ''.join([
    results_data_path,
    graphname,
    '_t', str(min_mentions),
    '_md', str(MIN_DEGREE),
    '_clusters.json',
])

with open(jsonfilename, 'w') as json_file:
    json.dump(clusterinfotoviz, json_file)
print('Wrote', jsonfilename)