In [None]:
import pandas as pd
#import json
from datetime import date
import os

In [None]:
# Load the project configuration through the local twitutils helper.
import twitutils
init_data = twitutils.init()
# Echo the configuration so the paths in use are visible in the notebook output.
print(init_data)

In [None]:
# The code requires the pysad module.
# This module has to be installed separately (see the README file).
# Once installed, the path where the pysad module is located must be set
# in the config.json file under 'pysad_path'.
import sys
sys.path.append(init_data['pysad_path'])

import pysad
#import pysad.utils
import pysad.collect
import pysad.twitter

# Collecting the data from Twitter

In [None]:
# Build the Twitter graph handle from the credentials file named in the configuration.
graph_handle = pysad.twitter.twitter_network(init_data['credentials_file'])

In [None]:
# Development helper: reload the modules so that edits made to them on disk
# are picked up without restarting the kernel.
import importlib
importlib.reload(pysad.collect)
importlib.reload(pysad.twitter)
importlib.reload(twitutils)

In [None]:
# Load the initial accounts from the accounts file named in the configuration.
init_accounts = pysad.twitter.initial_accounts(init_data['accounts_file'])

In [None]:
# Display what the accounts file contains (presumably the available categories; TODO confirm).
init_accounts.list()

In [None]:
# Choose a category
category_name = 'Benjamin'

# Accounts registered under that category; they are the starting point of the collection.
username_list = init_accounts.accounts(category_name)

In [None]:
# Development helper: reload pysad.collect to pick up edits made on disk.
import importlib
importlib.reload(pysad.collect)

In [None]:
# Rules stored on the graph handle, and depth of the exploration passed to the collection.
graph_handle.rules['min_mentions'] = 1 # minimal number of mentions of a user to be followed
graph_handle.rules['max_day_old'] = 10 # maximal number of days in the past
exploration_depth = 2 # mention of mention of mention of ... up to exploration depth

In [None]:
# Run the collection, starting from the accounts of the chosen category.
# It returns the list of users together with the node and edge tables.
total_user_list, total_nodes_df, total_edges_df = pysad.collect.spiky_ball(
    username_list,
    graph_handle,
    exploration_depth=exploration_depth,
    mode='constant',
    random_subset_size=200,
    spread_type='sharp',
    logger_level='verbose',
)

In [None]:
# Size of the user list returned by the collection.
print('Total number of users mentioned:',len(total_user_list))

In [None]:
# Save the collected data in json files.
# The output folders are indexed by category and by today's date.
today = date.today()
date_string = today.strftime("%Y%m%d")
print("date string =", date_string)

# Sub-folders shared by the tweet data and the results: <category>, <date>
run_subfolders = [category_name, date_string]
tweet_data_path_list = [init_data['data_path'], *run_subfolders]
results_data_path_list = [init_data['results_path'], *run_subfolders]

# Initialize folders (create or clean them if they exist).
# Set erase=False if you need to keep the previous collection.
tweet_data_path = twitutils.initialize_folder(tweet_data_path_list, erase=True)
results_data_path = twitutils.initialize_folder(results_data_path_list, erase=False)

# Write the node and edge tables to the tweet data folder.
pysad.collect.save_data(total_nodes_df, total_edges_df, tweet_data_path)

# Loading the saved data into an edge table

In [None]:
# Modules used for the graph construction and the cluster analysis.
import pysad.graph
import twitclusters
# Development helper: reload the modules to pick up edits made on disk.
import importlib
importlib.reload(pysad.twitter)
importlib.reload(twitclusters)

In [None]:
# Reload the node and edge tables from the folder written during the collection.
node_df, edge_df = pysad.collect.load_data(tweet_data_path)

# Date range covered by the tweets. Series.min()/.max() are vectorised and skip
# missing dates (NaT), whereas the builtin min()/max() iterate element by element
# in Python and can return NaT, which then fails on strftime.
tweet_dates = edge_df['date']
end_date = tweet_dates.max().strftime("%d/%m/%Y")
start_date = tweet_dates.min().strftime("%d/%m/%Y")
print('Range of tweets date from {} to {}'.format(start_date,end_date))

# Reshape both tables for the graph construction.
node_df = pysad.twitter.reshape_node_data(node_df)
edge_df = pysad.twitter.reshape_edge_data(edge_df,min_weight=2)

## Creating the graph

In [None]:
# Development helper: reload the modules to pick up edits made on disk.
import importlib
importlib.reload(pysad.graph)
importlib.reload(pysad.collect)

In [None]:
MIN_WEIGHT = 2
MIN_DEGREE = 2  # Minimal number of connections in the graph

# Build the graph from the edge table, then attach the edge and node attributes,
# reduce it with MIN_DEGREE and handle the spikyball neighbors.
G = pysad.graph.graph_from_edgeslist(edge_df, MIN_WEIGHT)
G = pysad.graph.add_edges_attributes(G, edge_df)
G = pysad.graph.add_node_attributes(G, node_df)
G = pysad.graph.reduce_graph(G, MIN_DEGREE)
G = pysad.graph.handle_spikyball_neighbors(G, graph_handle)

# Warning: graph properties are not saved by networkx in gexf files, except the graph name.
G.graph.update(end_date=end_date, start_date=start_date)
G.graph['name'] = category_name + ' ' + start_date + ' - ' + end_date
print('Period from {} to {}.'.format(start_date, end_date))

## Community detection to get the clusters

In [None]:
# Development helper: reload the modules to pick up edits made on disk.
import importlib
importlib.reload(twitclusters)
importlib.reload(pysad.graph)

In [None]:
# Detect the communities; `clusters` is indexed by cluster id in the cells below.
G,clusters = pysad.graph.detect_communities(G)
# NOTE(review): this is set as a plain attribute of the graph object, not as an
# entry of G.graph — confirm that it is read somewhere.
G.nb_communities = len(clusters)
#c_connectivity = pysad.clusters.cluster_connectivity(G)

In [None]:
# Remove the communities smaller than min_size (presumably counted in nodes; TODO confirm).
G = pysad.graph.remove_small_communities(G,clusters,min_size=2)

In [None]:
# Number of nodes in the graph at this stage.
G.number_of_nodes()

In [None]:
# Save the graph, both as a gexf file and as a json file
import networkx as nx
import json

min_mentions = graph_handle.rules['min_mentions']
graphname = category_name
# Common prefix of both files: <results folder>/<category>_t<min_mentions>_md<MIN_DEGREE>
graph_file_prefix = os.path.join(
    results_data_path,
    graphname + '_t' + str(min_mentions) + '_md' + str(MIN_DEGREE),
)
graphfilename = graph_file_prefix + '_graph.gexf'
jsongraphfilename = graph_file_prefix + '_graph.json'

# gexf export
nx.write_gexf(G, graphfilename)
print('Wrote',graphfilename)

# json export (node-link format)
Gnld = nx.readwrite.json_graph.node_link_data(G)
with open(jsongraphfilename, 'w') as outfile:
    json.dump(Gnld, outfile)
print('Wrote',jsongraphfilename)

In [None]:
#c_connectivity

## Analyze clusters

In [None]:
# Development helper: reload the modules to pick up edits made on disk.
import importlib
importlib.reload(twitclusters)
importlib.reload(twitutils)

## Automatic processing of all clusters

In [None]:
# Extract, for every cluster, its info tables and the base filename of its output files.
cluster_info_dic = {}
for c_id in clusters:
    cgraph = clusters[c_id]
    if not cgraph.number_of_nodes():
        # The cluster has been removed: keep an empty entry for its id.
        cluster_info_dic[c_id] = {}
    else:
        cgraph = twitclusters.cluster_attributes(cgraph)
        table_dic = twitclusters.cluster_tables(cgraph)
        cluster_filename = os.path.join(results_data_path, 'cluster' + str(c_id))
        cluster_info_dic[c_id] = {
            'info_table': table_dic,
            'filename': cluster_filename,
        }

In [None]:
# Add the global information: keywords computed over the corpus of all clusters.
corpus = twitclusters.get_corpus(cluster_info_dic)
keyword_dic = twitclusters.tfidf(corpus)

# Store the keywords in the info table of every non-empty cluster.
for c_id in clusters:
    if clusters[c_id].number_of_nodes() <= 0:
        continue  # empty clusters have no info table
    cluster_info_dic[c_id]['info_table']['keywords'] = keyword_dic[c_id]

In [None]:
# Gather the general information of each cluster and save it in an Excel file,
# then save the graph of the cluster as a gexf file.
for c_id, cluster_entry in cluster_info_dic.items():
    if not cluster_entry:
        continue  # empty entry: nothing to export for this cluster

    cluster_graph = clusters[c_id]
    info_table = cluster_entry['info_table']
    output_base = cluster_entry['filename']

    # One-row table describing the cluster as a whole.
    cluster_general_info = {
        'cluster id': c_id,
        'Nb users': cluster_graph.number_of_nodes(),
        'Nb of tweets': cluster_graph.size(weight='weight'),
        'Start date': str(G.graph['start_date']),
        'End date': str(G.graph['end_date']),
        'Search topic': category_name,
    }
    cluster_general_df = pd.DataFrame([cluster_general_info])

    # First sheet: general info side by side with the hashtags and the keywords.
    sheet1 = pd.concat(
        [cluster_general_df, info_table['hashtags'], info_table['keywords']],
        axis=1,
    )
    tweet_table = info_table['text']
    cluster_indicators = pd.DataFrame([twitclusters.compute_cluster_indicators(cluster_graph)])

    # NOTE(review): node_df is the full node table, so every cluster workbook
    # gets the same 'users' sheet — confirm that this is intended.
    excel_data = {
        'cluster': sheet1,
        'tweets': tweet_table,
        'indicators': cluster_indicators,
        'users': node_df,
    }
    twitclusters.save_excel(excel_data, output_base + '_infos.xlsx', table_format='Fanny')
    pysad.graph.save_graph(cluster_graph, output_base + 'graph.gexf')

## Saving clusters info to be displayed with the graph

In [None]:
# Write the clusters info to the graph (a later cell reads it from G.graph['clusters'],
# presumably filled in here; TODO confirm).
G = twitclusters.clutersprop2graph(G,cluster_info_dic,clusters)

In [None]:
# Save the graph as a json file
import json
import networkx as nx

graphname = 'graph'
# Build the path with os.path.join, like the first graph export does, so that the
# file lands inside results_data_path whether or not that path ends with a separator.
jsongraphfilename = os.path.join(
    results_data_path,
    graphname + '_t' + str(min_mentions) + '_md' + str(MIN_DEGREE) + '_graph.json',
)

# json export (node-link format)
Gnld = nx.readwrite.json_graph.node_link_data(G)
with open(jsongraphfilename, 'w') as outfile:
    json.dump(Gnld, outfile)
print('Wrote',jsongraphfilename)

In [None]:
# Save the clusters info as a json file
clusterinfotoviz = G.graph['clusters']
# Build the path with os.path.join so that the file lands inside results_data_path
# whether or not that path ends with a separator.
jsonfilename = os.path.join(
    results_data_path,
    graphname + '_t' + str(min_mentions) + '_md' + str(MIN_DEGREE) + '_clusters.json',
)

with open(jsonfilename, 'w') as outfile:
    json.dump(clusterinfotoviz, outfile)
print('Wrote',jsonfilename)
print('Wrote',jsonfilename)