In [None]:
import pandas as pd
#import json
from datetime import date

# Collecting the data from Twitter

In [None]:
# Import the Twython class
from twython import Twython
import json

# Read the Twitter API credentials from a local JSON file so that no
# secret is hard-coded in the notebook.
with open("twitter_credentials.json", "r") as credentials_file:
    creds = json.load(credentials_file)

# Twython client built from the consumer key / secret pair; it is handed
# to the tweet collection step later in the notebook.
python_tweets = Twython(
    creds['CONSUMER_KEY'],
    creds['CONSUMER_SECRET'],
)


In [None]:
import pysad
import pysad.utils
import pysad.collect

In [None]:
import importlib
# Development helper: re-execute the top-level pysad package after editing
# its source, without restarting the kernel.
# NOTE(review): importlib.reload() only reloads the module it is given;
# already-imported submodules (pysad.utils, pysad.collect) are not reloaded
# by this call.
importlib.reload(pysad)

In [None]:
# Object giving access to the predefined seed accounts; the notebook calls
# its .list() and .accounts(category_name) methods in the following cells.
init_accounts = pysad.utils.initial_accounts()

In [None]:
# Display what init_accounts.list() returns — presumably the available
# category names that can be assigned to category_name (confirm in pysad.utils).
init_accounts.list()

In [None]:
###### Choose a category: exactly ONE line below must be uncommented ######
#category_name = 'swiss_climate_controversial'
#category_name = 'swiss_climate_regular'
#category_name = 'french_tech_lesechos'
#category_name = 'swiss_immigration'
#category_name = 'swiss_immigration2'
#category_name = 'debat_burqa'
# 'hackathlon' used to be assigned here as live code and was immediately
# overwritten by the next line; it is commented out like the other options.
#category_name = 'hackathlon'
category_name = 'hackathlon_popular'

#####################################

# Seed accounts registered under the selected category.
username_list = init_accounts.accounts(category_name)

# Each run is stored under a path indexed by today's date (YYYYMMDD).
today = date.today()
date_string = today.strftime("%Y%m%d")
print("date string =", date_string)

# Path components (root folder, category, date); they are passed to
# pysad.utils.initialize_folder in a later cell.
tweet_data_path_list = ['tweetdata', category_name, date_string]
results_data_path_list = ['resultsdata2', category_name, date_string]


In [None]:
import os

# Build the folder paths from their components. The previous ''.join(...)
# glued the parts together without any separator, which does not name the
# intended nested folder. The trailing '' makes os.path.join end the path
# with a separator, because later cells use these values as plain string
# prefixes (e.g. tweet_data_path + '*_mentions*.json').
# NOTE(review): both variables are reassigned from
# pysad.utils.initialize_folder in the following cell.
tweet_data_path = os.path.join(*tweet_data_path_list, '')
results_data_path = os.path.join(*results_data_path_list, '')

In [None]:
# Initialize folders (create or clean them if they exist)
# NOTE(review): "clean" presumably means an existing folder is emptied, so
# re-running this cell on the same day for the same category would discard
# previously collected files — confirm in pysad.utils.initialize_folder.
# The returned values are used by later cells as string prefixes for file names.
tweet_data_path = pysad.utils.initialize_folder(tweet_data_path_list)
results_data_path = pysad.utils.initialize_folder(results_data_path_list)

In [None]:
#username_list

In [None]:
# Parameters passed to pysad.collect.collect_tweets.
min_mentions = 2 # minimum number of mentions a user needs in order to be followed
max_day_old = None # maximum age of the tweets, in days (None presumably disables the limit — confirm in pysad.collect)
exploration_depth = 1 # how many levels of "mention of a mention" are explored

In [None]:
# Collect tweets starting from the seed accounts, using the Twython client
# and the parameters chosen above. The call receives tweet_data_path, and the
# JSON files found under that path are loaded later in the notebook.
# NOTE(review): this step queries the Twitter API and may be slow — confirm
# whether results can be reused instead of re-collected.
total_user_list = pysad.collect.collect_tweets(username_list, tweet_data_path, python_tweets, min_mentions=min_mentions,
               max_day_old=max_day_old, exploration_depth=exploration_depth)

In [None]:
# Number of entries in the list returned by collect_tweets.
len(total_user_list)

# Loading the saved data into an edge table

In [None]:
import glob

# Read every saved "*_mentions*.json" file under tweet_data_path and stack
# them into one edge table with a fresh 0..n-1 index.
# (To load a single mention threshold only, narrow the pattern to
#  '*_mentions' + '_t' + str(min_mentions) + '.json'.)
edge_frames = []
for filename in glob.glob(tweet_data_path + '*_mentions*' + '.json'):
    new_edge_df = pd.read_json(filename)
    print('{} with {} tweets.'.format(filename,len(new_edge_df)))
    edge_frames.append(new_edge_df)

# DataFrame.append was removed in pandas 2.0 and re-copied the whole table on
# every iteration; collect the pieces and concatenate once instead.
# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# no file matched (same result as before in that case).
edge_df = pd.concat(edge_frames, ignore_index=True) if edge_frames else pd.DataFrame()

In [None]:
# Keep only the rows whose retweet count is strictly above the threshold.
MIN_RETWEETS = 1000
is_popular = edge_df['retweet_count'] > MIN_RETWEETS
df_pop = edge_df.loc[is_popular]
df_pop

## Creating the graph

In [None]:
import pysad.graph
import pysad.clusters

In [None]:
import importlib
# Development helper: pick up edits made to pysad/graph on disk without
# restarting the kernel. Not needed on a fresh "Restart & Run All".
importlib.reload(pysad.graph)

In [None]:
DEGREE_MIN = 2 # Minimal number of connections in the graph

# Build the graph from the popular tweets only (df_pop); the commented-out
# line below builds it from the full edge table instead.
#G = pysad.graph.graph_from_edgeslist(edge_df,DEGREE_MIN)
G = pysad.graph.graph_from_edgeslist(df_pop,DEGREE_MIN)
G.name = category_name
# The period is stored as plain attributes on the graph object and read back
# when the per-cluster summary is written.
# NOTE(review): the dates come from edge_df (all loaded tweets) while the
# graph is built from df_pop — confirm the period is meant to describe the
# whole collection rather than the filtered tweets.
G.end_date = max(edge_df['date']) #max(edge_df['date'].apply(max))
G.start_date = min(edge_df['date']) #min(edge_df['date'].apply(min))
print('Period from {} to {}.'.format(G.start_date,G.end_date))

## Community detection to get the clusters

In [None]:
import importlib
# Development helper: reload the two submodules used for community detection
# after editing their source. Not needed on a fresh "Restart & Run All".
importlib.reload(pysad.clusters)
importlib.reload(pysad.graph)

In [None]:
# Run community detection. The call returns the graph (rebound to G) and
# `clusters`, which later cells index by cluster id to get each cluster's graph.
G,clusters = pysad.graph.detect_communities(G)
# Keep the number of detected communities on the graph object.
G.nb_communities = len(clusters)
# Connectivity information; it is indexed by cluster id when the per-cluster
# summary is assembled.
c_connectivity = pysad.clusters.cluster_connectivity(G)

In [None]:
# Drop communities smaller than min_size from the graph.
# NOTE(review): only G is reassigned here. `clusters`, `c_connectivity` and
# G.nb_communities were computed before this filtering and are not refreshed,
# so the later per-cluster loops may still process the small communities —
# confirm whether remove_small_communities also updates `clusters` in place.
G = pysad.graph.remove_small_communities(G,clusters,min_size=5)

In [None]:
# Save the graph
import networkx as nx

# The file name records the mention threshold and the minimum degree used
# to build the graph.
graphname = 'globalgraph_pop2'
graph_suffix = '_t' + str(min_mentions) + '_md' + str(DEGREE_MIN) + '_graph.gexf'
graphfilename = results_data_path + graphname + graph_suffix

# Export the whole graph in GEXF format.
nx.write_gexf(G, graphfilename)
print('Wrote', graphfilename)

In [None]:
# Display the connectivity information computed by
# pysad.clusters.cluster_connectivity.
c_connectivity

## Analyze clusters

In [None]:
import importlib
# Development helper: re-execute the top-level pysad package before the
# cluster analysis, which calls functions directly on `pysad`.
# NOTE(review): importlib.reload() only reloads the module it is given;
# submodules such as pysad.graph or pysad.clusters are not reloaded by this call.
importlib.reload(pysad)

## Automatic processing of all clusters

In [None]:
# For every cluster, compute its info tables and the file-name prefix used
# when exporting it; both are stored under the cluster id.
cluster_info_dic = {}
for c_id in clusters:
    cgraph = pysad.cluster_attributes(clusters[c_id])
    table_dic = pysad.cluster_tables(cgraph)
    cluster_filename = results_data_path + 'cluster' + str(c_id)
    cluster_info_dic[c_id] = {
        'info_table': table_dic,
        'filename': cluster_filename,
    }

In [None]:
# Adding global infos
# keywords
# Build a corpus from the per-cluster information, then derive keywords from
# it (presumably TF-IDF scores, going by the helper name — confirm in pysad).
# keyword_dic is indexed by cluster id when the summaries are assembled.
corpus = pysad.get_corpus(cluster_info_dic)
keyword_dic = pysad.tfidf(corpus)

In [None]:
# Assemble the summary of each cluster and export it:
# one Excel workbook (general info, tweets, indicators) and one graph file.
for c_id, cluster_entry in cluster_info_dic.items():
    info_table = cluster_entry['info_table']
    info_table['keywords'] = keyword_dic[c_id]
    cluster_graph = clusters[c_id]

    # One-row table with the general facts about this cluster.
    cluster_general_info = {
        'cluster id': c_id,
        'Nb users': cluster_graph.number_of_nodes(),
        'Nb of tweets': cluster_graph.size(weight='weight'),
        'Start date': str(G.start_date),
        'End date': str(G.end_date),
        'Search topic': category_name,
        'cluster connectivity': c_connectivity[c_id],
    }
    cluster_general_df = pd.DataFrame([cluster_general_info])

    # First sheet: general facts side by side with the hashtags and keywords tables.
    sheet1 = pd.concat(
        [cluster_general_df, info_table['hashtags'], info_table['keywords']],
        axis=1,
    )
    tweet_table = info_table['text']
    cluster_indicators = pd.DataFrame([pysad.compute_cluster_indicators(cluster_graph)])
    excel_data = {
        'cluster': sheet1,
        'tweets': tweet_table,
        'indicators': cluster_indicators,
    }

    # Both outputs share the prefix stored for this cluster.
    file_prefix = cluster_entry['filename']
    pysad.save_excel(excel_data, file_prefix + '_infos.xlsx', table_format='Fanny')
    pysad.save_graph(cluster_graph, file_prefix + 'graph.gexf')