In [None]:
import pandas as pd
import json
import os
from datetime import date

# Collecting the data from Twitter

In [None]:
# Import the Twython class
from twython import Twython
import json

# Read the Twitter consumer key/secret pair from the local JSON credentials file
with open("twitter_credentials.json", "r") as file:
    creds = json.load(file)

# Build the Twython client from the two consumer credentials
consumer_key = creds['CONSUMER_KEY']
consumer_secret = creds['CONSUMER_SECRET']
python_tweets = Twython(consumer_key, consumer_secret)


In [None]:
import pysad

In [None]:
import importlib
# Re-execute the pysad module so that edits made to its source are picked up
# without restarting the kernel
importlib.reload(pysad)

In [None]:
# Object returned by pysad.initial_accounts(); it is queried per category name
# further down via init_accounts.accounts(...)
init_accounts = pysad.initial_accounts()

In [None]:
# Display what init_accounts.list() returns
# (presumably the available category names — TODO confirm in pysad)
init_accounts.list()

In [None]:
###### Choose a category ##############
# Category names used so far (assign exactly one to category_name):
#   'swiss_climate_controversial'
#   'swiss_climate_regular'
#   'french_tech_lesechos'
#   'swiss_immigration'
#   'swiss_immigration2'
#   'debat_burqa'
category_name = 'swiss_immigration2'
#######################################

# Accounts returned by pysad for the chosen category
username_list = init_accounts.accounts(category_name)

# The experiment folders are indexed by today's date, formatted as YYYYMMDD
today = date.today()
date_string = today.strftime("%Y%m%d")
print("date string =", date_string)

# A fixed date can be assigned instead, e.g. date_string = '191128'

# Folder hierarchies, outermost level first: <root>/<category>/<date>
tweet_data_path_list, results_data_path_list = (
    [root_folder, category_name, date_string]
    for root_folder in ('tweetdata', 'resultsdata')
)

def initialize_folder(path_folder_list):
    """Create the nested folder described by ``path_folder_list`` and empty its last level.

    Each element of the list is one folder level. Levels are joined with '/'
    and created one after the other when they do not exist yet. When the last
    folder already exists, the files (and symlinks) located directly inside it
    are deleted. Subdirectories found there are left untouched: ``os.remove``
    cannot delete a directory and would abort the cleaning halfway.

    Args:
        path_folder_list: non-empty sequence of folder names, outermost first.

    Returns:
        The '/'-joined path of the last folder, including a trailing '/'.

    Raises:
        IndexError: if ``path_folder_list`` is empty.
    """
    # Fails with IndexError on an empty sequence, before anything is created
    last_folder = path_folder_list[-1]

    folder_concat = ''
    for folder in path_folder_list[:-1]:
        folder_concat += folder + '/'
        if not os.path.isdir(folder_concat):
            os.mkdir(folder_concat)
            print('Path created:',folder_concat)

    # Special treatment for the last folder: create it, or empty it if it exists
    folder_concat += last_folder + '/'
    if not os.path.isdir(folder_concat):
        os.mkdir(folder_concat)
        print('Path created:',folder_concat)
    else:
        for f in os.listdir(folder_concat):
            entry_path = os.path.join(folder_concat, f)
            # Only files and symlinks can be passed to os.remove
            if os.path.isfile(entry_path) or os.path.islink(entry_path):
                os.remove(entry_path)
        print('Cleaned path',folder_concat)
    return folder_concat

# Create (or empty) the tweet folder first, then the results folder
tweet_data_path, results_data_path = [
    initialize_folder(path_list)
    for path_list in (tweet_data_path_list, results_data_path_list)
]

In [None]:
# Display the accounts returned for the chosen category
username_list

In [None]:
# Parameters passed to pysad.collect_tweets in the next cell
min_mentions = 2 # minimal number of mentions a user needs in order to be followed
max_day_old = 7 # maximum number of days to look back in the past
exploration_depth = 2 # follow mentions of mentions of ... up to this depth

In [None]:
# Collect the tweets starting from the initial accounts.
# NOTE(review): presumably the collected data is written under tweet_data_path,
# since the mention files are read back from there later on — confirm in pysad.
total_user_list = pysad.collect_tweets(
    username_list,
    tweet_data_path,
    python_tweets,
    min_mentions=min_mentions,
    max_day_old=max_day_old,
    exploration_depth=exploration_depth,
)

In [None]:
# Number of entries in the list returned by pysad.collect_tweets
len(total_user_list)

# Loading the saved data into an edge table

In [None]:
import glob

# Read every '*_mentions_t<min_mentions>.json' file found in tweet_data_path
# and stack them into a single edge table.
# The frames are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and copied the whole table again at every iteration.
mention_files = glob.glob(tweet_data_path + '*_mentions' +'_t' +str(min_mentions)+ '.json')
edge_frames = []
for filename in mention_files:
    new_edge_df = pd.read_json(filename)
    print('{} with {} tweets.'.format(filename,len(new_edge_df)))
    edge_frames.append(new_edge_df)

# pd.concat raises on an empty list, so fall back to an empty table when no file matched
edge_df = pd.concat(edge_frames) if edge_frames else pd.DataFrame()
# Drop the per-file indices in favour of one continuous index
edge_df = edge_df.reset_index(drop=True)

In [None]:
# Display the combined edge table
edge_df

In [None]:
# Minimal number of connections in the graph
DEGREE_MIN = 2

# Build the graph from the edge table
G = pysad.graph_from_edgeslist(edge_df, DEGREE_MIN)

# Tag the graph with the topic and with the earliest / latest value of the 'date' column
tweet_dates = edge_df['date']
G.name = category_name
G.start_date = min(tweet_dates)
G.end_date = max(tweet_dates)
print('Period from {} to {}.'.format(G.start_date,G.end_date))

## Community detection and save full graph

In [None]:
# Run pysad's community detection. It returns the graph together with `clusters`,
# which is indexed by cluster id in the cells below.
# NOTE(review): G is both the input and the output here, so re-running this cell
# feeds the already processed graph back in.
G,clusters = pysad.detect_communities(G)

In [None]:
# Save the graph
import networkx as nx

# The file name encodes the mention threshold (t) and the minimal degree (md)
graphname = 'globalgraph'
graphfilename = ''.join([
    results_data_path, graphname,
    '_t', str(min_mentions),
    '_md', str(DEGREE_MIN),
    '_graph.gexf',
])

# Export in GEXF format (write_yaml, write_pajek and node_link_data were tried as alternatives)
nx.write_gexf(G, graphfilename)
print('Wrote',graphfilename)

## Analyze clusters

In [None]:
import importlib
# Re-execute the pysad module again so that edits made to its source since the
# last reload are picked up before the cluster analysis
importlib.reload(pysad)

## Automatic processing of all clusters

In [None]:
# For every cluster, compute its tables and remember the file prefix used for its outputs
cluster_info_dic = {}
for c_id in clusters:
    cgraph = pysad.cluster_attributes(clusters[c_id])
    table_dic = pysad.cluster_tables(cgraph)
    cluster_filename = results_data_path + 'cluster' + str(c_id)
    cluster_info_dic[c_id] = {
        'info_table': table_dic,
        'filename': cluster_filename,
    }

In [None]:
# Adding global infos
# keywords: build a corpus from all the clusters, then run pysad.tfidf on it.
# The result is indexed by cluster id in the next cell.
corpus = pysad.get_corpus(cluster_info_dic)
keyword_dic = pysad.tfidf(corpus)

In [None]:
# For each cluster: attach its keywords, assemble the summary sheet and the tweet
# sheet, then write the Excel report and the cluster graph to disk
for c_id, cluster_entry in cluster_info_dic.items():
    info_table = cluster_entry['info_table']
    info_table['keywords'] = keyword_dic[c_id]
    cluster_graph = clusters[c_id]

    # One-row table of general facts about the cluster
    cluster_general_info = {
        'cluster id': c_id,
        'Nb users': cluster_graph.number_of_nodes(),
        'Nb of tweets': cluster_graph.size(weight='weight'),
        'Start date': str(G.start_date),
        'End date': str(G.end_date),
        'Search topic': category_name,
    }
    cluster_general_df = pd.DataFrame.from_dict([cluster_general_info])

    # First sheet: general facts side by side with the hashtag and keyword tables
    sheet1 = pd.concat(
        [cluster_general_df, info_table['hashtags'], info_table['keywords']],
        axis=1,
    )
    excel_data = {'cluster': sheet1, 'tweets': info_table['text']}

    output_prefix = cluster_entry['filename']
    pysad.save_excel(excel_data, output_prefix + '_infos.xlsx', table_format='Fanny')
    pysad.save_graph(cluster_graph, output_prefix + 'graph.gexf')