In [None]:
import pandas as pd
import json
import os
from datetime import date

# Collecting the data from Twitter

In [None]:
# Import the Twython class
from twython import Twython
import json

# Read the Twitter API keys from the local JSON credentials file
# (kept out of the notebook so that no secret is hardcoded here)
with open("twitter_credentials.json", "r") as creds_file:
    creds = json.load(creds_file)

# Build the Twython client from the consumer key / consumer secret pair
python_tweets = Twython(
    creds['CONSUMER_KEY'],
    creds['CONSUMER_SECRET'],
)


In [None]:
# Load the seed accounts. The file is parsed as JSON and is used below as a
# mapping from a category name to the list of usernames to start from.
with open('initial_accounts.txt') as json_file:
    initial_user_accounts = json.load(json_file)


######Choose a category##############
category_name = 'swiss_climate_controversial'
#category_name = 'swiss_climate_regular'
#category_name = 'french_tech_lesechos'
#category_name = 'swiss_immigration'
#####################################

if category_name not in initial_user_accounts:
    print('ERROR. Key "{}" is not in the list.'.format(category_name))
    print('Possible choices are: {}'.format(list(initial_user_accounts.keys())))
    # Stop the run here: continuing would leave `username_list` and `data_path`
    # undefined (or, worse, holding values from a previous run of this cell),
    # and the following cells would fail or silently use the wrong data.
    raise KeyError(category_name)

username_list = initial_user_accounts[category_name]

# Create the path to save the experiment, indexed with today's date (YYYYMMDD)
today = date.today()
date_string = today.strftime("%Y%m%d")
print("date string =", date_string)

# To reuse the data of a previous run, override the date string here, e.g.:
#date_string = '191128'

# NOTE: the folder name is the category name immediately followed by the date
data_path = category_name + date_string+ '/'
if not os.path.isdir(data_path):
    os.mkdir(data_path)
    print('Path created:',data_path)

In [None]:
import pysad

In [None]:
# Collection parameters, reported once so that they appear in the notebook output
max_day_old = 7  # how many days back in the past to cover, at most
thres = 2  # mentions needed, at least, for a mention to be kept

print('Threshold set to {} mentions.'.format(thres))
print('Number of days covered:', max_day_old)

In [None]:
# Collect the mention network hop by hop, starting from the seed accounts.
# NOTE(review): users_dic is not read anywhere in this cell; kept in case it is used elsewhere.
users_dic = {'username':[], 'Nb_mentions': [], 'mentions_of_mentions': []}
print('Collecting the tweets for the last {} days.'.format(max_day_old))
exploration_depth = 4  # number of hops explored from the seed accounts

# Work on copies: the seed list (`username_list`) is neither extended in place
# nor rebound, so re-running this cell starts again from the same seed accounts.
total_username_list = list(username_list)
users_to_process = list(username_list)
for depth in range(exploration_depth):
    print('')
    print('******* Processing users at {}-hop distance *******'.format(depth))
    new_users_list, users_df = pysad.process_user_list(python_tweets, data_path, users_to_process, thres=thres, max_day_old=max_day_old)
    # Next hop: the newly found users, minus the ones already collected
    users_to_process = list(set(new_users_list).difference(total_username_list))
    total_username_list += users_to_process


In [None]:
# Report the size of the collected list next to its number of distinct names;
# the two figures differ when the list contains duplicates.
print('Total number of users collected:')
n_collected = len(total_username_list)
n_distinct = len(set(total_username_list))
print(n_collected, n_distinct)

# Loading the saved data into an edge table

In [None]:
import glob

# Read every '*_mentions_t<thres>.csv' file found in data_path and stack them
# into a single edge table.
# DataFrame.append was removed in pandas 2.0 (and copies the whole table at each
# call), so the frames are collected in a list and concatenated once.
# The files are read in sorted order so that the row order is reproducible.
mention_files = sorted(glob.glob(data_path + '*_mentions' +'_t' +str(thres)+ '.csv'))
edge_frames = []
for filename in mention_files:
    new_edge_df = pd.read_csv(filename)
    print('{} with {} tweets.'.format(filename,len(new_edge_df)))
    edge_frames.append(new_edge_df)

# pd.concat raises on an empty list: fall back to an empty table when no file matches
edge_df = pd.concat(edge_frames) if edge_frames else pd.DataFrame()
    

In [None]:
# Display edges with number of hashtags > 1.
# The hashtags are counted as whitespace-separated tokens. Empty cells come back
# from read_csv as NaN, which has no .split(); they are treated as "no hashtag".
edge_df[edge_df['hashtags'].fillna('').astype(str).str.split().str.len() > 1]

In [None]:
# Minimal number of connections in the graph; also written into the name of the
# exported graph file.
DEGREE_MIN = 4

# Build the graph from the edge table, passing the minimal degree to pysad
G = pysad.graph_from_edgeslist(edge_df, DEGREE_MIN)

## Community detection

In [None]:
# Run pysad's community detection; G is replaced by the returned graph.
# Presumably this sets the 'community' node attribute that is read further down
# when the users are grouped by community (to be confirmed in pysad).
G = pysad.detect_communities(G)

In [None]:
# Base name of the exported graph file (used when building the .gexf file name)
graphname = 'AAgraph'
#graphname = 'GBRgraph'

In [None]:
# Save the graph
import networkx as nx

# The file name records the mention threshold (_t) and the minimal degree (_md)
# used to build the graph, so that different settings do not overwrite each other.
graphfilename = ''.join([
    data_path,
    graphname,
    '_t', str(thres),
    '_md', str(DEGREE_MIN),
    '_graph.gexf',
])
nx.write_gexf(G, graphfilename)
print('Wrote', graphfilename)

# Hashtags, dates and urls
Hashtags, dates and urls are on the edges of the network.
We can get the most common hashtags within a community, and also between communities, using the edges that connect them.

In [None]:
# Gather the hashtags, dates and urls from the graph. The three results feed the
# community tables built in the next cells (hashtags/dates table and url table).
tags_dic, dates_dic, url_dic = pysad.community_data(G)

In [None]:
# Build the table of dates and hashtags from the two dictionaries and display it.
# It is written to the 'Hashtags' sheet of the Excel file at the end of the notebook.
community_table = pysad.communities_date_hashtags(dates_dic, tags_dic)
community_table

### Process the urls

In [None]:
# Table of urls built from the url dictionary, passed through convert_bitly
# (judging by the helper name, this converts the bit.ly short links)
url_table = pysad.convert_bitly(pysad.communities_urls(url_dic))

# Same table passed through drop_twitter_urls; this is the one exported to Excel
filtered_url_table = pysad.drop_twitter_urls(url_table)

In [None]:
# Display the url table (the one given to drop_twitter_urls, not the filtered result)
url_table

# Sort users by community and save them in an Excel file

In [None]:
# Group the users by community, keeping for each one its node degree (importance).
# Result: {community number: [(user, degree), ...]}
community_nodes = {}
for user, attributes in G.nodes(data=True):
    members = community_nodes.setdefault(attributes['community'], [])
    members.append((user, G.degree(user)))


# Display the example of community c_idx
#c_idx = 0
#ddf = pd.DataFrame(community_nodes[c_idx],columns=['User','Degree'])
#print('list of most connected users in community',c_idx)
#ddf.sort_values(by='Degree',ascending=False).head(20)

In [None]:
# Save to an Excel file: one sheet per community (users sorted by decreasing
# degree), one sheet for the hashtags table and one for the list of urls.
# The engine is set explicitly because the column-width code below relies on
# `column_dimensions`, which is openpyxl API; without it pandas picks xlsxwriter
# for .xlsx files when that package is installed, and the cell fails.
with pd.ExcelWriter(data_path + 'graph_infos.xlsx', engine='openpyxl') as writer:
    for community_nb in community_nodes:
        ddf = pd.DataFrame(community_nodes[community_nb],columns=['User','Degree'])
        ddf = ddf.sort_values(by='Degree',ascending=False)#.head(20)
        ddf.to_excel(writer, sheet_name='Community_' + str(community_nb),index=False)
    community_table.to_excel(writer, sheet_name='Hashtags',index=False)
    #users_df.to_excel(writer, sheet_name='Initial_users_details',index=False)
    filtered_url_table.to_excel(writer, sheet_name='List_of_urls',index=False)

    # Set the same width on the first eight columns of every sheet
    column_width = 25
    for worksheet in writer.sheets.values():
        for col in ['A','B','C','D','E','F','G','H']:
            worksheet.column_dimensions[col].width = column_width
    # The url column is much wider than the others
    writer.sheets['List_of_urls'].column_dimensions['A'].width = 100