In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# The Colab auth helper runs first; the application-default credentials it
# leaves behind are then handed to PyDrive (the order of these calls matters).
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the download cell uses to fetch files by Drive id.
drive = GoogleDrive(gauth)

In [0]:
# Every input file this notebook fetches from Google Drive, as
# (Drive file id, local filename) pairs in download order.
# A single table replaces the copy-pasted download stanzas and avoids
# rebinding the builtin `id` as a global string.
DRIVE_FILES = (
    # user clusters
    ('17u8LVQuGG5QpxhqyLrbpFvd9WJ618DRT', 'default_user_groups.csv'),
    ('1cNEL_6d5bl1uy22cznhXDJYiW7s7Od3e', 'louvain_user_groups.csv'),
    ('1-Ljr2TGADGkhSpKGC7eYbRa-IfM2BU11', 'kmeans_user_groups.csv'),
    # consolidated data
    ('1QmKZzdrnNfSZ0RWrW89d6V_SG8cp03AK', 'consolidated_tweets_df.csv'),
    ('1_pp8dvoiEjWRRAu48KwcSYBsTcLKywUU', 'consolidated_retweets_df.csv'),
    # topic clusters
    ('12Vy0ZqeL7w4tuv7_-i0amHkt2pC0dxO0', 'res_cluster.csv'),
    ('13X3b0q3l1CfiMiKD5bRQgA_YU1ldCp6N', 'results_basic_lda.csv'),
    ('1oQA5tyGO_G6yxqZ4YVeSU7JgZgDK2PPL', 'results_equal_cluster.csv'),
)

for file_id, local_name in DRIVE_FILES:
  # Look the file up by id and save its content to the working directory.
  downloaded = drive.CreateFile({'id': file_id})
  downloaded.GetContentFile(local_name)

In [0]:
# Let's import the libraries we will need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Load the consolidated tweets. Id columns are read as text (not numbers),
# the creation timestamp is parsed into datetimes, and the two bookkeeping
# columns are dropped straight away.
tweets_data = pd.read_csv(
    "consolidated_tweets_df.csv",
    lineterminator='\n',
    dtype={
        "tweet_id": object,
        "article_id": object,
        "tweet_created_at": str,
        "user_id": str,
    },
    parse_dates=['tweet_created_at'],
).drop(columns=["Unnamed: 0", 'test'])

print(tweets_data.shape)
tweets_data.head()

(213823, 6)


Unnamed: 0,article_id,tweet_id,tweet_created_at,retweet_count,user_id,cold_user
0,politifact13013,771487002575286273,2016-09-01 23:16:49+00:00,0,313584104,1
1,politifact13013,771271973229826048,2016-09-01 09:02:22+00:00,0,10480592,1
2,politifact13013,771288677557280768,2016-09-01 10:08:45+00:00,0,175844074,1
3,politifact13013,771223329214566400,2016-09-01 05:49:04+00:00,0,2577100238,1
4,politifact13013,771475597918740481,2016-09-01 22:31:30+00:00,0,453088398,1


In [5]:
# Load the consolidated retweets. Id columns are read as text, the retweet
# timestamp is parsed into datetimes, and the two bookkeeping columns are
# dropped straight away.
retweets_data = pd.read_csv(
    "consolidated_retweets_df.csv",
    dtype={
        "tweet_id": str,
        "retweet_id": str,
        "retweet_created_at": str,
        "retweet_user_id": str,
    },
    parse_dates=['retweet_created_at'],
).drop(columns=['Unnamed: 0', 'test'])

print(retweets_data.shape)
retweets_data.head()

(32646, 8)


Unnamed: 0,article_id,tweet_id,retweet_id,retweet_count,retweet_created_at,retweet_user_id,is_quote_status,cold_user
0,politifact13013,771417626513772544,771420531291787264,1,2016-09-01 18:52:41+00:00,49053573,0.0,0
1,politifact13013,771130571489550336,771130907533139973,1,2016-08-31 23:41:49+00:00,2935416411,0.0,1
2,politifact13013,773849741524344832,773871933574279168,1,2016-09-08 13:13:41+00:00,862238221,0.0,1
3,politifact13013,771316782934155264,771585933787340800,1,2016-09-02 05:49:56+00:00,318417786,0.0,1
4,politifact13013,771078878529892353,771079225751339009,1,2016-08-31 20:16:28+00:00,2611772656,0.0,1


In [6]:
# Load the article -> cluster assignment and expose the cluster column
# under the name 'Topic', which the later cells refer to.
article_topics = (
    pd.read_csv("results_equal_cluster.csv", header=0)
    .rename(columns={'cluster': 'Topic'})
)
article_topics.head()

Unnamed: 0,article_id,Topic
0,politifact1820,2
1,politifact537,2
2,politifact12104,2
3,politifact12755,2
4,politifact1216,2


In [0]:
# Attach each tweet's article topic, derive the calendar month of the tweet,
# and reduce to distinct (user_id, article_id, month, Topic) rows.
#
# The month is taken with the vectorised `.dt` accessor instead of a row-wise
# `apply(axis=1)`; this relies on `tweet_created_at` being a datetime column
# (the load cell passes it to `parse_dates`).
# NOTE(review): `month` is 1-12 with no year component, so the same month in
# different years collapses together - confirm that is intended.
tweet_user_articles = (
    tweets_data
    .join(article_topics.set_index("article_id"), on='article_id')
    .assign(month=lambda frame: frame['tweet_created_at'].dt.month)
    [['user_id', 'article_id', 'month', 'Topic']]
    .drop_duplicates()
)

In [0]:
# Same reduction as for tweets, applied to retweets: attach the article topic,
# derive the calendar month of the retweet, keep distinct
# (retweet_user_id, article_id, month, Topic) rows, and finally rename the
# user column to 'user_id' so the frame lines up with the tweet-based one.
#
# The month is taken with the vectorised `.dt` accessor instead of a row-wise
# `apply(axis=1)`; this relies on `retweet_created_at` being a datetime column
# (the load cell passes it to `parse_dates`).
retweet_user_articles = (
    retweets_data
    .join(article_topics.set_index("article_id"), on='article_id')
    .assign(month=lambda frame: frame['retweet_created_at'].dt.month)
    [['retweet_user_id', 'article_id', 'month', 'Topic']]
    .drop_duplicates()
    .rename(columns={'retweet_user_id': 'user_id'})
)

In [0]:
# Stack the tweet-based and retweet-based interaction rows into one frame,
# discarding both original indexes in favour of a fresh 0..n-1 index.
user_articles_bytime = pd.concat(
    [tweet_user_articles, retweet_user_articles],
    ignore_index=True,
)

In [10]:
# Preview the first rows of the stacked interaction frame.
user_articles_bytime.head()

Unnamed: 0,user_id,article_id,month,Topic
0,313584104,politifact13013,9,2
1,10480592,politifact13013,9,2
2,175844074,politifact13013,9,2
3,2577100238,politifact13013,9,2
4,453088398,politifact13013,9,2


In [0]:
# Load the user -> group assignment and normalise its columns:
#   * 'user_id' is read as text so it matches the ids in the interaction frame;
#   * a stray 'Unnamed: 0' column is dropped if the CSV carries one;
#   * a 'cluster' column, if present, is exposed as 'user_group'.
user_groups = (
    pd.read_csv('kmeans_user_groups.csv', dtype={"user_id": str}, header=0)
    # errors='ignore' makes the drop a no-op when the column is absent.
    .drop(columns=["Unnamed: 0"], errors='ignore')
    # rename skips labels that are not present, so no guard is needed.
    .rename(columns={'cluster': 'user_group'})
)
# Preview last so it is actually rendered and shows the final column names.
user_groups.head()

In [12]:
# Tag every interaction row with the group of its user. A left join keeps
# interactions whose user has no entry in `user_groups`.
user_group_articles = user_articles_bytime.join(
    user_groups.set_index('user_id'),
    on='user_id',
    how='left',
)
user_group_articles.head()

Unnamed: 0,user_id,article_id,month,Topic,user_group
0,313584104,politifact13013,9,2,195
1,10480592,politifact13013,9,2,195
2,175844074,politifact13013,9,2,195
3,2577100238,politifact13013,9,2,195
4,453088398,politifact13013,9,2,195


In [0]:
# Number of distinct articles each user group touched within each topic.
user_group_topics = (
    user_group_articles
    .groupby(['user_group', 'Topic'])['article_id']
    .nunique()
    .reset_index()
    .rename(columns={'article_id': 'num_of_articles_in_topic'})
)

# Number of distinct articles each user group touched overall, as a plain
# {user_group: count} dict. Built directly from the grouped Series rather
# than via an intermediate DataFrame and a row-by-row `iterrows` pass.
user_group_totals = (
    user_group_articles
    .groupby('user_group')['article_id']
    .nunique()
    .to_dict()
)

In [0]:
# Share of a group's distinct articles that fall in each topic:
#   prob = num_of_articles_in_topic / total distinct articles of that group.
# The per-group total is looked up with a vectorised `map` over the
# `user_group_totals` dict instead of a row-wise `apply(axis=1)`.
user_group_topics['prob'] = (
    user_group_topics['num_of_articles_in_topic']
    / user_group_topics['user_group'].map(user_group_totals)
)

In [0]:
# Reshape the long (user_group, Topic, prob) table into a nested dict:
#   {user_group: {topic: prob, ..., 'best_topic_prob': p, 'best_topic': t}}
# where 'best_topic' is the topic with the highest prob seen for that group
# (the first one wins on ties, because the comparison is strict).
user_group_topics_dict = {}
for _, record in user_group_topics.iterrows():
  group_key = record['user_group']
  topic_key = record['Topic']
  topic_prob = record['prob']
  # First row for a group: create its entry with a 0.0 best-so-far sentinel.
  group_stats = user_group_topics_dict.setdefault(group_key, {'best_topic_prob': 0.0})
  group_stats[topic_key] = topic_prob
  if topic_prob > group_stats['best_topic_prob']:
    group_stats['best_topic_prob'] = topic_prob
    group_stats['best_topic'] = topic_key

In [16]:
# Number of user groups that received a topic distribution.
len(user_group_topics_dict)

200

In [17]:
from collections import Counter
# Tally how many user groups have each topic as their 'best_topic'.
Counter(group_stats['best_topic'] for group_stats in user_group_topics_dict.values())

Counter({0.0: 33, 1.0: 43, 2.0: 83, 3.0: 23, 4.0: 18})

In [0]:
# Distinct topic labels present in the article -> topic table.
all_topics = article_topics['Topic'].unique()

In [19]:
# Show the distinct topic labels.
all_topics

array([2, 4, 0, 3, 1])

In [0]:
# Give every user group an explicit entry for every topic: topics the group
# never touched get 0, existing entries are left unchanged.
for group_stats in user_group_topics_dict.values():
  for topic_label in all_topics:
    group_stats.setdefault(topic_label, 0)

In [0]:
import pickle
# Persist the per-group topic distribution to disk with the newest pickle
# protocol available.
with open('kmeans_bal_lda_cf.pickle', mode='wb') as pickle_file:
    pickle.dump(
        user_group_topics_dict,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )