In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

## Read in dataframes

In [2]:
# Load the four pickled dataframes used to build the network model.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
personalities, favorites, profiles, topics = map(
    pd.read_pickle,
    (
        'personalities.pkl',
        'favorites_cleaned_lemma.pkl',
        'profiles.pkl',
        'model_data.pkl',
    ),
)

In [3]:
# preview the first row of the personalities dataframe to check its columns
personalities.head(1)

Unnamed: 0,user_id,all_tweets,Openness,Conscientiousness,Extraversion,Agreeableness,Emotional range
0,2649540547,"@nbcsnl ok this is legendary, 𝗙𝗹𝗶𝗽𝗽𝗲𝗱 𝘁𝗵𝗲 𝘀𝘄𝗶𝘁...",0.730236,0.334933,0.411628,0.106015,0.552724


In [4]:
# preview the first row of the favorites dataframe to check its columns
favorites.head(1)

Unnamed: 0,favorited_by_id,id,created_at,screen_name,user_id,in_reply_to_status_id,in_reply_to_screen_name,in_reply_to_user_id,favorite_count,retweet_count,text
0,2649540547,1236533635290890240,Sun Mar 08 06:06:06 +0000 2020,AOC,138203134,1.236524e+18,nbcsnl,28221296.0,115728,3550,ok this is legendary


In [5]:
# preview the first row of the profiles dataframe to check its columns
profiles.head(1)

Unnamed: 0,user_id,screen_name,followers_count,friends_count,favourites_count,statuses_count,follower_ids,friend_ids
0,2649540547,baka_brooks,234,257,4931,218,"[1094750013304029187, 1235337664083222528, 133...","[1235337664083222528, 14372486, 11598303501027..."


In [6]:
# preview the first row of the topics dataframe to check its columns
topics.head(1)

Unnamed: 0,id,created_at,screen_name,user_id,in_reply_to_status_id,in_reply_to_screen_name,in_reply_to_user_id,favorite_count,retweet_count,text,...,vader_neu,vader_compound,clusters,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6
0,1236533635290890240,Sun Mar 08 06:06:06 +0000 2020,AOC,138203134,1.236524e+18,nbcsnl,28221296.0,115728,3550,ok this is legendary,...,0.577,0.296,2,0.00031,0.0008,0.000266,0.000556,0.000949,9e-05,0.000194


## Clean and merge dataframes

In [7]:
# Set the follower/friend id lists aside (keyed by user_id) so they can be
# re-attached after aggregation, then remove them from the profile table.
followers = profiles.loc[:, ['user_id', 'follower_ids', 'friend_ids']]
# every column of `followers` except the key is one of the id-list columns
profiles = profiles.drop(columns=followers.columns.drop('user_id'))

In [8]:
# Join profile counts with personality scores on user_id (the raw tweet text
# column is left out), keeping the first row for any repeated user.
users = (
    profiles
    .merge(personalities.drop(columns='all_tweets'), on='user_id')
    .drop_duplicates(subset='user_id', keep='first')
)
print(users.shape)
users.head()

(179, 11)


Unnamed: 0,user_id,screen_name,followers_count,friends_count,favourites_count,statuses_count,Openness,Conscientiousness,Extraversion,Agreeableness,Emotional range
0,2649540547,baka_brooks,234,257,4931,218,0.730236,0.334933,0.411628,0.106015,0.552724
3,1094750013304029187,Kwammentary,534,1748,698,210,0.906036,0.550105,0.648426,0.297694,0.500949
4,1235337664083222528,D1_data,5,64,7,1,0.708852,0.768717,0.294999,0.15403,0.112169
5,1333491954,lolegra,169,346,2614,2647,0.518856,0.069776,0.637403,0.241616,0.757079
6,1219308741369253894,stealsdeals12,93,1150,1130,878,0.46632,0.706029,0.187577,0.058168,0.317718


In [9]:
# tweet metadata columns that are not needed as features
drop_cols = ['id', 'created_at', 'screen_name', 'user_id', 'in_reply_to_status_id',
             'in_reply_to_screen_name', 'in_reply_to_user_id', 'text', 'type',
             'favorite_count', 'retweet_count']

# Put the favoriting user's id next to the per-tweet sentiment/cluster/topic
# columns, then discard the metadata.
# NOTE(review): an axis=1 concat pairs rows by index label, so this presumably
# relies on `topics` sharing its index with `favorites` - confirm.
tweets = (
    pd.concat([favorites['favorited_by_id'], topics], axis=1)
    .drop(columns=drop_cols)
)
print(tweets.shape)
tweets.head()

(547463, 15)


Unnamed: 0,favorited_by_id,textblob_polarity,textblob_subjectivity,vader_neg,vader_pos,vader_neu,vader_compound,clusters,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6
0,2649540547,0.75,0.75,0.0,0.423,0.577,0.296,2,0.00031,0.0008,0.000266,0.000556,0.000949,9e-05,0.000194
1,2649540547,0.0,0.0,0.0,0.0,1.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2649540547,-0.075,0.825,0.096,0.22,0.683,0.4588,1,0.058949,0.0,0.0,0.003409,0.013943,0.092806,0.000899
3,2649540547,0.5,0.5,0.268,0.0,0.732,-0.6701,2,0.000236,0.000508,0.000718,0.000266,0.004143,0.000158,0.000368
4,2649540547,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Aggregate tweet attributes - sentiment, topics, clusters

In [70]:
# average VADER compound sentiment score per user per cluster
avg_sentiment = tweets.groupby(['favorited_by_id', 'clusters'], as_index=False).vader_compound.mean()

# Reshape to one row per user and one column per cluster; a user with no
# favorites in a given cluster gets NaN in that column.
avg_sentiment_pivot = (
    avg_sentiment
    .pivot_table(index='favorited_by_id', columns='clusters', values='vader_compound')
    # Name each column after the cluster label it actually holds. Assigning
    # seven names by position raises a length mismatch when a cluster is
    # absent and mislabels columns if the labels are not exactly 0-6.
    .rename(columns=lambda cluster: 'polarity_cluster_' + str(int(cluster)))
    .reset_index()
    .rename_axis(None, axis=1)
    .rename(columns={'favorited_by_id': 'user_id'})
)
avg_sentiment_pivot.head()

Unnamed: 0,user_id,polarity_cluster_0,polarity_cluster_1,polarity_cluster_2,polarity_cluster_3,polarity_cluster_4,polarity_cluster_5,polarity_cluster_6
0,3840,0.112737,0.398482,0.187999,0.201818,0.222017,0.139913,0.683018
1,1300301,,,0.23512,,,,
2,15043664,0.143196,0.353876,0.090186,0.093165,0.145393,0.093893,0.604089
3,17790052,0.1204,0.1107,0.057933,0.18715,-0.066878,-0.089583,0.7278
4,18210249,0.077132,0.431636,0.171286,0.111314,0.211751,0.225514,0.6309


In [73]:
# Topic weight columns present in the tweet-level frame (derived from the
# data instead of assuming exactly seven topics).
topic_cols = [col for col in tweets.columns if col.startswith('topic_')]

# Mean topic weight across each user's favorited tweets. Only the topic
# columns are aggregated: averaging the whole frame would also average the
# categorical `clusters` label and the sentiment columns, which are discarded
# anyway, and would fail if a non-numeric column were ever present.
topic_weights = (
    tweets
    .groupby('favorited_by_id', as_index=False)[topic_cols]
    .mean()
    .rename(columns={'favorited_by_id': 'user_id'})
)
topic_weights.head()

Unnamed: 0,user_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6
0,3840,0.002816,0.001711,0.004604,0.003548,0.009551,0.003871,0.00319
1,1300301,6.6e-05,0.000297,0.000599,0.000479,0.021437,0.00023,0.00039
2,15043664,0.004078,0.005008,0.003998,0.004637,0.010946,0.002853,0.002506
3,17790052,0.004045,0.006401,0.003713,0.006786,0.010579,0.003679,0.001989
4,18210249,0.002655,0.00232,0.004433,0.003029,0.011045,0.003382,0.003231


In [74]:
# number of favorited tweets per user per cluster
# (counts non-null topic_0 values within each group)
clusters_count = (
    tweets
    .groupby(['favorited_by_id', 'clusters'], as_index=False)
    .topic_0.count()
    .rename(columns={'topic_0': 'tweets_per_topic'})
)

# total number of favorited tweets per user
tweets_count = (
    tweets
    .groupby('favorited_by_id', as_index=False)
    .topic_0.count()
    .rename(columns={'topic_0': 'total_tweets'})
)

# share of each user's favorites that falls in each cluster (long format);
# the raw counts are dropped once the share has been computed
tweets_agg = (
    clusters_count
    .merge(tweets_count, on='favorited_by_id')
    .assign(perc_of_tweets=lambda df: df['tweets_per_topic'] / df['total_tweets'])
    .drop(columns=['tweets_per_topic', 'total_tweets'])
)

# Reshape to one row per user and one column per cluster; clusters a user
# never favorited are NaN here.
tweets_agg_pivot = (
    tweets_agg
    .pivot_table(index='favorited_by_id', columns='clusters', values='perc_of_tweets')
    # Name each column after the cluster label it actually holds. Assigning
    # seven names by position raises a length mismatch when a cluster is
    # absent and mislabels columns if the labels are not exactly 0-6.
    .rename(columns=lambda cluster: 'perc_cluster_' + str(int(cluster)))
    .reset_index()
    .rename_axis(None, axis=1)
    .rename(columns={'favorited_by_id': 'user_id'})
)
tweets_agg_pivot.head()

Unnamed: 0,user_id,perc_cluster_0,perc_cluster_1,perc_cluster_2,perc_cluster_3,perc_cluster_4,perc_cluster_5,perc_cluster_6
0,3840,0.0259,0.039166,0.822489,0.028427,0.041377,0.015161,0.027479
1,1300301,,,1.0,,,,
2,15043664,0.015449,0.057934,0.78822,0.035404,0.033151,0.051497,0.018346
3,17790052,0.02027,0.060811,0.760135,0.054054,0.030405,0.060811,0.013514
4,18210249,0.023082,0.036577,0.833452,0.022727,0.036222,0.023082,0.024858


## Merge all attributes to user dataframe

In [86]:
# Build the full user-attribute table used for recommendation:
#   1. join the cluster shares, cluster sentiment and topic weights onto users
#   2. drop the numeric user_id, which is an identifier rather than a feature
#   3. replace NaNs with zero
#   4. index the rows by screen_name
rec_df = (
    users
    .merge(tweets_agg_pivot, on='user_id')
    .merge(avg_sentiment_pivot, on='user_id')
    .merge(topic_weights, on='user_id')
    .drop(columns='user_id')
    .fillna(0)
    .set_index('screen_name')
)
rec_df.head()

Unnamed: 0_level_0,followers_count,friends_count,favourites_count,statuses_count,Openness,Conscientiousness,Extraversion,Agreeableness,Emotional range,perc_cluster_0,...,polarity_cluster_4,polarity_cluster_5,polarity_cluster_6,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6
screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
baka_brooks,234,257,4931,218,0.730236,0.334933,0.411628,0.106015,0.552724,0.0175,...,0.11101,0.2468,0.701033,0.003812,0.003851,0.005512,0.003765,0.011529,0.003104,0.002603
Kwammentary,534,1748,698,210,0.906036,0.550105,0.648426,0.297694,0.500949,0.027994,...,-0.09288,-0.154167,0.353436,0.003061,0.00168,0.00339,0.003421,0.010109,0.00458,0.001993
D1_data,5,64,7,1,0.708852,0.768717,0.294999,0.15403,0.112169,0.0,...,0.0,0.0,0.0,6.2e-05,0.001151,0.000587,0.000459,0.004842,0.000402,0.000189
lolegra,169,346,2614,2647,0.518856,0.069776,0.637403,0.241616,0.757079,0.016129,...,0.076844,0.070533,0.665398,0.004961,0.006937,0.003897,0.004396,0.009471,0.0035,0.003333
stealsdeals12,93,1150,1130,878,0.46632,0.706029,0.187577,0.058168,0.317718,0.023339,...,0.246142,0.098733,0.691669,0.003849,0.002078,0.006285,0.005339,0.012632,0.004387,0.001792


## Scale features

In [87]:
# scaler with default settings; it is fit on the user-attribute table below
scaler = StandardScaler()

In [89]:
# fit the scaler on every column of rec_df and transform it in one step;
# the result is a plain NumPy array, so row and column labels are not carried over
scaled_attrs = scaler.fit_transform(rec_df)
scaled_attrs

array([[-0.26299706, -0.40370641, -0.13598679, ...,  0.75297236,
        -0.23149431, -0.21825997],
       [ 0.42954166,  2.4048539 , -0.56809406, ...,  0.24735729,
         0.35089914, -0.56554206],
       [-0.79163495, -0.7672558 , -0.63863177, ..., -1.62807489,
        -1.29797715, -1.5917868 ],
       ...,
       [-0.14295702, -0.18708371, -0.16763178, ..., -0.48018831,
        -0.1731547 , -0.24558275],
       [-0.32532555, -0.3114063 , -0.53379497, ...,  0.31146997,
        -0.01632769, -0.32038829],
       [-0.47306714, -0.52991267, -0.50174166, ...,  0.32338582,
         1.15058184, -0.04580916]])

In [90]:
# convert scaled dataset back to a dataframe
rec_df_scaled = pd.DataFrame(scaled_attrs, index=rec_df.index, columns=rec_df.columns)
rec_df_scaled.head()

Unnamed: 0_level_0,followers_count,friends_count,favourites_count,statuses_count,Openness,Conscientiousness,Extraversion,Agreeableness,Emotional range,perc_cluster_0,...,polarity_cluster_4,polarity_cluster_5,polarity_cluster_6,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6
screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
baka_brooks,-0.262997,-0.403706,-0.135987,-0.520177,0.721698,0.204247,-0.559164,-1.269132,0.029402,-0.208599,...,-0.011322,1.415707,0.681858,0.180391,-0.020598,0.688453,-0.162659,0.752972,-0.231494,-0.21826
Kwammentary,0.429542,2.404854,-0.568094,-0.520644,1.437334,1.389407,0.504166,-0.19373,-0.216911,0.276634,...,-1.637257,-1.629962,-0.777634,-0.243216,-0.889151,-0.123148,-0.31321,0.247357,0.350899,-0.565542
D1_data,-0.791635,-0.767256,-0.638632,-0.532833,0.63465,2.593515,-1.082883,-0.999743,-2.066498,-1.0178,...,-0.896578,-0.45894,-2.261645,-1.935229,-1.100734,-1.19477,-1.60945,-1.628075,-1.297977,-1.591787
lolegra,-0.413047,-0.236059,-0.372508,-0.378522,-0.138778,-1.25623,0.454667,-0.508352,1.001604,-0.271993,...,-0.283782,0.076816,0.532231,0.828542,1.213716,0.070745,0.113387,0.020208,-0.075302,0.196882
stealsdeals12,-0.58849,1.278416,-0.523995,-0.481687,-0.352636,2.248229,-1.565252,-1.53757,-1.088621,0.061411,...,1.0663,0.29102,0.64254,0.201224,-0.730216,0.983894,0.525827,1.145986,0.274655,-0.679888
