# Analysis of polarization in retweets and replies

In [2]:
import os
import sys
sys.path.append('../../src')
PATH = '/home/piotr/projects/twitter'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json

### Get Tweet-level polarization for each of the Tweets:

In [None]:
# Estimate a polarization score for each tweet and persist the resulting
# {tweet id: score} mapping as JSON at PATH_SCORES.
tweets_polarization = dict()
PATH_SCORES = os.path.join(PATH, 'results/polarization/tweet_polarization.json')
for df in tqdm(read_window(PATH_DATA, window_size=7, batch_size=1,
                           day_from=START, day_to=END, dtype=dtypes, filter_fun=ff)):

    # fit vectorizer on all vocabulary:
    # NOTE(review): this prefit model is replaced by a fresh ModelPolarization
    # before estimate() is called, so its fitted state looks unused unless
    # prefit() stores something outside the instance -- confirm.
    model = ModelPolarization(parties=["gov", "opp"], limit=40, ngram_range=(1, 2), log=20)
    model.prefit(df["lemmatized"].astype(str).to_numpy())

    # Keep only the tweets of the 4th distinct day in the window, i.e. the
    # middle day when the window holds the full 7 days requested above.
    date_range = sorted(pd.to_datetime(df.day.unique()))
    mid_date = date_range[3].date().strftime('%Y-%m-%d')
    # .copy() so the column assignments below write to an independent frame
    # rather than to a view of the window.
    df = df[df['day'] == mid_date].copy()

    # keep only unique across party and text:
    orig = df.drop_duplicates(subset=['lemmatized', 'source']).copy()

    # estimate:
    model = ModelPolarization(parties=["gov", "opp"], limit=10, ngram_range=(1, 2))
    est = model.estimate(orig['source'], orig['user-id_str'], orig['lemmatized'],
                         text_id=orig['id_str'], level='speech', leave_out=True)
    orig['pol'] = orig['id_str'].map(est)

    # map the polarization scores on non-unique tweets:
    # NOTE(review): duplicates were dropped on `lemmatized` but scores are
    # looked up by `full_text`; tweets whose raw text differs while the
    # lemmatized text matches will get no score here -- confirm intended.
    pol_dict = orig[['full_text', 'source', 'pol']].set_index(['full_text', 'source'])['pol'].to_dict()
    df['pol'] = df.set_index(['full_text', 'source']).index.map(pol_dict)

    # save: the original selected a non-existent 'Repliesid_str' column here,
    # which raised KeyError; the mapping is keyed by the tweet's `id_str`.
    tweets_polarization.update(df.set_index('id_str')['pol'].to_dict())

# Context manager so the output file is flushed and closed deterministically.
with open(PATH_SCORES, 'w') as scores_file:
    json.dump(tweets_polarization, scores_file)

## Retweets

### 1. Overall:

In [None]:
# Build `retweets`: one row per distinct retweeted text (per input file) with
# its polarization score `pol` and the number of times it occurs, `num`.
retweet_frames = []
for dat in tqdm(read_files(PATH_DATA, day_from=START, day_to=END, dtype=dtypes, filter_fun=ff), total=144):
    # .copy() so adding `pol` does not write into a slice of the file's frame.
    dat = dat[dat.retweet].copy()
    dat['pol'] = dat['id_str'].map(tweets_polarization)
    # The day-, week- and topic-level cells below group `retweets` by `day`
    # and `topic`, so carry those columns through the aggregation when the
    # input has them (presumably it does -- verify against the data files).
    # Note that groupby drops rows whose key columns are missing (e.g. no score).
    group_cols = [c for c in ('day', 'topic') if c in dat.columns]
    group_cols += ['source', 'full_text', 'pol']
    # The count column is named `num` because that is what every later cell reads.
    retweet_frames.append(dat.groupby(group_cols).size().reset_index(name='num'))

# Concatenate once instead of DataFrame.append per file (quadratic, and
# removed in pandas 2.0).
if retweet_frames:
    retweets = pd.concat(retweet_frames, ignore_index=True)
else:
    retweets = pd.DataFrame(columns=['source', 'full_text', 'pol', 'num'])

# Restrict to the two numeric columns: corr() over the text columns raises
# on pandas >= 2.0, where non-numeric columns are no longer silently dropped.
retweets[['num', 'pol']].corr()

In [None]:
# Scatter of partisanship score against retweet count; the count axis is
# logarithmic.
scatter_ax = retweets.plot.scatter(x='num', y='pol', figsize=(12, 8))
scatter_ax.set_xscale('log')
scatter_ax.set_xlabel('Number of retweets')
scatter_ax.set_ylabel('Partisanship')
plt.show()

### 2. Day-level

In [None]:
# Correlation between retweet count (`num`) and partisanship (`pol`),
# computed separately for every day and plotted over time.
# Only the two numeric columns are passed to corr(): on pandas >= 2.0 the
# text columns of `retweets` would otherwise make it raise.
correlations = retweets.groupby('day')[['num', 'pol']].corr().reset_index()
# Each day yields a 2x2 matrix; the `pol` row's `num` entry is the
# num-pol correlation for that day.
correlations[correlations['level_1'] == 'pol'].set_index('day')['num'].plot(rot=45, figsize=(12, 8))
plt.ylabel('Partisanship-popularity correlation')
plt.show()

### 3. Week-level:

In [None]:
# Correlation between retweet count (`num`) and partisanship (`pol`),
# computed separately for every ISO calendar week.
# Series.dt.week was removed in pandas 2.0; dt.isocalendar().week is its
# replacement. Cast to a plain int so the week number behaves like the old
# integer column when used as a group key and plot index (assumes every
# `day` value parses to a valid date).
retweets['week'] = pd.to_datetime(retweets['day']).dt.isocalendar().week.astype(int)
# Only the two numeric columns are passed to corr(): on pandas >= 2.0 the
# text columns of `retweets` would otherwise make it raise.
correlations = retweets.groupby('week')[['num', 'pol']].corr().reset_index()
# Each week yields a 2x2 matrix; the `pol` row's `num` entry is the
# num-pol correlation for that week.
correlations[correlations['level_1'] == 'pol'].set_index('week')['num'].plot(rot=45, figsize=(12, 8))
plt.ylabel('Partisanship-popularity correlation')
plt.show()

### 4. By topic:

In [None]:
# Daily correlation between retweet count (`num`) and partisanship (`pol`),
# computed separately for every topic and drawn as one panel per topic.
# Only the two numeric columns are passed to corr(): on pandas >= 2.0 the
# text columns of `retweets` would otherwise make it raise.
correlations = retweets.groupby(['day', 'topic'])[['num', 'pol']].corr().reset_index()
# Each (day, topic) group yields a 2x2 matrix; keep the `pol` row, whose
# `num` entry is the num-pol correlation.
correlations = correlations[correlations['level_2'] == 'pol']
# The 5x3 grid has room for at most 15 topics.
fig, ax = plt.subplots(5, 3, figsize=(16, 12), sharex=True, sharey=True)
ax = ax.ravel()
for i, (topic, topic_corr) in enumerate(correlations.groupby('topic')):
    ax[i].plot(topic_corr['day'], topic_corr['num'])
    ax[i].set_title(f'Topic : {topic}')
    ax[i].xaxis.set_tick_params(rotation=45)
    # Label only every 20th position to keep the date axis readable.
    ax[i].xaxis.set_ticks(np.arange(len(topic_corr['day']), step=20))
plt.tight_layout()
plt.show()

## Replies

In [None]:
# Collect every reply (tweet with a non-null `in_reply_to_status_id_str`)
# from the CSV files in PATH_DATA, then annotate each reply with the size of
# its thread (`n`) and its chronological position within it (`order`).
cols = ['created_at', 'full_text', 'id_str', 'user-id_str', 'in_reply_to_status_id_str']
reply_frames = []
for fname in tqdm([f for f in os.listdir(PATH_DATA) if '.csv' in f]):
    # Read everything as strings so the ids are not converted to numbers.
    dat = pd.read_csv(os.path.join(PATH_DATA, fname), index_col=0, dtype=str)
    reply_frames.append(dat.loc[dat['in_reply_to_status_id_str'].notna(), cols])

# Concatenate once instead of DataFrame.append per file (quadratic, and
# removed in pandas 2.0); fall back to an empty frame when no file matched.
if reply_frames:
    all_replies = pd.concat(reply_frames)
else:
    all_replies = pd.DataFrame(columns=cols)

# Number of replies each replied-to tweet received.
counts = all_replies.groupby('in_reply_to_status_id_str').size().to_dict()
all_replies['n'] = all_replies['in_reply_to_status_id_str'].map(counts)

# The original called pd.to_datetime(''), which is NaT for every row and made
# the date sort below a no-op; parse the tweet timestamp instead.
# Assumes `created_at` is in a format pd.to_datetime can infer -- TODO confirm.
all_replies['date'] = pd.to_datetime(all_replies['created_at'])

# 0-based position of each reply within its thread, oldest first.
all_replies = all_replies.sort_values(['in_reply_to_status_id_str', 'date'])
all_replies['order'] = all_replies.groupby(['in_reply_to_status_id_str']).cumcount()