In [15]:
import sys
import os
PATH = '/home/piotr/projects/twitter/'
sys.path.append('/home/piotr/projects/twitter/src')
import itertools
import operator
import pandas as pd
import pickle
import numpy as np
import json
from tqdm import tqdm, trange
from time import sleep
import gc
import os
import nltk
from dask import array as da
import re
import time
import itertools
from collections import OrderedDict
from sklearn.feature_extraction.text import TfidfVectorizer
from upsetplot import plot as setplot
from tweepy import API, AppAuthHandler, Cursor
from twitter_tools.scrapers import TwitterSampler
from twitter_tools.config import consumer_key, consumer_secret, access_token, access_secret
from matplotlib import pyplot as plt
import datetime

#### Seed profiles to obtain the polarized followers from:

In [24]:
gov_seed = ['@D_Tarczynski', '@BeataSzydlo', '@Macierewicz_A', 
              '@KrystPawlowicz', '@StKarczewski', '@MorawieckiM', 
              '@ZiobroPL', '@jbrudzinski', '@PatrykJaki', '@mblaszczak']
opp_seed = ['@SchetynadlaPO', '@bbudka', '@KLubnauer', '@Arlukowicz', 
              '@profGrodzki', '@RyszardPetru', '@trzaskowski_', 
              '@TomaszSiemoniak', '@Gasiuk_Pihowicz']

#### Export as table: 

In [33]:
tbl = pd.DataFrame(dict(source = ['government', 'opposition'], 
                        username = [", ".join(gov_seed), ", ".join(opp_seed)]))
pd.set_option('display.max_colwidth', 150)
open(os.path.join(PATH, 'final/tables/seed_profiles.tex'), 'w').write(tbl.to_latex(index=False, 
                                                                                   column_format='lp{1.8cm}p{5cm}'))
tbl.to_csv(os.path.join(PATH, 'final/tables/seed_profiles.csv'))

### Get followers of politician profiles
The method `getFollowers` from `FollowerScrapers` given a list of Twitter account names returns dictionary of the account names from the list associatied with list of IDs of their followers. 

#### Initialize the Twitter API

In [3]:
auth = AppAuthHandler(consumer_key = consumer_key, consumer_secret = consumer_secret)
api = API(auth)

**Get followers of government politicians profiles**

In [37]:
if os.path.isfile("../data/sample/government_followers.json"):
    government_followers = json.load(open("../data/sample/government_followers.json","r"))
else:
    government = [api.get_user(name).id for name in gov_seed]
    government_followers = scraper.getFollowers(government, "../data/sample/government_followers.json")
    json.dump(government, open("../data/sample/government_followers.json", 'w'))

**Get followers of opposition politicians profiles**

In [38]:
if os.path.isfile("../data/sample/opposition_followers.json"):
    opposition_followers = json.load(open("../data/sample/opposition_followers.json","r"))
else:
    opposition = [api.get_user(name).id for name in opp_seed]
    opposition_followers = scraper.getFollowers(opposition, "../data/sample/opposition_followers.json")
    json.dump(opposition_followers, open("../data/sample/opposition_followers.json", 'w'))

#### Look at the intersections:

In [5]:
def intersect_followers(followers_dict):
    profile_names = list(followers_dict.keys())
    followers_intersections = [pd.Series(True, index=list(elements), name=name)
                      for name, elements in followers_dict.items()] #set all values to true where id exists
    followers_intersections = pd.concat(followers_intersections,sort = False,axis = 1) #concatanate
    followers_intersections = followers_intersections.fillna(False).reset_index() #fill nas and put id to columns
    followers_intersections = followers_intersections.groupby(profile_names).count() #get counts
    return followers_intersections["index"]

In [None]:
opposition_followers = {k:set(v) for k,v in opposition_followers.items()}
setplot(intersect_followers(opposition_followers),show_percentages = True)
plt.show()

In [None]:
government_followers = {k:set(v) for k,v in government_followers.items()}
government_intersections = intersect_followers(government_followers)
setplot(government_intersections,show_percentages = True)
plt.show()
print(pd.DataFrame(government_intersections).sort_values(by = "index"))

### Narrow down the population to political partisans
The method `subsetFollowers` given two followers dictionaries such as the ones created by the `getFollowers` method returns two lists of followers, each that had at least $n$ overlap within one dictionary and at most $m$ overlap with the other. Using this method I filtered out two distinct populations of users - *opposition partisans* following at least 6 opposition politicians and at most 4 government politicians and *government partisans* following at least 6 government politicians and and most.

In [47]:
if os.path.isfile("../data/sample/gov_partisans.p") and os.path.isfile("../data/sample/opp_partisans.p"):
    gov_partisans = pickle.load(open("../data/sample/gov_partisans.p","rb"))
    opp_partisans = pickle.load(open("../data/sample/opp_partisans.p","rb"))
else:
    gov_partisans, opp_partisans = scraper.subsetFollowers(government_followers, opposition_followers, 6, 4)
    pickle.dump(gov_partisans,open("../data/sample/gov_partisans.p","wb"))
    pickle.dump(opp_partisans,open("../data/sample/opp_partisans.p","wb"))

The size of these groups:

In [48]:
print("{} opposition partisans and {} government partisans were filtered".format(len(gov_partisans),len(opp_partisans)))

35422 opposition partisans and 26465 government partisans were filtered


As expected, there's no overlap between the two groups:

In [49]:
set(gov_partisans).intersection(set(opp_partisans))

set()

### Get population info

In [148]:
if os.path.isfile("../data/sample/gov_partisans_info.csv"):
    gov_followers_info = pd.read_csv("../data/sample/gov_partisans_info.csv", index_col = 0)
else:
    scraper.getFollowersData(gov_partisans, "../data/sample/gov_partisans_info.csv")

In [149]:
if os.path.isfile("../data/sample/opp_partisans_info.csv"):
    opp_followers_info = pd.read_csv("../data/sample/opp_partisans_info.csv", index_col = 0)
else:
    scraper.getFollowersData(opp_partisans, "../data/sample/opp_partisans_info.csv")

### Get samples:
Given the above partisans, I sampled 5000 from each group that had tweeted at least once since the 1st of March.

In [153]:
def filter_date(df, date):
    
    #filter out examples with erroneous date record:
    match_date = "[A-Z][a-z]{2} [A-Z][a-z]{2} \d{2} \d{2}:\d{2}:\d{2} \+\d{4} \d{4}"
    index_good = df["status-created_at"].astype(str).apply(lambda x: re.search(match_date, x) != None)
    df = df[index_good]
    
    #parse date
    dates = pd.to_datetime(df['status-created_at'], format = "%a %b %d %H:%M:%S +0000 %Y")
    
    #return date above limit
    return df["id_str"][dates > date].tolist()

In [154]:
if os.path.isfile("../data/sample/opp_sample.pickle"):
    opp_sample = pickle.load(open("../data/sample/opp_sample.pickle", "rb"))
else:
    opp_sample = filter_date(opp_followers_info, datetime.datetime.strptime("01/03/2020", "%d/%m/%Y"))
    opp_sample = opp_sample[:5000]
    pickle.dump(opp_sample, open("../data/sample/opp_sample.pickle","wb"))

In [156]:
if os.path.isfile("../data/sample/gov_sample.pickle"):
    gov_sample = pickle.load(open("../data/sample/opp_sample.pickle", "rb"))
else:
    
    gov_sample = filter_date(gov_followers_info, datetime.datetime.strptime("01/03/2020", "%d/%m/%Y"))
    gov_sample = gov_sample[:5000]
    pickle.dump(gov_sample, open("../data/sample/gov_sample.pickle","wb"))

### Trash

In [26]:
#function to filter out elements of superset that occur in at most/least n subsets
def how_many(superset,subsets):
    superset = np.array(superset)[:,np.newaxis] #convert to Nx1 nparray
    superset = da.from_array(superset, chunks = (2000,1)) #convert to dask
    count = np.zeros(superset.shape[0]).astype("int8")
    for subset in tqdm(subsets):
        tmp = da.from_array(subset)
        count += (superset == tmp).sum(axis = 1).compute().astype("int8")
        gc.collect()
    return count

government_superset = list(set(itertools.chain.from_iterable(government_followers.values())))
government_subsets = list(government_followers.values())
opposition_superset = list(set(itertools.chain.from_iterable(opposition_followers.values())))
opposition_subsets = list(opposition_followers.values())


gov_in_gov = how_many(government_superset, government_subsets) #how many gov profiles followed by each gov follower
gov_in_opp = how_many(government_superset, opposition_subsets) #how many opp profiles followed by each gov follower
opp_in_opp = how_many(opposition_superset, opposition_subsets) #how many opp profiles followed by each opp follower
opp_in_gov = how_many(opposition_superset, government_subsets) #how many gov profiles followed by each opp follower
pickle.dump([gov_in_gov,gov_in_opp,opp_in_opp,opp_in_gov], open("overlaps.pickle","wb"))

    
"""    
gov_partisans = np.array(government_superset)[(gov_in_gov >= 6) & (gov_in_opp <= 4)].tolist()
opp_partisans = np.array(opposition_superset)[(opp_in_opp >= 6) & (opp_in_gov <= 4)].tolist()


pickle.dump(gov_partisans, open("data/gov_partisans.pickle","wb"))
pickle.dump(opp_partisans, open("data/opp_partisans.pickle","wb"))


#no overlap
set(gov_partisans).intersection(set(opp_partisans))
"""

100%|██████████| 10/10 [12:00<00:00, 72.09s/it]
100%|██████████| 10/10 [10:46<00:00, 64.67s/it]
100%|██████████| 10/10 [10:41<00:00, 64.13s/it]
100%|██████████| 10/10 [12:11<00:00, 73.17s/it]


'    \ngov_partisans = np.array(government_superset)[(gov_in_gov >= 6) & (gov_in_opp <= 4)].tolist()\nopp_partisans = np.array(opposition_superset)[(opp_in_opp >= 6) & (opp_in_gov <= 4)].tolist()\n\n\npickle.dump(gov_partisans, open("data/gov_partisans.pickle","wb"))\npickle.dump(opp_partisans, open("data/opp_partisans.pickle","wb"))\n\n\n#no overlap\nset(gov_partisans).intersection(set(opp_partisans))\n'

In [27]:
gov_partisans = np.array(government_superset)[(gov_in_gov > gov_in_opp)].tolist()
opp_partisans = np.array(opposition_superset)[(opp_in_opp > opp_in_gov)].tolist()