# Bot Step 2

In [68]:
import json
import numpy as np
import pandas as pd
import pickle
import prince
import os

## Load in data

### All distinct accounts

In [2]:
def read_author_data(path='accounts.pkl'):
    """ Function that reads the pickled account list and prints some descriptive stats.

        Parameters
        ----------
        path : str, optional
            Location of the pickle file. Defaults to 'accounts.pkl' in the
            current working directory.

        Returns
        -------
        The unpickled object. The rest of the notebook indexes it as a sequence
        of tuples (e.g. ('RepFredUpton', '124224165', 'Rep. Fred Upton')).
    """
    # NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
    with open(path, 'rb') as f:
        author_data = pickle.load(f)

    print('We have {} accounts, for which we have user name, id, and screen name.'.format(len(author_data)))
    # Guard the example line: indexing [0] on an empty collection would raise.
    if len(author_data) > 0:
        print('Example: {}'.format(author_data[0]))

    return author_data

In [3]:
# Load the account tuples once; create_adjacency_matrix reads this module-level name.
author_data = read_author_data()

We have 5876 accounts, for which we have user name, id, and screen name.
Example: ('RepFredUpton', '124224165', 'Rep. Fred Upton')


### Followers for (a selection of) these accounts

In [4]:
def read_account_data(directory='following_lists'):
    """ Function that reads in the account data and returns this data in a dictionary.

        Every regular file in `directory` is unpickled. The dictionary key is the
        part of the file name before the first underscore.

        Parameters
        ----------
        directory : str, optional
            Folder holding the pickled files. Defaults to 'following_lists',
            resolved relative to the current working directory.

        Returns
        -------
        dict mapping the file-name prefix to the unpickled content of that file.
    """
    account_data = dict()

    # Build paths with os.path.join instead of chdir + a hard-coded '\\' separator:
    # this works on every OS and never leaves the process in another directory
    # when reading one of the files fails.
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which cannot be unpickled.
        if not os.path.isfile(path):
            continue
        account_id = filename.split('_')[0]
        # NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
        with open(path, 'rb') as f:
            account_data[account_id] = pickle.load(f)

    return account_data

In [5]:
# Load the pickled lists from 'following_lists'; create_adjacency_matrix reads this module-level name.
account_data = read_account_data()

## Create Adjacency Matrix

In [6]:
def create_adjacency_matrix(type='undirected', authors=None, followings=None):
    """ Function that creates an adjacency matrix with the respective type.

        Rows and columns are labelled with the ids found at position 1 of every
        author tuple. For each account `key` in `followings`, cell [key, id] is
        set to 1 for every listed id that is also an author id. When `type` is
        'undirected' the mirrored cell [id, key] is set as well.

        Parameters
        ----------
        type : str, optional
            'undirected' (default) produces a symmetric matrix; any other value
            only writes the [key, id] direction.
        authors : sequence of tuples, optional
            Account tuples whose element 1 is the id. Defaults to the
            module-level `author_data`.
        followings : dict, optional
            Maps an account id to an iterable of tuples whose element 1 is an
            id. Defaults to the module-level `account_data`.

        Returns
        -------
        pandas.DataFrame of 0/1 integers, indexed by author id on both axes.

        Notes
        -----
        Accounts in `followings` that are not among the author ids have no row
        in the matrix and are skipped. Author ids are assumed to be unique.
        A running count is printed after every 100 processed accounts.
    """
    if authors is None:
        authors = author_data
    if followings is None:
        followings = account_data

    ids = [author[1] for author in authors]
    position = {author_id: pos for pos, author_id in enumerate(ids)}

    # Fill a plain NumPy array and wrap it once at the end. The previous chained
    # assignment (adj_df[col].loc[row] = 1) is not guaranteed to write through to
    # the frame and costs one pandas lookup per edge.
    matrix = np.zeros((len(ids), len(ids)), dtype=np.int64)
    symmetric = (type == 'undirected')

    for count, (key, following) in enumerate(followings.items(), start=1):
        row = position.get(key)
        if row is not None:
            cols = sorted({position[entry[1]] for entry in following if entry[1] in position})
            if cols:
                matrix[row, cols] = 1
                if symmetric:
                    matrix[cols, row] = 1

        if count % 100 == 0:
            print(count)

    return pd.DataFrame(matrix, index=ids, columns=ids)

In [7]:
# Build the undirected matrix; the numbers printed below are progress counts (one per 100 accounts).
adj_df = create_adjacency_matrix(type='undirected')

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000


## Check whether we have zero columns and/or rows

In [13]:
def remove_null(adj_df):
    """ Function that removes null rows/columns from the adjacency matrix. These
        nulls correspond with authors that do not follow anyone and are not followed
        by anyone of interest.

        An account is removed only when BOTH its row and its column contain no
        non-zero entry. For a symmetric (undirected) matrix this is the same as
        testing the rows alone. For a directed matrix, an account with incoming
        links but no outgoing links is kept.

        Parameters
        ----------
        adj_df : pandas.DataFrame
            Square adjacency matrix whose rows and columns list the same
            accounts in the same order.

        Returns
        -------
        A new DataFrame without the isolated accounts; `adj_df` is not modified.

        Raises
        ------
        ValueError
            If `adj_df` is not square.
    """
    if adj_df.shape[0] != adj_df.shape[1]:
        raise ValueError('adj_df must be square, got shape {}.'.format(adj_df.shape))

    # Work on positions so row i and column i always refer to the same account,
    # and test the whole matrix at once instead of one .iloc row at a time.
    nonzero = adj_df.to_numpy() != 0
    isolated = ~(nonzero.any(axis=1) | nonzero.any(axis=0))

    print('The number of entries we delete is {}.'.format(int(isolated.sum())))

    keep = ~isolated
    return adj_df.iloc[keep, keep].copy()

In [14]:
# Drop the all-zero entries; the count printed below is how many were removed.
adj_df_nz = remove_null(adj_df)

The number of entries we delete is 76.


In [15]:
def write_to_file(filename, adj_df_nz):
    """ Save the adjacency matrix dataframe as a csv file at `filename`.

        The row labels are written as the first csv column. Nothing is returned.
    """
    adj_df_nz.to_csv(path_or_buf=filename)

In [16]:
# Persist the filtered matrix; the correspondence-analysis cell below reloads 'adj_matrix.csv'.
write_to_file('adj_matrix.csv', adj_df_nz)

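All columns that are purely 0 have corresponding rows that are 0 too. I.e., if someone was not followed by anyone in our data, this person did not follow anyone either. This holds by construction here, because the adjacency matrix was built as undirected and is therefore symmetric.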

## Correspondence Analysis

In [36]:
def corresp_analysis(df = None,
                     filename = '0',
                     n_components = 100,
                     n_iter = 100,
                     random_state = 42):
    """ Fit a correspondence analysis and return the first-dimension row coordinates.

        Parameters
        ----------
        df : pandas.DataFrame, optional
            Matrix to analyse. When omitted (and no `filename` is given) the
            module-level `adj_df_nz` is looked up at call time, so a re-computed
            matrix is picked up without re-defining this function.
        filename : str, optional
            Path of a csv file to load with its first column as index. The
            sentinel '0' (default) means "do not read a file"; any other value
            takes precedence over `df`.
        n_components, n_iter : int, optional
            Passed through to prince.CA (defaults keep the previous values of 100).
        random_state : int or None, optional
            Passed through to prince.CA so repeated runs can be reproduced.
            NOTE(review): assumes the installed prince version accepts
            `random_state` in the CA constructor -- confirm.

        Returns
        -------
        pandas.Series holding column 0 of `ca.row_coordinates(df)`.
    """
    if filename != '0':
        df = pd.read_csv(filename, index_col=0)
    elif df is None:
        # Resolve the fallback now rather than in the signature: a default of
        # `df = adj_df_nz` would be frozen at definition time.
        df = adj_df_nz

    ca = prince.CA(n_components=n_components, n_iter=n_iter, random_state=random_state)
    ca.fit(df)

    # The share is taken relative to the eigenvalues of the fitted components only.
    print('The first PC explains around {}% of variance.'.format(100*ca.eigenvalues_[0]/sum(ca.eigenvalues_)))

    pca_vals = ca.row_coordinates(df) # One can check that column PCA projs are exactly equal
    scores = pca_vals.iloc[:,0]

    return scores

In [37]:
# Fit the CA on the matrix saved in 'adj_matrix.csv'; the returned first-dimension coordinates are used as ideology scores.
ideology_scores = corresp_analysis(filename='adj_matrix.csv')

The first PC explains around 11.326051575094654% of variance.


#### Use scores to form quantiles

In [49]:
# Display the score series as a quick sanity check.
ideology_scores

124224165     0.844295
242926427     0.920977
234822928     1.275552
12788332      0.888146
52503751      1.109054
                ...   
325581008     0.113305
305818748    -0.672569
3021632183    0.216137
16796658     -0.051347
301628954     0.017839
Name: 0, Length: 5800, dtype: float64

In [83]:
def create_quantiles(scores, n_quantiles=7):
    """ Function that maps a series of ideology scores into 7 quantiles.

        The cut-offs are the k/n_quantiles quantiles of `scores` itself
        (k = 1 .. n_quantiles-1). A score below the first cut-off gets
        quantile 1, a score at or above the last cut-off gets `n_quantiles`;
        a score equal to a cut-off falls into the higher group.

        Parameters
        ----------
        scores : pandas.Series
            Scores indexed by author id.
        n_quantiles : int, optional
            Number of groups (default 7).

        Returns
        -------
        dict mapping each index label to {'score': float, 'quantile': int}.
        Values are plain Python numbers so the result can be passed to json.dump.
    """
    # Derive the thresholds from the argument, not from a notebook global, so the
    # function gives the right answer for whatever series it is handed.
    probabilities = [k / n_quantiles for k in range(1, n_quantiles)]
    cutoffs = scores.quantile(probabilities).to_numpy()

    # side='right' counts the cut-offs that are <= score, which reproduces the
    # strict "score < cut-off" comparisons of an if/elif ladder.
    groups = np.searchsorted(cutoffs, scores.to_numpy(), side='right') + 1

    mapping_dict = dict()
    for author_id, score, group in zip(scores.index.tolist(), scores.tolist(), groups):
        mapping_dict[author_id] = {'score': score, 'quantile': int(group)}

    return mapping_dict

In [84]:
# Assign every account a quantile based on its ideology score.
score_mapping = create_quantiles(ideology_scores)

In [86]:
# Save the score/quantile mapping as JSON for later use.
with open('score_mappings.json', 'w') as f:
    json.dump(score_mapping, f)

In [87]:
# Largest score in the series.
max(ideology_scores)

2.0879281970416743

In [102]:
# Scores strictly below the 1/7 quantile cut-off, in ascending order.
sorted(ideology_scores[ideology_scores < ideology_scores.quantile(1/7)])

[-1.0820533776136707,
 -1.050362092610834,
 -1.0452493473881972,
 -1.0341490409278897,
 -1.02235579357646,
 -1.0092769695877124,
 -1.0070327580297957,
 -0.9851217311596214,
 -0.9724527767974688,
 -0.970378021309772,
 -0.9688760323157876,
 -0.9654164471844978,
 -0.9635296381739429,
 -0.9614502556454599,
 -0.961222562899579,
 -0.9586945842651489,
 -0.9586330505869198,
 -0.9580831012546124,
 -0.9527451633726473,
 -0.9521806128141993,
 -0.9507011633713327,
 -0.9481903377445362,
 -0.944347266285102,
 -0.9435537433341835,
 -0.94339953738603,
 -0.9416820797849369,
 -0.9375683336361775,
 -0.934609190451277,
 -0.932873388929104,
 -0.9319118385255563,
 -0.9310582877998751,
 -0.9304236806220799,
 -0.9304086625191963,
 -0.9300694851722824,
 -0.9280397039949225,
 -0.9275519441941771,
 -0.9272740797576953,
 -0.9240718646232634,
 -0.921935123383956,
 -0.921539975117614,
 -0.9205524869466514,
 -0.9197455543241757,
 -0.9179816552435636,
 -0.9168251734757185,
 -0.9163541040440093,
 -0.9139195242906795,


In [113]:
# Accounts whose score is below -0.8.
ideology_scores[ideology_scores < -0.8]

16400248     -0.871776
1507338108   -0.867401
39008044     -0.898217
376364316    -1.082053
302881437    -0.806585
                ...   
45055696     -0.853299
21733692     -0.818844
512225400    -0.950701
2314055840   -0.803655
1604931252   -0.970378
Name: 0, Length: 185, dtype: float64