In [107]:
import pandas as pd
import numpy as np

In [None]:
biden_data = pd.read_csv('raw_data/2020_tweets/hashtag_donaldtrump.csv', lineterminator='\n')
trump_data = pd.read_csv('raw_data/2020_tweets/hashtag_joebiden.csv', lineterminator='\n')
biden_data['candidate'] = 'biden'
trump_data['candidate'] = 'trump'
data = pd.concat([trump_data, biden_data])

labeled_tweets = pd.read_csv('results/labeled_tweets.csv')

labeled_tweets = labeled_tweets.merge(data[['tweet_id', 'user_id']], on='tweet_id', how='left')

In [17]:
election_results = pd.read_csv('results/state_election_results.csv')

labeled_tweets['state_lower'] = labeled_tweets['state'].str.lower()
election_results['state_lower'] = election_results['state'].str.lower()

labeled_tweets = labeled_tweets[labeled_tweets['state_lower'].isin(election_results['state_lower'])]

labeled_tweets = labeled_tweets.drop(columns=['state_lower'])
election_results = election_results.drop(columns=['state_lower'])

In [18]:
# From https://ballotpedia.org/Presidential_battleground_states,_2020
swing_states = ['Arizona', 'Florida', 'Georgia', 'Iowa', 'Michigan',
                'Minnesota', 'Nevada', 'New Hampshire', 'North Carolina',
                'Ohio', 'Pennsylvania', 'Texas', 'Wisconsin']
labeled_tweets[labeled_tweets['state'].isin(swing_states)]['state'].value_counts()

Texas             17441
Florida           14886
Pennsylvania       7402
Georgia            5871
Ohio               4775
Nevada             4269
Arizona            3910
North Carolina     2800
Michigan           2399
Minnesota          1870
Wisconsin          1141
Iowa                336
New Hampshire        56
Name: state, dtype: int64

In [33]:
multiple_states = labeled_tweets.groupby('user_id')['state'].apply(lambda x: list(x.unique()))
users_with_multiple_states = multiple_states[multiple_states.apply(len) > 1]

if not users_with_multiple_states.empty:
    print(f'There are {len(users_with_multiple_states)} users with multiple states')
    for user_id, states in users_with_multiple_states.items():
        print(f"User ID: {user_id}, States: {states}")
else:
    print("No users have multiple states.")


There are 17 users with multiple states
User ID: 45721296.0, States: ['District of Columbia', 'New York']
User ID: 49378584.0, States: ['District of Columbia', 'Florida']
User ID: 606779465.0, States: ['California', 'Georgia']
User ID: 734215934.0, States: ['Texas', 'New York']
User ID: 1342861424.0, States: ['Montana', 'Texas']
User ID: 9.639733850500916e+17, States: ['Texas', 'California']
User ID: 1.0189151490029116e+18, States: ['New York', 'California']
User ID: 1.02138208905959e+18, States: ['California', 'Pennsylvania']
User ID: 1.1142462904210268e+18, States: ['District of Columbia', 'Maryland']
User ID: 1.1451557975146742e+18, States: ['Massachusetts', 'Washington']
User ID: 1.1562510488577352e+18, States: ['New York', 'District of Columbia']
User ID: 1.1798153295657738e+18, States: ['Utah', 'Pennsylvania']
User ID: 1.2438810740585062e+18, States: ['New York', 'District of Columbia']
User ID: 1.2870247078925435e+18, States: ['Utah', 'District of Columbia']
User ID: 1.298043766

In [None]:
# Using population vs twitter users from https://www.pewresearch.org/internet/2019/04/24/sizing-up-twitter-users/

stance_weights = pd.DataFrame({
    'forced_stance': [-1, 1, 0],
    'user_proportion': [0.6, 0.35, 0.05], 
    'population_proportion': [0.52, 0.43, 0.05]
    })

stance_weights['weight'] = stance_weights['population_proportion'] / stance_weights['user_proportion']
stance_weights = stance_weights[['forced_stance', 'weight']]

In [None]:
# independents alliance https://www.pewresearch.org/politics/2019/03/14/political-independents-who-they-are-what-they-think/
proportions = np.array([17, 7, 13])
proportions = np.divide(proportions, proportions.sum())

def generate_forced_stance(avg_stance):
    if avg_stance < 0:
        return -1
    elif avg_stance > 0:
        return 1
    else:
        return np.random.choice([-1, 0, 1], p=proportions)

In [116]:
average_stance = labeled_tweets.groupby('user_id')['stance'].mean()
forced_stance = labeled_tweets.groupby('user_id')['stance'].mean()
most_frequent_state = labeled_tweets.groupby('user_id')['state'].agg(lambda x: x.mode().iloc[0])

users_stances = pd.DataFrame({
    'user_id': average_stance.index,
    'avg_stance': average_stance.values, 
    'state': most_frequent_state.values
})
users_stances['forced_stance'] = users_stances['avg_stance'].apply(generate_forced_stance)
users_stances = users_stances.merge(stance_weights, on='forced_stance')

In [120]:
def weighted_avg(data, val, weight):
    return (data[val] * data[weight]).sum() / data[weight].sum()


weighted_avg_forced = (
    users_stances
    .groupby('state')
    .apply(lambda x: weighted_avg(x, 'forced_stance', 'weight'))
    .reset_index(name='weighted_avg_forced_stance')
)

In [121]:
state_scores = users_stances.groupby('state')['avg_stance'].mean()
state_scores_forced = users_stances.groupby('state')['forced_stance'].mean()
state_scores = pd.DataFrame({
    'state': state_scores.index,
    'average_avg_stance': state_scores.values,
    'average_forced_stance': state_scores_forced.values
})
state_scores = state_scores.merge(weighted_avg_forced, on='state')
state_scores[state_scores['state'].isin(swing_states)]

Unnamed: 0,state,average_avg_stance,average_forced_stance,weighted_avg_forced_stance
2,Arizona,-0.137457,-0.243959,-0.091578
9,Florida,-0.150601,-0.244821,-0.093163
10,Georgia,-0.200112,-0.326072,-0.183125
15,Iowa,-0.252724,-0.374046,-0.236185
22,Michigan,-0.150722,-0.256711,-0.108781
23,Minnesota,-0.192139,-0.334232,-0.195819
28,Nevada,-0.13926,-0.191358,-0.035517
29,New Hampshire,-0.05119,0.0,0.160912
33,North Carolina,-0.163901,-0.274742,-0.126192
35,Ohio,-0.167936,-0.258724,-0.107471


In [122]:
reddit_data = pd.read_pickle('raw_data/factoid_reddit/reddit_corpus_unbalanced_filtered.gzip', compression='gzip')
len(reddit_data)

4150

In [49]:
reddit_data = pd.read_pickle('raw_data/factoid_reddit/reddit_corpus_unbalanced_filtered.gzip', compression='gzip')

columns_to_keep = [
    'pb_factor', 'user_id'
]

reddit_data = reddit_data[columns_to_keep]
reddit_data = pd.DataFrame({
    "user_id": reddit_data["user_id"],
    "stance": reddit_data["pb_factor"].apply(lambda x: -1 if x < -0.5 else (1 if x > 0.5 else 0))
})
reddit_data = reddit_data.reset_index(drop=True)


reddit_user_stances = pd.DataFrame({
    'user_id' : reddit_data.groupby('user_id')['stance'].mean().index,
    'stance' : reddit_data.groupby('user_id')['stance'].mean().values
})

In [None]:
print(f"The average national average stance is {users_stances['avg_stance'].mean()}")
print(f"The average national forced stance is {users_stances['forced_stance'].mean()}")
print(f"The average stance from reddit data is {reddit_user_stances['stance'].mean()}")

The average national average stance is -0.1721383538719692
The average national forced stance is -0.2714242003436345
The weighted_average national forced stance is -0.2714242003436345
The average stance from reddit data is -0.403855421686747


In [124]:
state_scores.to_csv('results/state_scores_georgetown_model.csv')