In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw 2020 election tweets. Going by the file names, each CSV holds the
# tweets for one candidate's hashtag, so the `candidate` label is set to match
# the file it was read from.
trump_data = pd.read_csv('raw_data/2020_tweets/hashtag_donaldtrump.csv', lineterminator='\n')
biden_data = pd.read_csv('raw_data/2020_tweets/hashtag_joebiden.csv', lineterminator='\n')
trump_data['candidate'] = 'trump'
biden_data['candidate'] = 'biden'
data = pd.concat([trump_data, biden_data])

labeled_tweets = pd.read_csv('results/labeled_tweets_georgetown_model.csv')

# A tweet_id may occur more than once in the concatenated frame (for example if
# the same tweet is present in both hashtag files). Keep a single
# (tweet_id, user_id) row per tweet so the left merge only attaches user_id and
# never multiplies labeled rows.
tweet_authors = data[['tweet_id', 'user_id']].drop_duplicates(subset='tweet_id')
labeled_tweets = labeled_tweets.merge(tweet_authors, on='tweet_id', how='left')

In [3]:
# NOTE(review): this reassigns `labeled_tweets`, discarding the frame built from
# 'results/labeled_tweets_georgetown_model.csv' in the previous cell — confirm
# which model's labels the rest of the notebook is meant to use.
labeled_tweets = pd.read_csv('results/labeled_tweets_twitter_roberta.csv')

In [4]:
# Keep only tweets whose state appears in the election results table; state
# names are compared case-insensitively.
election_results = pd.read_csv('results/state_election_results.csv')

result_states = election_results['state'].str.lower()
has_result = labeled_tweets['state'].str.lower().isin(result_states)

labeled_tweets = labeled_tweets[has_result].copy()

In [5]:
# Battleground states, taken from
# https://ballotpedia.org/Presidential_battleground_states,_2020
swing_states = [
    'Arizona', 'Florida', 'Georgia', 'Iowa', 'Michigan',
    'Minnesota', 'Nevada', 'New Hampshire', 'North Carolina',
    'Ohio', 'Pennsylvania', 'Texas', 'Wisconsin',
]

# Number of labeled tweets per swing state.
in_swing_state = labeled_tweets['state'].isin(swing_states)
labeled_tweets.loc[in_swing_state, 'state'].value_counts()

state
Texas             14225
Florida           12025
Pennsylvania       6140
Georgia            4697
Ohio               4157
Nevada             3467
Arizona            3337
North Carolina     2329
Michigan           2088
Minnesota          1574
Wisconsin           991
Iowa                292
New Hampshire        50
Name: count, dtype: int64

In [10]:
# Show a fixed-seed random sample of 50 tweets with their labels, with pandas'
# row/column/width display limits lifted for this cell only.
tweet_sample = labeled_tweets.sample(50, replace=False, random_state=42)
unlimited_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', None,
)
with pd.option_context(*unlimited_display):
    display(tweet_sample[['tweet', 'label']])

Unnamed: 0,tweet,label
202477,"Latest 'Borat' footage appears to show star at the #WhiteHouse, meeting #Trump Jr. #TheHill https://t.co/0NdcwJCi5j",neutral
308346,#TrumpIsLosing #TrumpMeltdown #Trump #byeashy https://t.co/CfIbo9XNee,pro-Biden
57343,"#trunalimunumaprzure Mobilization crew shirt for 13$ inly, 10$ mask, 2$ pin, stickers and more now on sale for 3 days only.\nhttps://t.co/tKkRULYZhb\n #FartingTrunalimunumaprzure #Biden #Trump #VOTE #USA #JoeBiden #BidenHarrisToSaveAmerica #MAGA Christmas #DaylightSavings",neutral
66580,"Depending on what happens tomorrow I’ll either be getting really drunk, or I'll be getting really drunk. \n.\n.\n.\n.\n#VOTE #VoteResponsibly #VoteBiden #VoteHimOut2020 #VoteBidenHarrisToSaveAmerica #BidenHarris2020 #Biden #BidenHarris #Biden2020",neutral
277060,"@LabyrinthWeaver @realDonaldTrump @Twitter This is what he is the best at -&gt; lying and spreading misinformation.\n\nOh, almost forgot, #Trump is tremendously good at #StockMarket manipulation which is a federal crime! https://t.co/Ah0tCwQDli",pro-Biden
6868,"#RupertMurdoch supposedly predicts a blow-out win for #JoeBiden. Nonetheless, the @FoxNews magnate is pulling out all the stops for @realDonaldTrump--including the #HunterBidensUkraineScandal story in the @nypost--also owned by #Murdoch.\nhttps://t.co/DpBkXYxsUU",neutral
327757,What are the odds he starts a new confederacy just so he can be president of something? #Election2020results #Trump #TRUMPislosing #CivilWar2020 #politics,neutral
241126,The driver of this truck will probably get a medal of honor reserved for those who fought to keep this country a democracy. This is perverted justice. #Trump #TrumpTerriorism \nhttps://t.co/DRgrSyN6DZ,neutral
97411,"@bennyjohnson Based on remaining votes in #Pennsylvania math shows #Biden will over take Trump’s lead of 165k just from 2 counties (Philadelphia &amp; Allegheny) &amp; then surpass him from votes in Delaware, Buck &amp; Montgomery. Giving Biden 20 delegates, the Presidency &amp; the Blue Wall back to Dems",neutral
19646,#Biden didn't off #Soleimani. https://t.co/A1gFzEhtMv,neutral


In [11]:
# Collect the distinct states seen for each user and report the users that
# have more than one.
multiple_states = labeled_tweets.groupby('user_id')['state'].apply(
    lambda user_states: list(user_states.unique())
)
states_per_user = multiple_states.apply(len)
users_with_multiple_states = multiple_states[states_per_user > 1]

if users_with_multiple_states.empty:
    print("No users have multiple states.")
else:
    print(f'There are {len(users_with_multiple_states)} users with multiple states')
    for user_id, states in users_with_multiple_states.items():
        print(f"User ID: {user_id}, States: {states}")


There are 17 users with multiple states
User ID: 45721296.0, States: ['District of Columbia', 'New York']
User ID: 49378584.0, States: ['District of Columbia', 'Florida']
User ID: 606779465.0, States: ['California', 'Georgia']
User ID: 734215934.0, States: ['Texas', 'New York']
User ID: 1342861424.0, States: ['Montana', 'Texas']
User ID: 9.639733850500916e+17, States: ['Texas', 'California']
User ID: 1.0189151490029116e+18, States: ['New York', 'California']
User ID: 1.02138208905959e+18, States: ['California', 'Pennsylvania']
User ID: 1.1142462904210268e+18, States: ['District of Columbia', 'Maryland']
User ID: 1.1451557975146742e+18, States: ['Massachusetts', 'Washington']
User ID: 1.1562510488577352e+18, States: ['New York', 'District of Columbia']
User ID: 1.1798153295657738e+18, States: ['Utah', 'Pennsylvania']
User ID: 1.2438810740585062e+18, States: ['New York', 'District of Columbia']
User ID: 1.2870247078925435e+18, States: ['Utah', 'District of Columbia']
User ID: 1.298043766

In [12]:
# Re-weighting table: one row per forced stance, with
#   weight = population_proportion / user_proportion.
# Proportions come from
# https://www.pewresearch.org/internet/2019/04/24/sizing-up-twitter-users/
# NOTE(review): presumably user_proportion is the share among Twitter users and
# population_proportion the share among the general public — confirm against
# the report.
stance_shares = pd.DataFrame(
    [
        {'forced_stance': -1, 'user_proportion': 0.6, 'population_proportion': 0.52},
        {'forced_stance': 1, 'user_proportion': 0.35, 'population_proportion': 0.43},
        {'forced_stance': 0, 'user_proportion': 0.05, 'population_proportion': 0.05},
    ]
)

stance_weights = stance_shares.assign(
    weight=stance_shares['population_proportion'] / stance_shares['user_proportion']
)[['forced_stance', 'weight']]

In [13]:
# Shares used to break ties for users whose average stance is exactly neutral, from
# https://www.pewresearch.org/politics/2019/03/14/political-independents-who-they-are-what-they-think/
# The three values are paired, in order, with the stances [-1, 0, 1] below.
# NOTE(review): presumably 17 / 7 / 13 are the percentages of independents that
# lean one way / have no lean / lean the other way — confirm against the report.
proportions = np.array([17, 7, 13])
proportions = proportions / proportions.sum()

# Seeded generator so the random tie-breaking gives the same result on every
# Restart & Run All.
_forced_stance_rng = np.random.default_rng(42)


def generate_forced_stance(avg_stance, rng=None):
    """Collapse an average stance into a single stance in {-1, 0, 1}.

    Parameters
    ----------
    avg_stance : float
        Average stance value for one user.
    rng : numpy.random.Generator, optional
        Random generator used for the tie-break. Defaults to the module-level
        seeded generator.

    Returns
    -------
    int
        -1 if ``avg_stance`` is negative, 1 if it is positive. Any other value
        (0, and also NaN since both comparisons are false) is drawn at random
        from [-1, 0, 1] with probabilities ``proportions``.
    """
    if avg_stance < 0:
        return -1
    if avg_stance > 0:
        return 1
    generator = _forced_stance_rng if rng is None else rng
    return int(generator.choice([-1, 0, 1], p=proportions))

In [14]:
def _most_common_state(states):
    """Return the first value of ``states.mode()``."""
    return states.mode().iloc[0]


# One row per user: mean stance over the user's tweets plus the user's most
# frequent state.
tweets_by_user = labeled_tweets.groupby('user_id', as_index=False)
users_stances = tweets_by_user.agg(
    avg_stance=('stance', 'mean'),
    state=('state', _most_common_state),
)

# Reduce each mean stance to -1 / 0 / 1, then attach the weight for that stance.
users_stances = users_stances.assign(
    forced_stance=users_stances['avg_stance'].apply(generate_forced_stance)
).merge(stance_weights, on='forced_stance')

In [27]:
# Display the per-user stance table (long frames are rendered truncated).
users_stances

Unnamed: 0,user_id,avg_stance,state,forced_stance,weight
0,1.081000e+03,0.0,California,-1,0.866667
1,3.471000e+03,0.0,California,-1,0.866667
2,3.652000e+03,0.0,New York,-1,0.866667
3,4.358000e+03,0.0,North Carolina,-1,0.866667
4,1.002200e+04,-0.5,Missouri,-1,0.866667
...,...,...,...,...,...
42482,1.325429e+18,0.0,Massachusetts,1,1.228571
42483,1.325492e+18,1.0,District of Columbia,1,1.228571
42484,1.325521e+18,0.0,District of Columbia,1,1.228571
42485,1.325550e+18,0.0,Minnesota,1,1.228571


In [11]:
def weighted_avg(data, val, weight):
    """Return the weighted mean of column ``val`` using column ``weight``.

    Rows where either the value or the weight is missing are excluded from
    both the numerator and the denominator, so a missing value does not
    silently pull the mean towards zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.
    val : str
        Name of the column holding the values to average.
    weight : str
        Name of the column holding the weights.

    Returns
    -------
    float
        ``sum(val * weight) / sum(weight)`` over the valid rows, or NaN when
        the total weight is zero (including when there are no valid rows).
    """
    values = data[val]
    weights = data[weight]
    valid = values.notna() & weights.notna()

    total_weight = weights[valid].sum()
    if total_weight == 0:
        return float('nan')
    return (values[valid] * weights[valid]).sum() / total_weight


# Weighted mean of forced_stance per state, using each user's `weight` column.
users_by_state = users_stances.groupby('state')
weighted_by_state = users_by_state.apply(
    lambda state_users: weighted_avg(state_users, 'forced_stance', 'weight')
)
weighted_avg_forced = weighted_by_state.reset_index(name='weighted_avg_forced_stance')

In [16]:
# Distinct users per state.
user_counts = users_stances.groupby('state')['user_id'].nunique()

# Per-state summary: unweighted means of both stance columns, the user count,
# and the weighted forced-stance mean.
state_scores = (
    users_stances
    .groupby('state', as_index=False)
    .agg(
        average_avg_stance=('avg_stance', 'mean'),
        average_forced_stance=('forced_stance', 'mean'),
    )
    .merge(user_counts.rename('user_count'), on='state')
    .merge(weighted_avg_forced, on='state')
)

# Show the swing states only.
is_swing_state = state_scores['state'].isin(swing_states)
state_scores[is_swing_state]

Unnamed: 0,state,average_avg_stance,average_forced_stance,user_count,weighted_avg_forced_stance
2,Arizona,-0.137457,-0.228999,869,-0.046299
9,Florida,-0.150601,-0.234275,2655,-0.108036
10,Georgia,-0.200112,-0.309909,1423,-0.190408
15,Iowa,-0.252724,-0.412214,131,-0.229159
22,Michigan,-0.150722,-0.211409,596,-0.071028
23,Minnesota,-0.192139,-0.291105,371,-0.17004
28,Nevada,-0.13926,-0.239506,810,-0.07863
29,New Hampshire,-0.05119,-0.178571,28,0.189689
33,North Carolina,-0.163901,-0.220089,677,-0.041898
35,Ohio,-0.167936,-0.22142,831,-0.094611


In [30]:
def _stance_from_pb_factor(pb_factor):
    """Map a pb_factor to -1 (below -0.5), 1 (above 0.5) or 0 (anything else)."""
    if pb_factor < -0.5:
        return -1
    if pb_factor > 0.5:
        return 1
    return 0


columns_to_keep = [
    'pb_factor', 'user_id'
]

# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it comes from a trusted source.
reddit_raw = pd.read_pickle(
    'raw_data/factoid_reddit/reddit_corpus_unbalanced_filtered.gzip',
    compression='gzip',
)[columns_to_keep]

# Keep the user id and a discretized stance per row, with a fresh 0..n-1 index.
reddit_data = pd.DataFrame({
    "user_id": reddit_raw["user_id"],
    "stance": reddit_raw["pb_factor"].apply(_stance_from_pb_factor),
}).reset_index(drop=True)


# Mean stance per Reddit user. The groupby is evaluated once and its index
# (user_id) is turned back into a regular column, giving the columns
# ['user_id', 'stance'].
reddit_user_stances = (
    reddit_data
    .groupby('user_id')['stance']
    .mean()
    .reset_index()
)

In [31]:
# Overall means across all Twitter users, next to the mean over Reddit users.
national_avg_stance = users_stances['avg_stance'].mean()
national_forced_stance = users_stances['forced_stance'].mean()
reddit_avg_stance = reddit_user_stances['stance'].mean()

print(f"The average national average stance is {national_avg_stance}")
print(f"The average national forced stance is {national_forced_stance}")
print(f"The average stance from reddit data is {reddit_avg_stance}")

The average national average stance is -0.17213835387196919
The average national forced stance is -0.26994139383811516
The average stance from reddit data is -0.403855421686747


In [32]:
# Persist the per-state scores. The DataFrame index is written too, as an
# unnamed first column (to_csv default).
# NOTE(review): the file name says "georgetown_model", but an earlier cell loads
# 'results/labeled_tweets_twitter_roberta.csv' into `labeled_tweets` — confirm
# the scores are saved under the right model's name before overwriting.
state_scores.to_csv('results/state_scores_georgetown_model.csv')