In [2]:
import gzip
import json
from datetime import datetime

import jsonlines
import pandas as pd

with jsonlines.Reader(gzip.open('../data/politicians_tweetdata_long.jsonl.gz')) as reader:
    raw_politician_tweets = list(reader)


def parse_twitter_datetime(dt: str):
    return datetime.strptime(dt, '%a %b %d %H:%M:%S +0000 %Y')


tweet_df = pd.DataFrame(data={
    'tweet': [t['full_text'] for t in raw_politician_tweets],
    'author': [t['user']['screen_name'] for t in raw_politician_tweets],
    'date': [parse_twitter_datetime(t['created_at']) for t in raw_politician_tweets],
    'id': [t['id'] for t in raw_politician_tweets]
})
tweet_df

Unnamed: 0,tweet,author,date,id
0,"Madame Vice President, Americans are watching....",MarshaBlackburn,2021-04-22 23:57:20,1385382242751500293
1,"RT @thehill: Sen. @MarshaBlackburn: ""Can you e...",MarshaBlackburn,2021-04-22 23:49:43,1385380324495044613
2,RT @SaraCarterDC: Marsha Blackburn: We need re...,MarshaBlackburn,2021-04-22 23:08:39,1385369992238092288
3,Mainstream media is falling in line with the D...,MarshaBlackburn,2021-04-22 20:43:49,1385333542041784322
4,Democrats want to pack the Supreme Court so th...,MarshaBlackburn,2021-04-22 17:03:59,1385278218186498048
...,...,...,...,...
192759,No authorization for war with Iran:\n• 2001 AU...,justinamash,2020-01-03 16:52:53,1213141191845388288
192760,There’s a reason our Constitution grants Congr...,justinamash,2020-01-03 03:40:45,1212941844558032898
192761,Happy New Year!\n\nMay 2020 be a year of peace...,justinamash,2020-01-01 05:00:00,1212237013040140289
192762,One of the best decisions I made in 2019 was l...,justinamash,2020-01-01 00:25:12,1212167856831508484


Downsample the rows because otherwise there is way too much data

In [3]:
tweet_df = tweet_df.sample(50000, random_state=0)
tweet_df

Unnamed: 0,tweet,author,date,id
76588,"It's hard to believe this has to be said, but ...",JoeBiden,2020-06-04 22:00:00,1268663821998206978
61636,@BernieSanders I understand your support for t...,JoinRocky,2020-01-22 19:27:30,1220065470646108162
53157,I am about to testify before @RulesDemocrats i...,LeaderHoyer,2020-05-14 15:19:19,1260952839851048962
84794,It is my pleasure to join you this Wed Jul 1. ...,DonBeyerVA,2020-06-29 20:10:46,1277696029291405322
102510,MAKE MICHIGAN AND AMERICA GREAT AGAIN! 🇺🇸 http...,Mike_Pence,2020-11-03 07:08:30,1323522425204563969
...,...,...,...,...
135217,The coronavirus (COVID-19) public health emerg...,gracenapolitano,2020-03-25 01:07:31,1242619085680230400
96173,We all have to take steps to make sure that we...,michaelcburgess,2021-01-28 22:37:00,1354921444392710149
25149,"Dems want you to have the 2k, the House GOP do...",RepSchakowsky,2020-12-24 21:41:05,1342223796649005063
64489,(2/2) Durante casi cuatro años he trabajado si...,MarioDB,2021-04-20 18:34:42,1384576271913467911


In [4]:
with open('../data/politicians.json') as file:
    raw_politicians = json.load(file)

politician_handles = [
    {
        'handle': handle['handle'],
        'handle_type': handle['name'],
        'name': p['name'],
        'party': p['party'],
        'lgbt_stance': p['vote_match'][2]
    }
    for p in raw_politicians for handle in p['twitters']
]

politician_df = pd.DataFrame.from_dict(politician_handles)
politician_df = politician_df[politician_df['handle_type'] != 'Don Beyer Twitter feed']
politician_df

Unnamed: 0,handle,handle_type,name,party,lgbt_stance
0,MarshaBlackburn,Official Twitter,Marsha Blackburn,Republican,0
1,pattymurray,Official Twitter,Patty Murray,Democratic,4
3,PeteSessions,Official Twitter,Pete Sessions,Republican,0
4,SessionsTX17,Campaign Twitter,Pete Sessions,Republican,0
5,KamalaHarris,Campaign Twitter,Kamala Harris,Democratic,4
...,...,...,...,...,...
198,wydenfororegon,Campaign Twitter,Ron Wyden,Democratic,4
199,joesestak,Campaign Twitter,Joe Sestak,Democratic,4
200,amashoffice,Official Twitter,Justin Amash,Libertarian,0
201,teamamash,Campaign Twitter,Justin Amash,Libertarian,0


In [5]:
tweet_df = tweet_df.merge(politician_df, left_on='author', right_on='handle')

In [6]:
import string
import nltk
import numpy as np
from nltk import TweetTokenizer, WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# Build stopword set
stop = set()
stop.update(nltk.corpus.stopwords.words('english'))
stop.update(nltk.corpus.stopwords.words('spanish'))
stop.update(string.punctuation)

# Some twitter-specific ones
stop.update(['rt', '…', '—', 'u'])


def sublist_replacement(original, old, new):
    """Replaces a sublist with another list."""
    output = []

    def is_match(i):
        if len(original) - i < len(old):
            return False
        for j in range(len(old)):
            if original[i + j] != old[j]:
                return False
        return True

    i = 0
    while i < len(original):
        if is_match(i):
            output.extend(new)
            i += len(old)
        else:
            output.append(original[i])
            i += 1

    return output


sublist_replacement(
    ['foo', 'bar', 'spam', 'memes', 'foo', 'bar', 'spam', 'memes'],
    ['bar', 'spam', 'memes'],
    ['eggs', 'ham']
)

[nltk_data] Downloading package stopwords to /home/astrid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/astrid/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['foo', 'eggs', 'ham', 'foo', 'eggs', 'ham']

In [10]:
tweets = tweet_df['tweet']
tweets[tweets.str.find('biolo') >= 0]

5368     The #COVID19 pandemic has underscored why basi...
5451     #COVID19 evolved naturally, but what if the ne...
8708     RT @ASMicrobiology: The U.S. lags far behind o...
8709     RT @ASMicrobiology: The U.S. lags far behind o...
11627    Dr. @AyanaEliza Johnson is the co-author of th...
14485    We don’t want biological males in women’s shel...
14788    Is this the #CCP entering into biological warf...
28189    As a result of this ruling, women could be for...
33424    I love my mom more than anything—and like many...
38358    Texas Wesleyan alumna Diana Gerrard had been r...
40277    Rep. Langevin pushes back on the Administratio...
41506    Happy Mother's Day to all you wonderful biolog...
Name: tweet, dtype: object

In [18]:
tweets = tweet_df['tweet']
tweets[tweets.str.find('sports') >= 0]

4242     Each week of #BlackHistoryMonth, I am featurin...
10171    RT @CountyofLA: #ICYMI:\n-All LA County multi-...
10942    Today, I signed an executive order that allows...
11389    ICYMI: Yesterday, @RepSires and I called for f...
11570    This morning @GlobeBobRyan and I are talking a...
12077    With the Celtics beginning their playoff run, ...
12426    No abuse in sport should be tolerated. The pro...
13621    New York State is ready and willing to partner...
13670    Baseball is coming back!\n\nThe @Yankees &amp;...
14485    We don’t want biological males in women’s shel...
15628    @seattlestorm Sports story of December 12.  Co...
17669    Tom Seaver was one of the greatest pitchers of...
21579    This #WomensHistoryMonth, I remember the first...
22369    My heart is completely broken for Vanessa, the...
24878    Thankfully, the GOP doesn't control the House ...
28306    RT @jmartNYT: He looked directly into the eyes...
28842    Sid Hartman was a friend of my dad’s and an ic.

In [19]:
tweets = tweet_df['tweet']
tweets[tweets.str.lower().str.find('lgbt') >= 0]

4        How will I ensure the rights of LGBTQ+ people ...
48       I believe LGBTQ+ rights are human rights — and...
60       For all the hard-won progress we've made in th...
220      RT @POTUS: The Equality Act provides long over...
414      Health care is on the line.\nWorkers' rights a...
                               ...                        
40233    Beyond pleased to see #SCOTUS monumental decis...
41050    I said it in 1996 as one of the first Senators...
41196    Exciting news — the @NatlParkService has offic...
41240    Standing up to bigots like @mtgreenee and figh...
41861    RT @lgbtqdems: We are thrilled to endorse the ...
Name: tweet, Length: 218, dtype: object

In [16]:
tweets = tweet_df['tweet']
tweets[tweets.str.lower().str.find('transgender') >= 0]

188      We cannot be silent in the face of rising viol...
528      In light of Donald Trump's inhumane reversal o...
3196     No American should be barred from serving thei...
3212     Transgender people have the right to be seen a...
3428     On #TransDayOfRemembrance, we commemorate the ...
3919     RT @DeanObeidallah: WOW: Rep. Al Green (D-TX)a...
5280     Being trans shouldn’t mean living w/ a target ...
5463     Anyone willing to put on the uniform and serve...
5772     It was *always* wrong to fire a person simply ...
6035     2020 has been the most deadly year for trans a...
6266     “When an employer fires an employee for being ...
6551     @WillFiteForYou @AnonymousStaffr As you can se...
7190     On today's #TransgenderDayOfRemembrance we rei...
7665     Today on International Day of Transgender Visi...
8946     I am deeply saddened by Aimee’s passing and my...
8947     I am deeply saddened by Aimee’s passing and my...
10732    Today on Transgender Day of Remembrance, we li.