## Analyze tweets for PFTP


#### Run this cell to connect to your GIS and get started:

In [1]:
from arcgis.gis import GIS
from arcgis.features import FeatureLayer
# Connect to the ArcGIS Online organization. No password is passed here, so it
# is requested interactively (see the "Enter password" prompt in the output).
# NOTE(review): the username is hard-coded; change it before running as another user.
gis = GIS("https://ourcommunity.maps.arcgis.com", "MMajumdar_ourcommunity")

Enter password: ········


#### Now you are ready to start!

In [2]:
# Look up the tweets item by its item id and open its layer at index 0.
tweets_item = gis.content.get('e441d3f5e13b412299418034d1fb0eba')
tweets_layer = FeatureLayer(tweets_item.url+'/0')
# Request only the attribute fields that the rest of the notebook uses.
tweets_features = tweets_layer.query(out_fields='tweet_text, hashtags, screen_name, state, party, created_at')
# Convert the query result to a DataFrame through its `sdf` property.
tweets_df = tweets_features.sdf
# Display (rows, columns) as a quick sanity check of the download.
tweets_df.shape

(88301, 7)

In [3]:
tweets_df.head()

Unnamed: 0,ObjectId,created_at,hashtags,party,screen_name,state,tweet_text
0,830627,2020-10-05 16:14:51,,R,mnicholson_BU13,Louisiana,Warming my team up in the cages before games o...
1,830628,2020-10-05 16:14:51,,R,RonaldinhoG6,Louisiana,@Bluegrenades Tbh he could have made a differe...
2,830629,2020-10-05 16:14:51,,R,just_a_roach,Louisiana,IF YOU CANT TELL IM FUCKING LIVID. IT IS SO DI...
3,830630,2020-10-05 16:14:51,,R,ZeteticAdvocate,Louisiana,@xavierbonilla87 The problem with Biden on thi...
4,830631,2020-10-05 16:14:51,,R,Syllba,Louisiana,I’ve still not recovered from this.


### Convert state abbreviations to state names

In [4]:
# Lookup table from two-letter abbreviation to full name for the 50 US states
# plus the District of Columbia (51 entries). Read by `replace_states`.
us_state_abbrev = {'AL':'Alabama',
    'AK':'Alaska',
    'AZ':'Arizona',
    'AR':'Arkansas',
    'CA':'California',
    'CO':'Colorado',
    'CT':'Connecticut',
    'DE':'Delaware',
    'DC':'District of Columbia',
    'FL':'Florida',
    'GA':'Georgia',
    'HI':'Hawaii',
    'ID':'Idaho',
    'IL':'Illinois',
    'IN':'Indiana',
    'IA':'Iowa',
    'KS':'Kansas',
    'KY':'Kentucky',
    'LA':'Louisiana',
    'ME':'Maine',
    'MD':'Maryland',
    'MA':'Massachusetts',
    'MI':'Michigan',
    'MN':'Minnesota',
    'MS':'Mississippi',
    'MO':'Missouri',
    'MT':'Montana',
    'NE':'Nebraska',
    'NV':'Nevada',
    'NH':'New Hampshire',
    'NJ':'New Jersey',
    'NM':'New Mexico',
    'NY':'New York',
    'NC':'North Carolina',
    'ND':'North Dakota',
    'OH':'Ohio',
    'OK':'Oklahoma',
    'OR':'Oregon',
    'PA':'Pennsylvania',
    'RI':'Rhode Island',
    'SC':'South Carolina',
    'SD':'South Dakota',
    'TN':'Tennessee',
    'TX':'Texas',
    'UT':'Utah',
    'VT':'Vermont',
    'VA':'Virginia',
    'WA':'Washington',
    'WV':'West Virginia',
    'WI':'Wisconsin',
    'WY':'Wyoming'
}

In [5]:
def replace_states(value):
    '''Replace a two-letter state abbreviation with the full state name.

    Parameters
    ----------
    value : object
        A cell from the ``state`` column. Usually a string holding either a
        two-letter abbreviation or an already spelled-out state name; missing
        values (``None`` / ``NaN``) are tolerated.

    Returns
    -------
    object
        The full state name when ``value`` is a two-letter key of
        ``us_state_abbrev``; otherwise ``value`` unchanged.
    '''
    # Missing values are not strings and have no len(); pass them through
    # instead of raising TypeError in the middle of a column-wide apply().
    if not isinstance(value, str):
        return value
    if len(value) == 2:
        # Unknown two-letter values (e.g. territories not in the table) are
        # kept as-is rather than raising KeyError.
        return us_state_abbrev.get(value, value)
    return value

In [6]:
# Normalize the state column so abbreviations and spelled-out names share one
# form. This overwrites the column in place on `tweets_df`.
tweets_df['state'] = tweets_df['state'].apply(replace_states)
tweets_df.head()

Unnamed: 0,ObjectId,created_at,hashtags,party,screen_name,state,tweet_text
0,830627,2020-10-05 16:14:51,,R,mnicholson_BU13,Louisiana,Warming my team up in the cages before games o...
1,830628,2020-10-05 16:14:51,,R,RonaldinhoG6,Louisiana,@Bluegrenades Tbh he could have made a differe...
2,830629,2020-10-05 16:14:51,,R,just_a_roach,Louisiana,IF YOU CANT TELL IM FUCKING LIVID. IT IS SO DI...
3,830630,2020-10-05 16:14:51,,R,ZeteticAdvocate,Louisiana,@xavierbonilla87 The problem with Biden on thi...
4,830631,2020-10-05 16:14:51,,R,Syllba,Louisiana,I’ve still not recovered from this.


## 1. Extracting mentions for tweets

In [7]:
# Capture every @handle in the tweet text: an '@' that is not preceded by
# another '@' or a word character, followed by 1-25 word characters (only the
# handle itself is captured). The handles found in one tweet are joined with
# commas; a tweet with no mentions yields an empty string.
tweets_df['mention'] = tweets_df['tweet_text'].str.findall(r'(?<![@\w])@(\w{1,25})').apply(','.join)
tweets_df['mention'].head()

0                   
1       Bluegrenades
2                   
3    xavierbonilla87
4                   
Name: mention, dtype: object

## 2. Extract tokens for each tweet

In [8]:
from nltk.tokenize import word_tokenize
import pandas as pd

import nltk
# Fetch the NLTK 'punkt' data package (presumably required by word_tokenize);
# the log below shows this is a no-op when the package is already up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manu9321\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Split each tweet into a list of word/punctuation tokens. As the output
# shows, '@' and other punctuation become separate tokens at this stage.
tweets_df['tokens'] = tweets_df['tweet_text'].apply(word_tokenize)
tweets_df['tokens'].head()

0    [Warming, my, team, up, in, the, cages, before...
1    [@, Bluegrenades, Tbh, he, could, have, made, ...
2    [IF, YOU, CANT, TELL, IM, FUCKING, LIVID, ., I...
3    [@, xavierbonilla87, The, problem, with, Biden...
4     [I, ’, ve, still, not, recovered, from, this, .]
Name: tokens, dtype: object

In [10]:
# Fetch the NLTK 'stopwords' corpus that the next cell reads via stopwords.words().
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manu9321\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords
# Keep the English stopwords in a set for fast membership tests.
# Note that `clean_tokens` adds extra entries to this same set when it runs.
english_stopwords = set(stopwords.words('english'))

In [12]:
def clean_tokens(tokens):
    '''Drop punctuation-like tokens and stopwords from a token list.

    A token is kept only if it is purely alphanumeric and its lower-cased
    form is not in ``english_stopwords``. Kept tokens retain their original
    casing and order.

    Side effect: 'http', 'https' and 'amp' are added to the shared
    ``english_stopwords`` set on every call.
    '''
    # URL schemes and the HTML '&amp;' residue carry no topical meaning.
    english_stopwords.update(('http', 'https', 'amp'))

    kept = []
    for tok in tokens:
        if not tok.isalnum():
            continue
        if tok.lower() in english_stopwords:
            continue
        kept.append(tok)
    return kept

In [13]:
# Filter each tweet's token list down to alphanumeric, non-stopword tokens.
tweets_df['cleaned_tokens'] = tweets_df['tokens'].apply(clean_tokens)
tweets_df['cleaned_tokens'].head()

0    [Warming, team, cages, games, Saturday, heard,...
1    [Bluegrenades, Tbh, could, made, difference, r...
2    [CANT, TELL, IM, FUCKING, LIVID, DISGUSTING, P...
3    [xavierbonilla87, problem, Biden, issue, likel...
4                                   [still, recovered]
Name: cleaned_tokens, dtype: object

In [14]:
tweets_df.head()

Unnamed: 0,ObjectId,created_at,hashtags,party,screen_name,state,tweet_text,mention,tokens,cleaned_tokens
0,830627,2020-10-05 16:14:51,,R,mnicholson_BU13,Louisiana,Warming my team up in the cages before games o...,,"[Warming, my, team, up, in, the, cages, before...","[Warming, team, cages, games, Saturday, heard,..."
1,830628,2020-10-05 16:14:51,,R,RonaldinhoG6,Louisiana,@Bluegrenades Tbh he could have made a differe...,Bluegrenades,"[@, Bluegrenades, Tbh, he, could, have, made, ...","[Bluegrenades, Tbh, could, made, difference, r..."
2,830629,2020-10-05 16:14:51,,R,just_a_roach,Louisiana,IF YOU CANT TELL IM FUCKING LIVID. IT IS SO DI...,,"[IF, YOU, CANT, TELL, IM, FUCKING, LIVID, ., I...","[CANT, TELL, IM, FUCKING, LIVID, DISGUSTING, P..."
3,830630,2020-10-05 16:14:51,,R,ZeteticAdvocate,Louisiana,@xavierbonilla87 The problem with Biden on thi...,xavierbonilla87,"[@, xavierbonilla87, The, problem, with, Biden...","[xavierbonilla87, problem, Biden, issue, likel..."
4,830631,2020-10-05 16:14:51,,R,Syllba,Louisiana,I’ve still not recovered from this.,,"[I, ’, ve, still, not, recovered, from, this, .]","[still, recovered]"


## 3. Get stats by state

In [15]:
# Load the US states feature layer straight into a DataFrame (as_df=True).
# NOTE(review): `states_df` is only displayed in the visible part of this
# notebook; confirm whether a later join with the tweet stats was intended.
states_url = 'https://services1.arcgis.com/99lidPhWCzftIe9K/arcgis/rest/services/USStates/FeatureServer/0'
states_layer = FeatureLayer(states_url)
states_df = states_layer.query(as_df=True)
states_df.shape

(51, 8)

In [16]:
states_df.head()

Unnamed: 0,OBJECTID,OBJECTID_1,SHAPE,STATE_ABBR,STATE_FIPS,STATE_NAME,Shape__Area,Shape__Length
0,1,1,"{""rings"": [[[-9848795.1956, 3749386.4234], [-9...",AL,1,Alabama,189968400000.0,2506745.0
1,2,2,"{""rings"": [[[-16280479.1337, 11068714.8191], [...",AK,2,Alaska,4864666000000.0,48278160.0
2,3,3,"{""rings"": [[[-12781025.0556, 3828631.1485], [-...",AZ,4,Arizona,434110700000.0,2907332.0
3,4,4,"{""rings"": [[[-10531500.4946, 4369567.5414], [-...",AR,5,Arkansas,205329000000.0,2652242.0
4,5,5,"{""rings"": [[[-13829195.0781, 4967239.458], [-1...",CA,6,California,649635800000.0,6856510.0


In [17]:
# Keep only the columns that are aggregated per state below.
filtered_tweets = tweets_df[['state', 'party', 'mention', 'hashtags', 'cleaned_tokens']]
filtered_tweets.head()

Unnamed: 0,state,party,mention,hashtags,cleaned_tokens
0,Louisiana,R,,,"[Warming, team, cages, games, Saturday, heard,..."
1,Louisiana,R,Bluegrenades,,"[Bluegrenades, Tbh, could, made, difference, r..."
2,Louisiana,R,,,"[CANT, TELL, IM, FUCKING, LIVID, DISGUSTING, P..."
3,Louisiana,R,xavierbonilla87,,"[xavierbonilla87, problem, Biden, issue, likel..."
4,Louisiana,R,,,"[still, recovered]"


In [18]:
# One-hot encode `party` into party_<value> indicator columns so that a
# per-state sum gives the number of tweets per party.
# This rebinds `filtered_tweets`, so the cell is not idempotent: re-running it
# without the previous cell fails because the `party` column is already gone.
filtered_tweets = pd.get_dummies(filtered_tweets, columns=['party'], prefix=['party'])
filtered_tweets.head()

Unnamed: 0,state,mention,hashtags,cleaned_tokens,party_D,party_I,party_Other,party_R
0,Louisiana,,,"[Warming, team, cages, games, Saturday, heard,...",0,0,0,1
1,Louisiana,Bluegrenades,,"[Bluegrenades, Tbh, could, made, difference, r...",0,0,0,1
2,Louisiana,,,"[CANT, TELL, IM, FUCKING, LIVID, DISGUSTING, P...",0,0,0,1
3,Louisiana,xavierbonilla87,,"[xavierbonilla87, problem, Biden, issue, likel...",0,0,0,1
4,Louisiana,,,"[still, recovered]",0,0,0,1


In [19]:
# Collapse to one row per state:
#   - party_* indicators are summed, giving tweet counts per party;
#   - mention / hashtags strings are joined with single spaces;
#   - cleaned_tokens uses 'sum', which concatenates the per-tweet lists into
#     one long token list per state.
# NOTE(review): concatenating lists through 'sum' copies repeatedly and may be
# slow for states with many tweets.
state_group = filtered_tweets.groupby('state').agg({'party_D':'sum', 'party_R':'sum', 'party_I':'sum', 'party_Other':'sum', 'mention': ' '.join, 'hashtags': ' '.join, 'cleaned_tokens': 'sum'})

In [20]:
state_group.head()

Unnamed: 0_level_0,party_D,party_R,party_I,party_Other,mention,hashtags,cleaned_tokens
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alabama,89.0,45.0,0.0,0.0,"realDonaldTrump LynnWilson111,Flyfi...","JobsJobsJobs, AL2 FightBack, COVID19, WINNI...","[Great, see, President, recovering, America, l..."
Arizona,377.0,260.0,0.0,0.0,"Michele_Henson,Tea_Party_Chris,RepGosar Michel...","PMA azcd7, barnettforcongress A...","[RepGosar, Michele, shhhh, always, voice, Amer..."
Arkansas,52.0,25.0,0.0,10.0,"POTUS PantsuitPolitic MarioDB,...","PBMs Election2020, ElectANurse world...","[Today, last, day, Arkansas, register, vote, u..."
California,1318.0,916.0,0.0,0.0,LawmanTommy LawmanTommy SonnieJohnson OP_Omom...,CA53 CA33 2020Census USA SCOTUS Her...,"[LawmanTommy, mom, said, LawmanTommy, mom, Son..."
Colorado,177.0,157.0,12.0,2.0,RepDLamborn RepSwalwell officialusalg R...,"CD6 DemandDougDebates CO03 CD6, Healthcare, P...","[Attention, CD6, Medicare, Open, Enrollment, p..."


## 4. Find the most popular topics and terms across all tweets

First, we lemmatize each word, i.e., reduce it to its root form so that different inflections of the same word are counted together. E.g., _running_, _ran_, and _runner_ should all be reduced to the root word _run_.

In [21]:
# Fetch the NLTK 'wordnet' corpus used for lemmatization in the cells below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manu9321\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
from nltk.corpus import wordnet as wn

In [23]:
def get_lemma(word):
    '''Return the WordNet base form of ``word``.

    The lookup is done on the lower-cased word, because WordNet's
    morphological lookup matches lower-case forms; a capitalised token such
    as "Games" would otherwise find no base form and be returned unreduced.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str
        The base form found by ``wn.morphy``, or ``word`` unchanged (original
        casing) when WordNet has no base form for it.
    '''
    lemma = wn.morphy(word.lower())
    return word if lemma is None else lemma

def lemmatize_tokens(tokens):
    '''Map every token to its lower-cased root form.

    Each token is passed through ``get_lemma`` and the result is lower-cased.
    The output list has the same length and order as ``tokens``.
    '''
    roots = []
    for tok in tokens:
        root = get_lemma(tok)
        roots.append(root.lower())
    return roots

In [24]:
# Lemmatize each state's combined token list; the result is one list of
# lower-cased root words per state.
state_group['token_root_words'] = state_group['cleaned_tokens'].apply(lemmatize_tokens)

In [25]:
from gensim import corpora
from gensim.models.ldamodel import LdaModel

In [26]:
def topics_for_state(token_roots, num_topics=7, num_words=5, passes=15,
                     model_path='model.gensim', random_state=None):
    '''Fit one LDA topic model over all documents and print its topics.

    Each element of ``token_roots`` (one token list per state in this
    notebook) is treated as a single document. A single model is trained
    across all of them; it is not fitted separately per state.

    Parameters
    ----------
    token_roots : iterable of list of str
        One list of (lemmatized) tokens per document. It is iterated twice,
        so it must be re-iterable (e.g. a list or a pandas Series).
    num_topics : int, optional
        Number of topics to extract (default 7).
    num_words : int, optional
        Number of highest-weighted words printed per topic (default 5).
    passes : int, optional
        Number of training passes over the corpus (default 15).
    model_path : str, optional
        File the trained model is saved to (default 'model.gensim'), so that
        it can be reloaded later with ``LdaModel.load``.
    random_state : int or None, optional
        Seed passed to ``LdaModel``. With the default ``None`` the topics can
        differ from run to run; pass an integer for reproducible output.

    Returns
    -------
    None
        The topics are printed, one ``(topic_id, formula)`` tuple per line.
    '''
    dictionary = corpora.dictionary.Dictionary(token_roots)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in token_roots]
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                        passes=passes, random_state=random_state)
    ldamodel.save(model_path)
    # Ask for exactly `num_topics` topics so that all of them are printed,
    # even when more topics are requested than print_topics shows by default.
    for topic in ldamodel.print_topics(num_topics=num_topics, num_words=num_words):
        print(topic)

In [27]:
# Train the topic model with each state's root-word list as one document.
# This prints the topics and writes the model to 'model.gensim'.
topics_for_state(state_group['token_root_words'])

(0, '0.006*"get" + 0.004*"one" + 0.004*"people" + 0.004*"like" + 0.003*"make"')
(1, '0.008*"get" + 0.006*"like" + 0.006*"people" + 0.006*"one" + 0.004*"know"')
(2, '0.003*"senthomtillis" + 0.003*"senatorburr" + 0.002*"repmarkwalker" + 0.002*"virginiafoxx" + 0.002*"ncpolitics"')
(3, '0.008*"vote" + 0.006*"people" + 0.006*"need" + 0.005*"get" + 0.005*"trump"')
(4, '0.000*"get" + 0.000*"like" + 0.000*"know" + 0.000*"one" + 0.000*"people"')
(5, '0.002*"florida" + 0.001*"utpol" + 0.001*"anewpathforward" + 0.001*"gaetz" + 0.001*"idpol"')
(6, '0.002*"nebraska" + 0.002*"eastman" + 0.002*"bacon" + 0.002*"kara" + 0.001*"fortenberry"')


In [28]:
import pyLDAvis.gensim
import gensim

  from collections import Iterable


In [29]:
# Rebuild the dictionary and bag-of-words corpus from the same per-state
# tokens that were passed to `topics_for_state`, then reload the model that
# function saved to 'model.gensim'.
dictionary = corpora.dictionary.Dictionary(state_group['token_root_words'])
corpus = [dictionary.doc2bow(token) for token in state_group['token_root_words']]
lda = gensim.models.ldamodel.LdaModel.load('model.gensim')

In [30]:
# Prepare the pyLDAvis visualization data for the reloaded model and write it
# to 'lda.html' in the current working directory.
p = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(p, 'lda.html')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


Here is a static view of the generated topics:

![image](https://user-images.githubusercontent.com/13968196/95792497-9eecdd00-0cb1-11eb-9ca3-a1ce31b97c4b.png)
