In [123]:
# Libraries that are part of the Python standard library
import gzip
import itertools
import json
import pathlib
import string
import time

# Third-party packages
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns

In [124]:
run_nltk_downloader = False

if run_nltk_downloader:
    nltk.download()

In [125]:
# create a pathlib.Path object for the data
raw_data_filepath = pathlib.Path('relevant_data_location.json')

# open the data filepath
with open(raw_data_filepath, 'r') as fp:
    tweet_data = json.load(fp)
    
print(f'The data includes {len(tweet_data)} observations.')

The data includes 3157730 observations.


In [134]:
#np.random.seed(0)
#sample_size = 5000
#excerpt = np.random.choice(tweet_data, size=sample_size, replace=False,)
#excerpt=tweet_data[0:10000]
column_names = [
    'Datetime',
    'Username',
    'User',
    'Loc1',
    'Loc2',
    'Language',
    'Text',
]
tweet_df = pd.DataFrame(tweet_data, columns = column_names)
#exc_tweet_df = pd.DataFrame(excerpt, columns = column_names)

In [None]:
#this block is for cleaning tokens. Will be useful once we get into topic modeling
user_defined_stop_words = ['..', '...', 'get', '\u200d', '’', '“', '”']

my_mwe_tuples = [
    ('u', '.', 's', '.'),
    ('covid', '-', '19'),
]

i = nltk.corpus.stopwords.words('english')
j = list(string.punctuation) + user_defined_stop_words
stopwords = set(i).union(j)

tknzr = nltk.tokenize.TweetTokenizer()
mwe_tknzr = nltk.tokenize.MWETokenizer()
mwe_tknzr.add_mwe(my_mwe_tuples)
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()


def create_tok(x):
    tokens = tknzr.tokenize(x.lower())
    tokens = mwe_tknzr.tokenize(tokens)
    tokens = [token for token in tokens if token not in string.punctuation]
    tokens = [token for token in tokens if token not in set(stopwords)]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens
    

In [None]:
exc_tweet_df['Tokens'] = exc_tweet_df['Text'].apply(create_tok)

In [156]:
#filter to get just tweets from AL use on 'tweet_data' for full dataset
alabama_df = tweet_df.loc[tweet_df['Loc2'].str.contains("AL | Alabama | al | alabama"),:]


In [135]:
#convert Datetime column to actual datetime
tweet_df.loc[:,'Datetime'] = pd.to_datetime(tweet_df['Datetime'], 
                                      format = '%a %b %d %H:%M:%S %z %Y')

In [158]:
#convert Datetime column to actual datetime
alabama_df.loc[:,'Datetime'] = pd.to_datetime(alabama_df['Datetime'], 
                                      format = '%a %b %d %H:%M:%S %z %Y')

In [None]:
#flatten tokens column. Not recommended for big dataframe
real_toks = [val for key, val in exc_tweet_df["Tokens"].iteritems()]
#had to get a flat list of all tokens; might be an easier way
flat_tok_list = [item for sublist in real_toks for item in sublist]
freq_dist = nltk.FreqDist(iter(flat_tok_list))
freq_dist.most_common(10)


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
sent_scores = []
for x in range(10000):
    sent_scores.append(analyser.polarity_scores(exc_tweet_df.loc[x, 'Text'])['compound'])
    
exc_tweet_df.loc[4, 'Text']
exc_tweet_df["CSP"] = sent_scores
exc_tweet_df.head()
exc_tweet_df.to_csv(r'sample_tweet_frame.csv', index = False)

In [171]:
'''new = tweet_df.sample(n=10000)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
#sent_scores = []
x=0
for index, row in new.iterrows():
    print(row["User"] + " " + str(analyser.polarity_scores(row['Text'])['compound']))
    #sent_scores.append(analyser.polarity_scores(row['Text'])['compound'])
    x = x + 1
    if(x > 50):
        break
new["CSP"] = sent_scores
new.to_csv(r'sample_tweet_frame.csv', index = False)'''

'new = tweet_df.sample(n=10000)\nfrom vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\nanalyser = SentimentIntensityAnalyzer()\n#sent_scores = []\nx=0\nfor index, row in new.iterrows():\n    print(row["User"] + " " + str(analyser.polarity_scores(row[\'Text\'])[\'compound\']))\n    #sent_scores.append(analyser.polarity_scores(row[\'Text\'])[\'compound\'])\n    x = x + 1\n    if(x > 50):\n        break\nnew["CSP"] = sent_scores\nnew.to_csv(r\'sample_tweet_frame.csv\', index = False)'

In [166]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
sent_scores = []
for index, row in alabama_df.iterrows():
    #print(row["User"] + " " + str(analyser.polarity_scores(row['Text'])['compound']))
    sent_scores.append(analyser.polarity_scores(row['Text'])['compound'])
    
alabama_df = pd.DataFrame(data=alabama_df)
alabama_df["CSP"] = sent_scores
alabama_df.head(20)

In [170]:
alabama_df.to_csv(r'alabama_tweet_frame.csv', index = False)

In [172]:
alabama_df["Datetime"].describe()

count                        215261
unique                       210373
top       2020-06-20 20:46:41+00:00
freq                              5
first     2020-03-27 23:20:45+00:00
last      2020-10-19 18:13:07+00:00
Name: Datetime, dtype: object

In [175]:
fips = pd.read_csv('alabama_csp_fips.csv')
fips.head()

Unnamed: 0,County,AvgCSP,FIPS,State,TweetCount
0,Choctaw County,,1023.0,AL,0
1,Clay County,,1027.0,AL,0
2,Cleburne County,,1029.0,AL,0
3,Coosa County,,1037.0,AL,0
4,County,,,,0


In [176]:
fips.head(15)

Unnamed: 0,County,AvgCSP,FIPS,State,TweetCount
0,Choctaw County,,1023.0,AL,0
1,Clay County,,1027.0,AL,0
2,Cleburne County,,1029.0,AL,0
3,Coosa County,,1037.0,AL,0
4,County,,,,0
5,Lamar County,,1075.0,AL,0
6,Sumter County,,1119.0,AL,0
7,Washington County,,1129.0,AL,0
8,Wilcox County,,1131.0,AL,0
9,Crenshaw County,-0.4007,1041.0,AL,1


In [178]:
fips = fips[fips['FIPS'].notna()]
fips.FIPS = fips.FIPS.astype(int)

In [194]:
import plotly.figure_factory as ff
import plotly

df_sample = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/laucnty16.csv')
df_sample['State FIPS Code'] = df_sample['State FIPS Code'].apply(lambda x: str(x).zfill(2))
df_sample['County FIPS Code'] = df_sample['County FIPS Code'].apply(lambda x: str(x).zfill(3))
df_sample['FIPS'] = df_sample['State FIPS Code'] + df_sample['County FIPS Code']

colorscale = ["#f7fbff", "#ebf3fb", "#deebf7", "#d2e3f3", "#c6dbef", "#b3d2e9", "#9ecae1",
    "#85bcdb", "#6baed6", "#57a0ce", "#4292c6", "#3082be", "#2171b5", "#1361a9",
    "#08519c", "#0b4083", "#08306b"
]
endpts = list(np.linspace(1, 12, len(colorscale) - 1))
fips = df_sample['FIPS'].tolist()
values = df_sample['Unemployment Rate (%)'].tolist()


fig = ff.create_choropleth(
    fips=fips, values=values, scope=['usa'],
    binning_endpoints=endpts, colorscale=colorscale,
    show_state_data=False,
    show_hover=True,
    asp = 2.9,
    title_text = 'USA by Unemployment %',
    legend_title = '% unemployed'
)
fig.layout.template = None
fig.show()

ValueError: 
The create_choropleth figure factory requires the plotly-geo package.
Install using pip with:

$ pip install plotly-geo

Or, install using conda with

$ conda install -c plotly plotly-geo


In [193]:
import geopandas
import shapely
import plotly
from plotly.figure_factory._county_choropleth import create_choropleth
import xlrd
# Check your version
print(plotly.__version__, geopandas.__version__,shapely.__version__,plotly-geo.__version__)

NameError: name 'geo' is not defined