In [None]:
#pip install pyreadr

In [2]:
import pandas as pd
import pyreadr
from functools import reduce

# Data Cleaning

## 0. Files for helping unify the USstate presentation

In [34]:
delineation_df = pd.read_excel('./data/delineation_file.xls')
delineation_df = delineation_df.iloc[1:1918]
delineation_df = delineation_df.rename(columns=delineation_df.iloc[0])
delineation_df = delineation_df.drop(1)
delineation_df['FIPS State Code'] = delineation_df['FIPS State Code'].astype(int)
delineation_df['CBSA Code'] = delineation_df['CBSA Code'].astype(int)
delineation_df = delineation_df[['CBSA Code', 'State Name', 'FIPS State Code', 'FIPS County Code']]

abbr_name = pd.read_csv('./data/abbr-name.csv')
abbr_name['abbr'] = abbr_name['abbr'].str.lower()

code_merged = pd.merge(delineation_df, abbr_name, left_on='State Name', right_on='full', how='left')
code_merged = code_merged.drop(columns=['full'])
code_merged.drop_duplicates()

Unnamed: 0,CBSA Code,State Name,FIPS State Code,FIPS County Code,abbr
0,10100,South Dakota,46,013,sd
1,10100,South Dakota,46,045,sd
2,10140,Washington,53,027,wa
3,10180,Texas,48,059,tx
4,10180,Texas,48,253,tx
...,...,...,...,...,...
1911,49700,California,6,101,ca
1912,49700,California,6,115,ca
1913,49740,Arizona,4,027,az
1914,49780,Ohio,39,119,oh


## 1. Crime Data 2012-2015

In [97]:
crime_df = pd.read_csv('./data/crime/crime.csv')
crime = crime_df[['STATE', 'YEAR', 'MONTH', 'OFFENSE', 'OCCUR', 'POP']]
crime['STATE'] = crime['STATE'].str.capitalize()
#crime_df = crime_df[crime_df['POP'] >= 3000]
crime = pd.merge(crime, abbr_name, left_on='STATE', right_on='full', how='left')
crime = crime.drop(columns=['full', 'OFFENSE'])

crime_agg = crime.groupby(['MONTH','YEAR','abbr']).sum().reset_index()
crime_agg = crime_agg.rename(columns={'MONTH': 'month','OCCUR': 'crime_counts'})
crime_agg['month'] = crime_agg['month'].astype(int)
crime_agg['YEAR'] = crime_agg['YEAR'].astype(int)
crime_agg['crime_proportion'] = crime_agg['crime_counts']/crime_agg['POP']
crime_agg = crime_agg.drop(columns=['POP'])
crime_agg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crime['STATE'] = crime['STATE'].str.capitalize()


Unnamed: 0,month,YEAR,abbr,crime_counts,crime_proportion
0,1,2012,ak,50,0.000029
1,1,2012,ar,41,0.000053
2,1,2012,az,259,0.000024
3,1,2012,ca,1158,0.000027
4,1,2012,co,255,0.000038
...,...,...,...,...,...
1820,12,2015,va,321,0.000023
1821,12,2015,vt,47,0.000165
1822,12,2015,wa,176,0.000023
1823,12,2015,wi,635,0.000089


## 2. Alcohol Data for 2012-2020

In [68]:
alcohol_17 = pd.read_csv('./data/alcohol/alcohol_17.csv')
alcohol_18 = pd.read_csv('./data/alcohol/alcohol_18.csv')
alcohol_19 = pd.read_csv('./data/alcohol/alcohol_19.csv')
alcohol_20 = pd.read_csv('./data/alcohol/alcohol_20.csv')
alcohol_17['YEAR'] = 2017
alcohol_18['YEAR'] = 2018
alcohol_19['YEAR'] = 2019
alcohol_20['YEAR'] = 2020
alcohol_17_20 = pd.concat([alcohol_17, alcohol_18, alcohol_19, alcohol_20])

new_df = alcohol_17_20['Geographic Area Name'].str.split(', ', expand=True)
alcohol_17_20['State_name'] = new_df[1]
alcohol_17_20.drop(columns =["Geographic Area Name"], inplace = True)
alcohol_17_20['alcohol'] = 0
alcohol_17_20 = alcohol_17_20.groupby(['YEAR', 'State_name']).count().reset_index()
alcohol_17_20['alcohol_per_month'] = round(alcohol_17_20['alcohol']/12, 2)
alcohol_17_20 = pd.merge(alcohol_17_20, abbr_name, left_on='State_name', right_on='full', how='left')
alcohol_17_20 = alcohol_17_20.dropna(subset=['full'])
alcohol_17_20 = alcohol_17_20.drop(columns=['alcohol', 'full'])

alcohol_12_16 = pd.read_csv('./data/alcohol/alcohol_2012_2016.csv')
alcohol_12_16['alcohol_per_month'] = round(alcohol_12_16['alcohol']/12, 2)
alcohol_12_16 = alcohol_12_16.fillna(0)
alcohol_12_16 = pd.merge(alcohol_12_16, code_merged, left_on='cbsa', right_on='CBSA Code', how='left')
alcohol_12_16 = alcohol_12_16.dropna(subset=['abbr'])
alcohol_12_16 = alcohol_12_16.drop(columns=['CBSA Code', 'FIPS State Code', 'FIPS County Code', 'cbsa', 'alcohol'])
alcohol_12_16 = alcohol_12_16.rename(columns={'State Name': 'State_name'})

alcohol_df = pd.concat([alcohol_12_16, alcohol_17_20])
alcohol_df = alcohol_df.groupby(['YEAR','abbr']).sum()
alcohol_df

Unnamed: 0_level_0,Unnamed: 1_level_0,alcohol_per_month
YEAR,abbr,Unnamed: 2_level_1
2012,ak,9.25
2012,al,86.40
2012,ar,82.83
2012,az,26.85
2012,ca,653.65
...,...,...
2020,vt,2.08
2020,wa,3.75
2020,wi,5.00
2020,wv,1.42


## 3. Twitter Data 2012-2021

In [82]:
# use proportion of misogyny tweets
all_tweets = pd.read_excel('./data/tweet/All_Tweets_State_Month_Year_05082022.xlsx')
misogyny_tweets = pd.read_excel('./data/tweet/Misogyny_Tweets_State_Month_Year_05082022.xlsx')
tweets = pd.merge(all_tweets, misogyny_tweets, on = ['USstate', 'year', 'month.keyword: Descending'])
tweets['misogyny_proportion'] = tweets['nTweets_y']/tweets['nTweets_x']
tweets = tweets.drop(columns=['nTweets_x', 'nTweets_y'])
tweets = tweets.sort_values(by=['year', 'month.keyword: Descending'], ascending=True)
tweets = tweets.rename(columns={'month.keyword: Descending': 'month'})
tweets

Unnamed: 0,USstate,year,month,misogyny_proportion
82,ar,2012,1,0.000129
185,ca,2012,1,0.000037
338,ct,2012,1,0.000093
423,fl,2012,1,0.000043
691,il,2012,1,0.000073
...,...,...,...,...
1375,mo,2021,12,0.000029
1809,ny,2021,12,0.000009
2093,pa,2021,12,0.000012
2266,tn,2021,12,0.000021


# 4. Merge all datasets

In [99]:
# alcohol and twitter
al_twitter = pd.merge(tweets, alcohol_df, right_on=['abbr','YEAR'], left_on=['USstate', 'year'], how='left')

# add crime
merged_df = pd.merge(al_twitter, crime_agg, left_on=['USstate', 'year', 'month'], right_on=['abbr', 'YEAR', 'month'], how='left')
merged_df = merged_df.fillna(-1)
merged_df = merged_df.drop(columns=['YEAR', 'abbr'])
merged_df

Unnamed: 0,USstate,year,month,misogyny_proportion,alcohol_per_month,crime_counts,crime_proportion
0,ar,2012,1,0.000129,82.83,41.0,0.000053
1,ca,2012,1,0.000037,653.65,1158.0,0.000027
2,ct,2012,1,0.000093,123.77,173.0,0.000043
3,fl,2012,1,0.000043,321.31,-1.0,-1.000000
4,il,2012,1,0.000073,947.98,13.0,0.000028
...,...,...,...,...,...,...,...
2463,mo,2021,12,0.000029,-1.00,-1.0,-1.000000
2464,ny,2021,12,0.000009,-1.00,-1.0,-1.000000
2465,pa,2021,12,0.000012,-1.00,-1.0,-1.000000
2466,tn,2021,12,0.000021,-1.00,-1.0,-1.000000


In [100]:
merged_df = merged_df[merged_df['crime_proportion']!=-1]
merged_df = merged_df.rename(columns={'alcohol_per_month': 'alcohol_counts'})
merged_df

Unnamed: 0,USstate,year,month,misogyny_proportion,alcohol_counts,crime_counts,crime_proportion
0,ar,2012,1,0.000129,82.83,41.0,0.000053
1,ca,2012,1,0.000037,653.65,1158.0,0.000027
2,ct,2012,1,0.000093,123.77,173.0,0.000043
4,il,2012,1,0.000073,947.98,13.0,0.000028
5,ky,2012,1,0.000093,243.25,238.0,0.000077
...,...,...,...,...,...,...,...
1308,nv,2015,12,0.000019,14.01,135.0,0.000018
1310,oh,2015,12,0.000017,277.14,314.0,0.000064
1311,pa,2015,12,0.000010,1013.69,600.0,0.000060
1313,tx,2015,12,0.000032,966.87,1110.0,0.000024


In [101]:
merged_df.to_csv('./data/merged_data.csv')