## Detecting Unusual Numbers: 
### An Application of Statistical Techniques to State-Reported COVID-19 Data

Outline of project steps:
 - Identify data source (done)
 - Clean and prep data, incl creating reference distributions 
      - for first digit (done)
 - Identify statistical tests to use
 - Apply tests and analyze findings
 - Finalize documentation

In [3]:
# import libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# import rpy2
# import met
# import os

In [4]:
# import data from source
# - https://covidtracking.com/about-data/data-definitions
df = pd.read_csv('all_states.csv', low_memory = False)

In [11]:
df.head()

Unnamed: 0,date,state,dataQualityGrade,death,deathConfirmed,deathIncrease,deathProbable,hospitalized,hospitalizedCumulative,hospitalizedCurrently,...,totalTestResults,totalTestResultsIncrease,totalTestsAntibody,totalTestsAntigen,totalTestsPeopleAntibody,totalTestsPeopleAntigen,totalTestsPeopleViral,totalTestsPeopleViralIncrease,totalTestsViral,totalTestsViralIncrease
0,2021-01-30,AK,A,262,0,0,0,1205,1205,39,...,1495885,6119,0,0,0,0,0,0,1495885,6119
1,2021-01-30,AL,A,7566,6094,0,1472,41859,41859,1879,...,2127308,0,0,0,104420,0,2127308,0,0,0
2,2021-01-30,AR,A+,4838,3896,7,942,13599,13599,911,...,2420194,12494,0,0,0,360922,0,0,2420194,12494
3,2021-01-30,AS,0,0,0,0,0,0,0,0,...,2140,0,0,0,0,0,0,0,2140,0
4,2021-01-30,AZ,A+,13098,11682,76,1416,52006,52006,3828,...,6643932,54791,416806,0,0,0,3440127,16808,6643932,54791


In [74]:
# Change nulls to zeros
df.fillna(0, inplace = True)

# Change all numerical columns to integers and take absolute value of negative numbers
num_cols = df.columns.tolist()[3:]
for n in num_cols:
    # take absolute values of any negative numbers 
    df[n] = df[n].apply(lambda x: int(abs(x)))

#### Create Dataset
 - Each observation is a day, each entity is a state

In [75]:
# Select features of interest
feature = 'positiveIncrease'
# first, second, second_last, last

In [76]:
# take first digit
def grab_first_digit(data, col):
    # Take all first digit values - if value is 0, code as 55
    data['first_digit'] = data[col].apply(lambda x: str(x)[0] if x != 0 else 55)   
    
    # Create restructured dataframe
    revised = pd.DataFrame(data.first_digit.value_counts(dropna = False))
    revised.reset_index(inplace = True)
    revised.rename(columns = {'index': 'digit', 'first_digit': col}, inplace = True)
    return revised

In [95]:
# step 1
# create list of all states
states = df['state'].unique().tolist()

# step 2
# create restructured dataframe of first state data
data = df.loc[df['state'] == states[0], ['date', feature]]
data.rename(columns = {feature: states[0]}, inplace = True)

first_dig_posInc = grab_first_digit(data, states[0])

# # step 3
# # restructure and append data for each state
for i in range(1, len(states)):
    # select state data
    data = df.loc[df['state'] == states[i], ['date', feature]].copy()
    data.rename(columns = {feature: states[i]}, inplace = True)
    first_dig_posInc = first_dig_posInc.merge(grab_first_digit(data, states[i]), how = 'outer', on = 'digit')
    
# Remove zero values from first digit dataset (coded as 55)
first_dig_posInc = first_dig_posInc.loc[first_dig_posInc['digit'] != 55]


In [96]:
first_dig_posInc

Unnamed: 0,digit,AK,AL,AR,AS,AZ,CA,CO,CT,DC,...,TN,TX,UT,VA,VI,VT,WA,WI,WV,WY
0,1,92,108,82,,80,76,66,87,80,...,103,97,109,85,71.0,107,77,72,140,90
1,2,42,67,46,,51,64,74,53,58,...,62,31,61,25,48.0,54,66,63,53,61
2,3,37,40,23,,46,57,76,19,46,...,43,28,47,32,21.0,36,64,42,31,44
3,5,32,21,30,,29,27,22,25,36,...,18,28,27,26,10.0,27,30,34,17,22
4,4,31,27,30,,42,43,57,23,25,...,42,40,34,41,11.0,24,32,45,19,42
5,6,31,12,28,,25,19,22,13,27,...,14,25,16,29,17.0,15,25,17,11,19
6,8,20,16,20,,19,15,5,11,19,...,14,23,9,31,,18,16,16,16,13
8,7,14,11,28,,18,16,4,15,20,...,17,29,11,19,6.0,18,17,22,11,16
9,9,13,20,17,,16,13,3,10,12,...,10,25,8,38,8.0,19,13,14,21,10


In [97]:
# Create population distribution
first_dig_posInc.fillna(0, inplace = True)
first_dig_posInc['popn_count'] = first_dig_posInc[states].sum(axis = 1).tolist()
first_dig_posInc['popn_proportion'] = first_dig_posInc['popn_count']/first_dig_posInc['popn_count'].sum()

In [98]:
# np.random.seed(123)
# ex = np.random.randint(low=1, high=10, size=9).tolist()

# Benford First Digit = log10(1+1/n)
ben = list(np.log10(1 + 1/np.arange(1, 10)))
first_dig_posInc['benford'] = ben

In [101]:
first_dig_posInc

Unnamed: 0,digit,AK,AL,AR,AS,AZ,CA,CO,CT,DC,...,VA,VI,VT,WA,WI,WV,WY,popn_count,popn_proportion,benford
0,1,92,108,82,0.0,80,76,66,87,80,...,85,71.0,107,77,72,140,90,4983.0,0.294069,0.30103
1,2,42,67,46,0.0,51,64,74,53,58,...,25,48.0,54,66,63,53,61,2845.0,0.167896,0.176091
2,3,37,40,23,0.0,46,57,76,19,46,...,32,21.0,36,64,42,31,44,1979.0,0.11679,0.124939
3,5,32,21,30,0.0,29,27,22,25,36,...,26,10.0,27,30,34,17,22,1404.0,0.082856,0.09691
4,4,31,27,30,0.0,42,43,57,23,25,...,41,11.0,24,32,45,19,42,1696.0,0.100089,0.079181
5,6,31,12,28,0.0,25,19,22,13,27,...,29,17.0,15,25,17,11,19,1180.0,0.069637,0.066947
6,8,20,16,20,0.0,19,15,5,11,19,...,31,0.0,18,16,16,16,13,929.0,0.054824,0.057992
8,7,14,11,28,0.0,18,16,4,15,20,...,19,6.0,18,17,22,11,16,1086.0,0.06409,0.051153
9,9,13,20,17,0.0,16,13,3,10,12,...,38,8.0,19,13,14,21,10,843.0,0.049749,0.045757


### Code Tests

In [4]:
t = pd.DataFrame({'obs': [ex], 'ref': [ref], 'sum': sum(ex)})
t['ref_adj'] = [[x * t['sum'][0] for x in t['ref'].tolist()[0]]]

In [7]:
m = met.Multinom(t['ref_adj'][0], t['obs'][0])

In [None]:
res = m.twosided_exact_test(save_cases = False)

In [None]:
res