# COUNTRY BASED BENFORD'S ANALYSIS OF COVID-19 DATASET


# Abstract


Benford's Law (also known as Newcomb - Benford's Law) is a numerical data analysis of the frequency distribution of leading digits in many datasets. Benford's law predicts the distribution of first digits, second digits, third digits, combinations of digits, etc. Benford's law is applied, among other applications, for the identification of manipulated data, the detection of fraud and the identification of false data. This study attempts to analyze country based COVID-19 data and to find out the level of compliance of data with the Benford's Law. European Centre for Disease Prevention and Control data on the geographic distribution of COVID-19 cases worldwide is used as a dynamic data source. 


# 1) Libraries

## 1.1) Import the Benford module

In [1]:
!pip install benford_py --user
import benford as bf



## 1.2) Import some libraries

In [2]:
import numpy as np
import pandas as pd
import sys
import math
import matplotlib.pyplot as plt
from time import process_time
from collections import defaultdict

In [3]:
# Here we change the display option of pandas. 
pd.options.display.max_rows = 200
pd.options.display.max_columns = None

# 2) Read the data


In [4]:
covid_data_all = pd.read_csv("https://opendata.ecdc.europa.eu/covid19/casedistribution/csv", dayfirst=True, parse_dates=True)


In [5]:
covid_data_all

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
0,09/08/2020,9,8,2020,39,5,Afghanistan,AF,AFG,38041757.0,Asia,2.357935
1,08/08/2020,8,8,2020,78,9,Afghanistan,AF,AFG,38041757.0,Asia,2.573488
2,07/08/2020,7,8,2020,41,0,Afghanistan,AF,AFG,38041757.0,Asia,2.652349
3,06/08/2020,6,8,2020,67,4,Afghanistan,AF,AFG,38041757.0,Asia,2.578745
4,05/08/2020,5,8,2020,82,6,Afghanistan,AF,AFG,38041757.0,Asia,2.896817
...,...,...,...,...,...,...,...,...,...,...,...,...
35144,25/03/2020,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
35145,24/03/2020,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14645473.0,Africa,
35146,23/03/2020,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
35147,22/03/2020,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,


# 3) Rename the columns

In [6]:
covid_data_all.rename(columns={'dateRep':'date','countriesAndTerritories':'country', 'countryterritoryCode':'country_code', 'popData2019':'population_2019', 'continentExp':'continent', 'Cumulative_number_for_14_days_of_COVID-19_cases_per_100000':'cum_case_perht'}, inplace=True)

In [7]:
covid_data_all.columns.sort_values()

Index(['cases', 'continent', 'country', 'country_code', 'cum_case_perht',
       'date', 'day', 'deaths', 'geoId', 'month', 'population_2019', 'year'],
      dtype='object')

In [8]:
covid_data_all.head()

Unnamed: 0,date,day,month,year,cases,deaths,country,geoId,country_code,population_2019,continent,cum_case_perht
0,09/08/2020,9,8,2020,39,5,Afghanistan,AF,AFG,38041757.0,Asia,2.357935
1,08/08/2020,8,8,2020,78,9,Afghanistan,AF,AFG,38041757.0,Asia,2.573488
2,07/08/2020,7,8,2020,41,0,Afghanistan,AF,AFG,38041757.0,Asia,2.652349
3,06/08/2020,6,8,2020,67,4,Afghanistan,AF,AFG,38041757.0,Asia,2.578745
4,05/08/2020,5,8,2020,82,6,Afghanistan,AF,AFG,38041757.0,Asia,2.896817


In [9]:
covid_data_all.tail()

Unnamed: 0,date,day,month,year,cases,deaths,country,geoId,country_code,population_2019,continent,cum_case_perht
35144,25/03/2020,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
35145,24/03/2020,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14645473.0,Africa,
35146,23/03/2020,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
35147,22/03/2020,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,
35148,21/03/2020,21,3,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,


In [10]:
covid_data_all.describe()

Unnamed: 0,day,month,year,cases,deaths,population_2019,cum_case_perht
count,35149.0,35149.0,35149.0,35149.0,35149.0,35085.0,32368.0
mean,15.650317,4.925574,2019.998094,558.310165,20.682039,44378880.0,26.562097
std,8.970579,1.881439,0.043619,3495.782751,123.568938,161586600.0,65.259535
min,1.0,1.0,2019.0,-2461.0,-1918.0,815.0,-1.262589
25%,8.0,4.0,2020.0,0.0,0.0,1641164.0,0.21079
50%,16.0,5.0,2020.0,6.0,0.0,8544527.0,3.329671
75%,23.0,6.0,2020.0,110.0,2.0,30366040.0,19.243613
max,31.0,12.0,2020.0,78427.0,4928.0,1433784000.0,885.924117


# 4) Explore the data

In [11]:
covid_data_all.shape

(35149, 12)

In [12]:
covid_data_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35149 entries, 0 to 35148
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             35149 non-null  object 
 1   day              35149 non-null  int64  
 2   month            35149 non-null  int64  
 3   year             35149 non-null  int64  
 4   cases            35149 non-null  int64  
 5   deaths           35149 non-null  int64  
 6   country          35149 non-null  object 
 7   geoId            35001 non-null  object 
 8   country_code     35085 non-null  object 
 9   population_2019  35085 non-null  float64
 10  continent        35149 non-null  object 
 11  cum_case_perht   32368 non-null  float64
dtypes: float64(2), int64(5), object(5)
memory usage: 3.2+ MB


# 5) Filter the data

In [13]:
covid_cases_negative=covid_data_all[covid_data_all.cases <0].copy()


In [14]:
covid_cases_negative

Unnamed: 0,date,day,month,year,cases,deaths,country,geoId,country_code,population_2019,continent,cum_case_perht
3644,20/05/2020,20,5,2020,-209,0,Benin,BJ,BEN,11801151.0,Africa,0.288107
6236,10/03/2020,10,3,2020,-9,1,Cases_on_an_international_conveyance_Japan,JPG11668,,,Other,
9658,12/05/2020,12,5,2020,-50,18,Ecuador,EC,ECU,17373657.0,America,36.083365
9661,09/05/2020,9,5,2020,-1480,50,Ecuador,EC,ECU,17373657.0,America,101.504249
9663,07/05/2020,7,5,2020,-2461,49,Ecuador,EC,ECU,17373657.0,America,106.88596
11668,03/06/2020,3,6,2020,-766,107,France,FR,FRA,67012883.0,Europe,11.785793
16703,20/06/2020,20,6,2020,-148,47,Italy,IT,ITA,60359546.0,Europe,5.765451
17410,22/07/2020,22,7,2020,-110,0,Jordan,JO,JOR,10101697.0,Asia,-0.554362
19490,29/04/2020,29,4,2020,-105,3,Lithuania,LT,LTU,2794184.0,Europe,9.806083
26161,03/05/2020,3,5,2020,-161,16,Portugal,PT,PRT,10276617.0,Europe,53.568212


In [15]:
covid_cases_negative.shape

(15, 12)

In [16]:
covid_cases=covid_data_all[covid_data_all.cases >=10].copy() # At least case data should be double digit


In [17]:
covid_cases

Unnamed: 0,date,day,month,year,cases,deaths,country,geoId,country_code,population_2019,continent,cum_case_perht
0,09/08/2020,9,8,2020,39,5,Afghanistan,AF,AFG,38041757.0,Asia,2.357935
1,08/08/2020,8,8,2020,78,9,Afghanistan,AF,AFG,38041757.0,Asia,2.573488
2,07/08/2020,7,8,2020,41,0,Afghanistan,AF,AFG,38041757.0,Asia,2.652349
3,06/08/2020,6,8,2020,67,4,Afghanistan,AF,AFG,38041757.0,Asia,2.578745
4,05/08/2020,5,8,2020,82,6,Afghanistan,AF,AFG,38041757.0,Asia,2.896817
...,...,...,...,...,...,...,...,...,...,...,...,...
35075,02/06/2020,2,6,2020,25,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,1.072004
35077,31/05/2020,31,5,2020,14,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.887646
35078,30/05/2020,30,5,2020,11,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.805710
35079,29/05/2020,29,5,2020,17,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.764741


In [18]:
covid_cases_country_count=covid_cases.groupby('country', as_index=False)['cases'].agg(['sum','count'])

In [19]:
covid_cases_country_count

Unnamed: 0_level_0,sum,count
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,37010,129
Albania,6093,121
Algeria,34665,146
Andorra,797,33
Angola,1301,36
Antigua_and_Barbuda,39,1
Argentina,235633,135
Armenia,40153,144
Aruba,399,5
Australia,20520,136


In [20]:
covid_cases_country_count=covid_cases_country_count[covid_cases_country_count['count']>=109].copy() # At least minimum expected value should be 5 for each digit

In [21]:
covid_cases_country_count

Unnamed: 0_level_0,sum,count
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,37010,129
Albania,6093,121
Algeria,34665,146
Argentina,235633,135
Armenia,40153,144
Australia,20520,136
Austria,21899,155
Azerbaijan,33425,139
Bahrain,43563,145
Bangladesh,255043,126


In [22]:
covid_cases_country_list=covid_cases_country_count.index.values.tolist()
covid_cases_country_list

['Afghanistan',
 'Albania',
 'Algeria',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Bolivia',
 'Bosnia_and_Herzegovina',
 'Brazil',
 'Bulgaria',
 'Canada',
 'Chile',
 'China',
 'Colombia',
 'Costa_Rica',
 'Cote_dIvoire',
 'Czechia',
 'Democratic_Republic_of_the_Congo',
 'Denmark',
 'Dominican_Republic',
 'Ecuador',
 'Egypt',
 'El_Salvador',
 'Finland',
 'France',
 'Germany',
 'Greece',
 'Guatemala',
 'Guinea',
 'Honduras',
 'Hungary',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Japan',
 'Kazakhstan',
 'Kenya',
 'Kuwait',
 'Kyrgyzstan',
 'Malaysia',
 'Mexico',
 'Moldova',
 'Morocco',
 'Netherlands',
 'Nigeria',
 'North_Macedonia',
 'Norway',
 'Oman',
 'Pakistan',
 'Panama',
 'Peru',
 'Philippines',
 'Poland',
 'Portugal',
 'Puerto_Rico',
 'Qatar',
 'Romania',
 'Russia',
 'Saudi_Arabia',
 'Senegal',
 'Serbia',
 'Singapore',
 'South_Africa',
 'South_Korea',
 'Spain',
 'Sweden',
 'S

# 6) Benford's Law First Digit Chi-Square Test Function

In [23]:
def benford_first_digit(country):
    
    covid_cases_test=covid_cases[covid_cases['country']==country].copy()
        
    data=list(covid_cases_test.cases)

    for i in range(len(data)):

        while data[i]>=10:

            data[i]=data[i]/10

    first_digits=[int(x) for x in sorted(data)]

    unique=(set(first_digits))

    data_count=[]

    digits = [1, 2, 3, 4, 5, 6, 7, 8, 9]
              
    for i in digits:

        count=first_digits.count(i)
        
        data_count.append(count)
           
    total_count=sum(data_count)

    data_percentage=[(i/total_count)*100 for i in data_count]
        
    benford = [30.103, 17.6091, 12.4939, 9.691, 7.91812, 6.69468, 5.79919, 5.11525, 4.57575]
    
    expected = [(i * total_count / 100) for i in benford]
    
    chi_square_index = [0, 1, 2, 3, 4, 5, 6, 7, 8]
    
    chi_square = [((data_count[i]-expected[i])**2)/expected[i] for i in chi_square_index]
    
    chi_square_sum = sum(chi_square)
                
    return  chi_square_sum

# Benford's Law First Digit Chi-Square Test Results

In [24]:
benford_covid_cases_results = []
for i in covid_cases_country_list:
    benford_covid_cases_results.append((i, benford_first_digit(i)))                                    
benford_covid_cases_results=pd.DataFrame(benford_covid_cases_results, columns=('country', 'sum_of_chi_square'))
benford_covid_cases_results

Unnamed: 0,country,sum_of_chi_square
0,Afghanistan,19.902845
1,Albania,24.290613
2,Algeria,60.606765
3,Argentina,8.480627
4,Armenia,49.962258
5,Australia,16.030227
6,Austria,19.063969
7,Azerbaijan,69.772184
8,Bahrain,53.243931
9,Bangladesh,58.002261


In [25]:
# Here we add covid-19 case sum data.
benford_covid_cases_results=pd.merge(benford_covid_cases_results, covid_cases_country_count[['sum', 'count']], on='country', how='left')

In [26]:
# Here we change column names.
benford_covid_cases_results.rename(columns={'sum':'sum_of_covid_19_cases','count':'count_of_covid_19_reporting_dates'}, inplace=True)

In [27]:
benford_covid_cases_results

Unnamed: 0,country,sum_of_chi_square,sum_of_covid_19_cases,count_of_covid_19_reporting_dates
0,Afghanistan,19.902845,37010,129
1,Albania,24.290613,6093,121
2,Algeria,60.606765,34665,146
3,Argentina,8.480627,235633,135
4,Armenia,49.962258,40153,144
5,Australia,16.030227,20520,136
6,Austria,19.063969,21899,155
7,Azerbaijan,69.772184,33425,139
8,Bahrain,53.243931,43563,145
9,Bangladesh,58.002261,255043,126


# Benfords Law First Digit, First Two Digits, and Second Digit Conformity Tests

In [28]:
!pip install folium
import folium



In [29]:
# define the world map
world_map = folium.Map()

# display world map
world_map

In [34]:
world_geo = r'13_world_countries.json' # geojson file

# create a plain world map
world_map = folium.Map(location=[0, 0], zoom_start=2, tiles='Mapbox Bright')
world_map

In [32]:
world_map.choropleth(
    geo_data=world_geo,
    data=benford_covid_cases_results,
    columns=['country', 'sum_of_chi_square'],
    key_on='feature.properties.name',
    fill_color='YlOrRd', 
    fill_opacity=0.7, 
    line_opacity=0.2,
    legend_name='Immigration to Canada'
)

# display map
world_map



JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
world_geo = r'13_world_countries.json' # geojson file

# create a plain world map
world_map = folium.Map(location=[0, 0], zoom_start=2, tiles='Mapbox Bright')

In [None]:
# generate choropleth map using the total immigration of each country to Canada from 1980 to 2013
world_map.choropleth(
    geo_data=world_geo,
    data=benford_covid_cases_results,
    columns=['country','sum_of_chi_square'],
    key_on='feature.properties.name',
    fill_color='YlOrRd', 
    fill_opacity=0.7, 
    line_opacity=0.2,
    legend_name='Benfords Law First Digit Conformity: Sum of Chi-Squarey'
)

# display map
world_map



In [None]:
country_geo = 'https://github.com/python-visualization/folium/blob/master/examples/data/world-countries.json'
plot_data = benford_covid_cases_results[["country","sum_of_chi_square"]]
map = folium.Map(location=[0, 0], zoom_start=2.0)
map.choropleth(geo_data=country_geo, data=plot_data,
             columns=["country","sum_of_chi_square"],
             key_on='feature.id',
             fill_color='YlGnBu', fill_opacity=0.7, line_opacity=0.2,
             legend_name="Benford's Law First Digit Conformity: Sum of Chi-Square")
# display map
map

