# COVID-19 Pandemic Data Analysis

This notebook automatically creates a textual report for COVID-19 data analysis.

BigData and Blockchain lab by @SAnbaee

In [1]:
import pandas as pd
import numpy as np


In [2]:
# Location of the CSV export that the whole notebook analyses.
input_path = 'owid-covid-data.csv'

# Index the rows by the 'location' column so that one location's records can
# later be selected with main_df.loc[<location name>].
main_df = pd.read_csv(
    input_path,
    index_col=['location'],
)

# Show the (truncated) frame as a quick sanity check of the load.
print(main_df)




              iso_code        date  total_cases  new_cases  total_deaths  \
location                                                                   
Aruba              ABW  2020-03-13            2          2             0   
Aruba              ABW  2020-03-20            4          2             0   
Aruba              ABW  2020-03-24           12          8             0   
Aruba              ABW  2020-03-25           17          5             0   
Aruba              ABW  2020-03-26           19          2             0   
...                ...         ...          ...        ...           ...   
International      NaN  2020-02-28          705          0             4   
International      NaN  2020-02-29          705          0             6   
International      NaN  2020-03-01          705          0             6   
International      NaN  2020-03-02          705          0             6   
International      NaN  2020-03-10          696         -9             7   

           

In [3]:
def get_max(df, colName):
    """Return the largest value of a column and the date on which it occurred.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``colName`` and a ``'date'`` column.
        Any index is accepted; it does not have to be a 0..n-1 range.
    colName : str
        Name of the numeric column to scan.

    Returns
    -------
    tuple
        ``(max_value, date_of_max)``. On ties the first occurrence is used.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    column = df[colName]
    if len(column) == 0:
        raise ValueError('cannot take the maximum of an empty column: ' + str(colName))

    # Series.max() ignores NaN, unlike the built-in max(), whose result
    # depends on where a NaN happens to sit in the column.
    max_cases = column.max()

    # use numpy.argmax for accessing the element number with maximum value of colName
    max_cases_position = np.argmax(column)

    # argmax yields a 0-based position, not an index label, so the date must
    # be fetched positionally (.iloc) rather than by label (.loc).
    max_cases_date = df['date'].iloc[max_cases_position]

    # function with more than 1 output
    return (max_cases, max_cases_date)
                        
def get_min(df, colName):
    """Return the smallest value of a column and the date on which it occurred.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``colName`` and a ``'date'`` column.
        Any index is accepted; it does not have to be a 0..n-1 range.
    colName : str
        Name of the numeric column to scan.

    Returns
    -------
    tuple
        ``(min_value, date_of_min)``. On ties the first occurrence is used.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    column = df[colName]
    if len(column) == 0:
        raise ValueError('cannot take the minimum of an empty column: ' + str(colName))

    # Series.min() ignores NaN, unlike the built-in min(), whose result
    # depends on where a NaN happens to sit in the column.
    min_cases = column.min()

    # use numpy.argmin for accessing the element number with minimum value of colName
    min_cases_position = np.argmin(column)

    # argmin yields a 0-based position, not an index label, so the date must
    # be fetched positionally (.iloc) rather than by label (.loc).
    min_cases_date = df['date'].iloc[min_cases_position]

    # function with more than 1 output
    return (min_cases, min_cases_date)

In [9]:
# Only analyse the confirmed cases of Iran.
# get_max / get_min look up the date from an element number, so the rows need
# a 0..n-1 index. reset_index(drop=True) returns a new frame instead of
# mutating the slice taken from main_df in place.
df = main_df.loc['Iran'].reset_index(drop=True)

# Describe the statistical features of the dataframe.
print(df.describe())

# We must prepare the statistics of covid in a txt file.
# NOTE(review): outputPath is not used by the cell that writes the report
# (that cell writes to 'CovidReport.txt'); kept in case other code reads it.
outputPath = 'covid Analyssis Report.txt'

first_date = df['date'].min()
last_date = df['date'].max()

reportString  = 'Analysis of Covid-19 Pandemic during ' + first_date + ' to ' + last_date + '.\n'

# total_cases / total_deaths are read from the last row and reported "as of"
# the latest date (the original wording "since" described the opposite).
# Assumes the rows are in chronological order, as in the printed data - TODO confirm.
reportString += 'Total cases as of ' + last_date + ' is ' + str(df['total_cases'].iloc[-1]) + '.\n'
reportString += 'Total deaths as of ' + last_date + ' is ' + str(df['total_deaths'].iloc[-1]) + '.\n'

reportString += '========================* new_cases *==============================\n'

# use numpy.mean for average
avg_new_cases = np.mean(df['new_cases'])
reportString += 'average value of new_cases is : ' + str(avg_new_cases) + '\n'

max_new_cases, max_in_date = get_max(df, 'new_cases')
reportString += 'max number of new_cases is ' + str(max_new_cases) + ' in ' + max_in_date + '\n'

min_new_cases, min_in_date = get_min(df, 'new_cases')
reportString += 'min number of new_cases is ' + str(min_new_cases) + ' in ' + min_in_date + '\n'



         total_cases    new_cases  total_deaths  new_deaths  \
count     140.000000   140.000000    140.000000  140.000000   
mean    32818.978571   858.557143   2051.300000   49.914286   
std     40876.615744   961.370359   2542.336513   56.333218   
min         0.000000     0.000000      0.000000    0.000000   
25%         0.000000     0.000000      0.000000    0.000000   
50%      6863.500000   818.500000    215.500000   28.000000   
75%     72090.250000  1428.500000   4501.750000   92.250000   
max    120198.000000  5275.000000   6988.000000  292.000000   

       total_cases_per_million  new_cases_per_million  \
count               140.000000             140.000000   
mean                390.734936              10.221807   
std                 486.667239              11.445860   
min                   0.000000               0.000000   
25%                   0.000000               0.000000   
50%                  81.715000               9.744500   
75%                 858.289000   

In [10]:
reportString += '\n================* new_deaths *==================\n'

# use numpy.mean for average
# Stored under its own name so the new_cases average computed earlier is not
# overwritten with the deaths average.
avg_new_deaths = np.mean(df['new_deaths'])
# The labels below name new_deaths, the column actually being reported
# (they previously said new_cases).
reportString += 'average value of new_deaths is : ' + str(avg_new_deaths) + '\n'

max_new_deaths, max_in_date = get_max(df, 'new_deaths')
reportString += 'max number of new_deaths is ' + str(max_new_deaths) + ' in ' + max_in_date + '\n'

min_new_deaths, min_in_date = get_min(df, 'new_deaths')
reportString += 'min number of new_deaths is ' + str(min_new_deaths) + ' in ' + min_in_date + '\n'



#### In Pandas, **missing data** is represented by two values:
<ul>
    <li>None: None is a Python singleton object that is often used for missing data in Python code.</li>
    <li>NaN: NaN (an acronym for Not a Number) is a special floating-point value recognized by all systems that use the standard IEEE floating-point representation.</li>
</ul>

Pandas treats None and NaN as essentially *interchangeable* for indicating **missing** or **null** values. Useful functions for detecting, removing, and replacing null values in a Pandas DataFrame:
<ul>
<li>isnull()</li>
<li>notnull()</li>
<li>dropna()</li>
<li>fillna()</li>
<li>replace()</li>
<li>interpolate()</li>
</ul>

See more examples from [here](https://www.geeksforgeeks.org/working-with-missing-data-in-pandas/).

In [11]:
reportString += '\n================* new_tests *==================\n'

# Keep only the rows where new_tests is present. The mask is already boolean,
# so no "== True" comparison is needed, and reset_index(drop=True) returns a
# new 0..n-1 indexed frame instead of mutating the filtered slice in place.
bool_series = pd.notnull(df["new_tests"])
df2 = df[bool_series].reset_index(drop=True)

print(df2.shape)

# use numpy.mean for average
avg_new_tests = np.mean(df2['new_tests'])
# Labels use the column's real name, new_tests, on all three lines.
reportString += 'average value of new_tests is : ' + str(avg_new_tests) + '\n'

max_new_tests, max_in_date = get_max(df2, 'new_tests')
reportString += 'max number of new_tests is ' + str(max_new_tests) + ' in ' + max_in_date + '\n'

# Missing values were dropped above rather than replaced with zero, because
# replacing NaN with zero could change the minimum value.
min_new_tests, min_in_date = get_min(df2, 'new_tests')
reportString += 'min number of new_tests is ' + str(min_new_tests) + ' in ' + min_in_date + '\n'



(30, 28)


In [12]:
# write report string in a file
# Write the report string to a file. The with-block closes the file even if
# write() raises, and the explicit encoding keeps the output independent of
# the platform's default.
with open('CovidReport.txt', 'w', encoding='utf-8') as fh:
    fh.write(reportString)
