# Task

1. Download and prepare data
2. Read data:
    - Sort population and set new index  
    
3. Data selection:
    - data selection
    - join data frames to one  
    
4. Data analysis:
    4.1. All data
       - hist
       - shapiro test
        
    4.2. Data standardization
        - polyfit
        
    4.3. Factors
    4.4. Europe data
    4.5. Anova

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import shapiro
import re

In [2]:
%cd ../data/

/home/adam/projects/github/engineer/data


In [3]:
%ls

In [4]:
# Read the two semicolon-delimited input tables from the working directory.
cov = pd.read_csv('covid.csv', delimiter=';')
pop = pd.read_csv('countryPopulation.csv', delimiter=';')

FileNotFoundError: [Errno 2] No such file or directory: 'covid.csv'

In [None]:
# Preview the first two rows of the `cov` table.
cov.head(2)

In [None]:
# (rows, columns) of the `pop` table.
pop.shape

In [None]:
# Preview the first two rows of the `pop` table.
pop.head(2)

In [None]:
# Keep only the rows of `cov` whose Date equals '2020-10-29'.
idx = cov['Date'].eq('2020-10-29')
tmp = cov[idx]
tmp.head()

In [None]:
# Attach the columns of `pop` to the selected rows. merge() defaults to an
# inner join, so only countries present in both tables are kept.
data = tmp.merge(pop,on='Country')
# Drop the 'other' entry. The explicit copy matters: `data` is sorted in
# place and given a new column further down, and doing that on a bare
# filtered selection raises pandas' SettingWithCopyWarning.
idx = data.Country != 'other'
data = data.loc[idx].copy()
print(data.shape)
data.head()

# analysis

In [None]:
# Summary statistics of the Cumdeaths column.
data.Cumdeaths.describe()

# gauss pdf

$$f = \frac{1}{\sigma \cdot \sqrt{2\pi}}\cdot e^{-0.5(\frac{x - \mu}{\sigma})^2}$$

In [None]:
def myPdf(data,n=100):
    '''Evaluate the normal (Gaussian) density fitted to a sample.

    The density is parameterised by ``data.mean()`` and ``data.std()`` and is
    evaluated on ``n`` evenly spaced points covering
    ``[data.min(), data.max()]``.

    Args:
        data:   pd.Series of numeric observations
        n:      int, number of points at which the density is evaluated

    Returns:
        (x, y): two np.ndarray of length ``n`` -- the evaluation points in
        ascending order and the density value at each of them.
    '''
    mu = data.mean()
    s = data.std()
    # An evenly spaced grid (instead of sorted random draws) makes the curve
    # reproducible and guarantees that both ends of the data range are hit.
    x = np.linspace(data.min(), data.max(), n)
    y = (1/(s*(2*np.pi)**0.5))*np.exp(-0.5*((x-mu)/s)**2)
    return x,y

In [None]:
# Evaluate the fitted normal density for the Cumdeaths column.
x,y = myPdf(data.Cumdeaths)

In [None]:
# Density-normalised histogram of Cumdeaths with the curve (x, y) drawn on
# top as a dashed red line.
plt.hist(data['Cumdeaths'], bins=30, density=True);
plt.plot(x, y, '--r')

### Normality test

In [None]:
from scipy.stats import shapiro

In [None]:
# Shapiro-Wilk normality test on the Cumdeaths column.
shapiro(data.Cumdeaths)

In [None]:
# Print the test result compactly: the values returned by shapiro() are
# unpacked into the format string (test statistic first, then p-value).
print('stats: {:.3f}\tp-val: {:.9f}'.format(*shapiro(data.Cumdeaths)))

### Bar plot

>- data sorting by 'Cumdeaths'
>- selection of countries for which 'Cumdeaths' is greater than 20
>- division of data into 4 parts 

In [None]:
# Order `data` by Cumdeaths (ascending, in place), then keep the rows with
# more than 20 cumulative deaths.
data.sort_values(by='Cumdeaths', inplace=True)
tmp = data.loc[data['Cumdeaths'] > 20]
tmp.shape

In [None]:
# Split the rows of `tmp` into 4 consecutive parts of (nearly) equal size.
# `db` ("division boundaries") holds the 5 integer row positions that
# delimit those parts.
db = np.linspace(0, len(tmp), 5).round().astype(int).tolist()
db

In [None]:
# One bar chart of Cumdeaths per slice of `tmp`, stacked vertically. The
# number of subplot rows follows the number of intervals in `db` rather than
# being hard-coded, so changing the split only requires changing `db`.
f = plt.figure(figsize=(16,25))

n_parts = len(db) - 1
# Pair each boundary with the next one to get the [start, stop) row range.
for i, (start, stop) in enumerate(zip(db[:-1], db[1:]), start=1):
    tmp1 = tmp.iloc[start:stop]

    plt.subplot(n_parts, 1, i)
    plt.bar(tmp1.Country, tmp1.Cumdeaths)
    plt.xticks(rotation=45)
    plt.grid()


plt.tight_layout()

### Simple regression

> linear model $y = ax+b$  
> checking the correlation between the population and the cumulative number of deaths

In [None]:
# Least-squares fit of a degree-1 polynomial (straight line) of Cumdeaths
# against c2020. np.polyfit returns the coefficients, highest power first.
cof = np.polyfit(data['c2020'], data['Cumdeaths'], deg=1)
cof

In [None]:
# Wrap the fitted coefficients in a callable polynomial object and print
# its textual representation.
poly = np.poly1d(cof)
print(poly)

In [None]:
# Evaluate the fitted polynomial at every c2020 value to get the y-values of
# the fitting line, then show the first five.
yLine = poly(data.c2020)
yLine[:5]

In [None]:
# Scatter plot of the observations with the fitted line drawn in red.
plt.scatter(data['c2020'], data['Cumdeaths'])
plt.plot(data['c2020'], yLine, 'r')
### Correlation coefficient (Pearson)

In [None]:
# Correlation-coefficient matrix of c2020 and Cumdeaths; the off-diagonal
# entries are the Pearson coefficient between the two columns.
np.corrcoef(data.c2020,data.Cumdeaths)

### Standardization: number of deaths per million inhabitants

In [None]:
# New column: Cumdeaths divided by c2020, scaled by 10**6 and rounded to one
# decimal place (c2020 is taken to be the population count -- confirm units).
data.loc[:, 'perMln'] = (data['Cumdeaths'] / data['c2020'] * 10**6).round(1)
# Highest rates first.
data = data.sort_values(by='perMln', ascending=False)
data.head()

In [None]:
# Keep the rows whose perMln exceeds 100, report the resulting shape, and
# order them ascending by perMln.
tmp = data.loc[data['perMln'] > 100]
print(f'Shape: {tmp.shape}')
tmp = tmp.sort_values(by='perMln')
tmp.head(3)

In [None]:
# Split the rows of `tmp` into 2 consecutive parts of (nearly) equal size.
# `db` ("division boundaries") holds the 3 integer row positions that
# delimit those parts.
db = np.linspace(0, len(tmp), 3).round().astype(int).tolist()
db

In [None]:
# One bar chart of perMln per slice of `tmp`, stacked vertically. The number
# of subplot rows follows the number of intervals in `db` rather than being
# hard-coded, so changing the split only requires changing `db`.
f = plt.figure(figsize=(12,15))

n_parts = len(db) - 1
# Pair each boundary with the next one to get the [start, stop) row range.
for i, (start, stop) in enumerate(zip(db[:-1], db[1:]), start=1):
    tmp1 = tmp.iloc[start:stop]

    plt.subplot(n_parts, 1, i)
    plt.bar(tmp1.Country, tmp1.perMln)
    plt.xticks(rotation=45)
    plt.grid()


plt.tight_layout()