## Outline:
* [Numpy](#numpy)
* [Pandas](#pandas)
* [Statistics](#statistics)
* [Plotting](#plotting)

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

%precision 2

csvpath = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'

## Numpy <a class="anchor" id="numpy"></a>

In [2]:
ar1 = np.arange(1, 11, 2)
ar2 = np.array(list(range(1, 11, 2)))
print('np.arange(1, 11, 2):', ar1)
print('np.array(list(range(1, 11, 2))):', ar2)
print('ar1 == ar2:', ar1 == ar2)
print('np.array_equal(ar1, ar2):', np.array_equal(ar1, ar2))
print()
ar3 = np.arange(1, 11).reshape(2,5)
print('np.arange(1, 11).reshape(2,5):\n', ar3)
print()
print('multi-dimentional:')
ar4 = np.arange(1,9).reshape(2,2,2)
print(ar4)
print()
print('np.linspace:')
ar5 = np.linspace(0, 1, 11)
print(ar5)
print('\nnp.diag')
ar6 = np.diag([1,2,3])
print(ar6)
print('\nnp.repeat')
print(np.array([1,2,3]*3))
print(np.repeat([1,2,3], 3))
print()
print('np.vstack([ar1, ar1**2]):\n', np.vstack([ar1, ar1**2]))
print()
print('np.hstack([ar1, ar1**2]):\n', np.hstack([ar1, ar1**2]))

np.arange(1, 11, 2): [1 3 5 7 9]
np.array(list(range(1, 11, 2))): [1 3 5 7 9]
ar1 == ar2: [ True  True  True  True  True]
np.array_equal(ar1, ar2): True

np.arange(1, 11).reshape(2,5):
 [[ 1  2  3  4  5]
 [ 6  7  8  9 10]]

multi-dimentional:
[[[1 2]
  [3 4]]

 [[5 6]
  [7 8]]]

np.linspace:
[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]

np.diag
[[1 0 0]
 [0 2 0]
 [0 0 3]]

np.repeat
[1 2 3 1 2 3 1 2 3]
[1 1 1 2 2 2 3 3 3]

np.vstack([ar1, ar1**2]):
 [[ 1  3  5  7  9]
 [ 1  9 25 49 81]]

np.hstack([ar1, ar1**2]):
 [ 1  3  5  7  9  1  9 25 49 81]


In [3]:
ar = np.arange(10)
print(ar.reshape(2,5))
print()
print(ar)
print()
print(ar.resize(2,5))
print()
print(ar)

[[0 1 2 3 4]
 [5 6 7 8 9]]

[0 1 2 3 4 5 6 7 8 9]

None

[[0 1 2 3 4]
 [5 6 7 8 9]]


In [4]:
a = np.arange(1, 4)
print('a:', a)
print('np.dot(a,a):', np.dot(a, a))
print('a.dot(a):', a.dot(a))
print('a.dot(a.T):', a.dot(a.T))
print('*'*10)
ar = np.arange(1, 7).reshape(2,3)
print('ar:\n', ar)
print('ar.dot(ar.T):\n', ar.dot(ar.T))
print('np.dot(ar,ar.T):\n', np.dot(ar,ar.T))
print('np.inner(ar, ar):\n', np.inner(ar, ar))
print('np.outer(ar, ar):\n', np.outer(ar, ar))

a: [1 2 3]
np.dot(a,a): 14
a.dot(a): 14
a.dot(a.T): 14
**********
ar:
 [[1 2 3]
 [4 5 6]]
ar.dot(ar.T):
 [[14 32]
 [32 77]]
np.dot(ar,ar.T):
 [[14 32]
 [32 77]]
np.inner(ar, ar):
 [[14 32]
 [32 77]]
np.outer(ar, ar):
 [[ 1  2  3  4  5  6]
 [ 2  4  6  8 10 12]
 [ 3  6  9 12 15 18]
 [ 4  8 12 16 20 24]
 [ 5 10 15 20 25 30]
 [ 6 12 18 24 30 36]]


In [5]:
ar = np.arange(1, 28)
ar.resize(3,3,3)
arorigin = ar.copy()
ar[-2:, -2:, -2:] = 0
print(ar)
print(), print(), print()
print(arorigin)

[[[ 1  2  3]
  [ 4  5  6]
  [ 7  8  9]]

 [[10 11 12]
  [13  0  0]
  [16  0  0]]

 [[19 20 21]
  [22  0  0]
  [25  0  0]]]



[[[ 1  2  3]
  [ 4  5  6]
  [ 7  8  9]]

 [[10 11 12]
  [13 14 15]
  [16 17 18]]

 [[19 20 21]
  [22 23 24]
  [25 26 27]]]


In [6]:
randar = np.random.randint(1, 10, (300,400)) #not inclusive
print(np.sum(randar==1))
print(np.sum(randar==10))
randar

13218
0


array([[5, 9, 4, ..., 2, 4, 2],
       [4, 3, 7, ..., 4, 1, 1],
       [7, 1, 2, ..., 8, 4, 4],
       ...,
       [1, 2, 9, ..., 7, 7, 1],
       [9, 7, 5, ..., 7, 5, 9],
       [1, 1, 3, ..., 2, 8, 9]])

In [7]:
ar = np.arange(4).reshape(2,2)
for i,j in zip(ar, ar):
    print(i)
    print(j)
    print()
    
ar = np.arange(8).reshape((2,2,2))
for i,j in zip(ar, ar):
    print(i)
    print(j)
    print()

[0 1]
[0 1]

[2 3]
[2 3]

[[0 1]
 [2 3]]
[[0 1]
 [2 3]]

[[4 5]
 [6 7]]
[[4 5]
 [6 7]]



## Pandas <a class="anchor" id="pandas"></a>

### Series

In [8]:
ser1 = pd.Series([10,20,30], index= ['A', 'B', 'C'], name='Serie 1')
print(ser1)
print()
ser2 = pd.Series({'A':10, 'B':20, 'C':30}, name='Serie 2')
print(ser2)
print()
print(ser1 == ser2)
print()
print(pd.Series.equals(ser1, ser2))

A    10
B    20
C    30
Name: Serie 1, dtype: int64

A    10
B    20
C    30
Name: Serie 2, dtype: int64

A    True
B    True
C    True
dtype: bool

True


In [9]:
ser1 = pd.Series(data= {'A':10, 'B':20, 'C':None})
print(ser1)
print()
print('index: ', ser1.index)
print()
print('values: ', ser1.values)
print()
print(ser1.dtype)
print(len(ser1), ser1.shape)
print()
print(ser1[0], ser1['B'], ser1['C'])
print(ser1.iloc[0], ser1.iloc[1], ser1.iloc[2])
print(ser1.loc['A'], ser1.loc['B'], ser1.loc['C'])
print()
print(ser1.describe())

A    10.0
B    20.0
C     NaN
dtype: float64

index:  Index(['A', 'B', 'C'], dtype='object')

values:  [10. 20. nan]

float64
3 (3,)

10.0 20.0 nan
10.0 20.0 nan
10.0 20.0 nan

count     2.000000
mean     15.000000
std       7.071068
min      10.000000
25%      12.500000
50%      15.000000
75%      17.500000
max      20.000000
dtype: float64


In [10]:
s = pd.Series([10, 20, 30.001, None])
s1 = s.copy()
s2 = s.copy()
s3 = s1.append(s2)
print('s3:')
print(s3)
print()
print('s3.iloc[0]: ', s3.iloc[0])
print('s3.loc[0]:')
print(s3.loc[0])
print()
print('s3[0]')
print(s3[0])
print()
print('mathematical operations:')
print('s1 * s2')
print(s1 * s2)
print()
print('s1 / s2')
print(s1 / s2)
print()
print('s1 + s2')
print(s1 + s2)

s3:
0    10.000
1    20.000
2    30.001
3       NaN
0    10.000
1    20.000
2    30.001
3       NaN
dtype: float64

s3.iloc[0]:  10.0
s3.loc[0]:
0    10.0
0    10.0
dtype: float64

s3[0]
0    10.0
0    10.0
dtype: float64

mathematical operations:
s1 * s2
0    100.000000
1    400.000000
2    900.060001
3           NaN
dtype: float64

s1 / s2
0    1.0
1    1.0
2    1.0
3    NaN
dtype: float64

s1 + s2
0    20.000
1    40.000
2    60.002
3       NaN
dtype: float64


In [11]:
s = pd.Series(range(10, 13))
s += 1
print(s)
print()
s[3] = None
print(s)
print()
print('s[3] == None: ', s[3] == None)
print()
print('s[3] == np.nan: ', s[3] == np.nan)
print('s[3] is np.nan: ', s[3] is np.nan)
print('np.nan == np.nan: ', np.nan == np.nan)
print('np.isnan(np.nan): ', np.isnan(np.nan))
print('pd.isna(s[3]):', pd.isna(s[3]))
print('pd.isna(s):\n', pd.isna(s))

0    11
1    12
2    13
dtype: int64

0      11
1      12
2      13
3    None
dtype: object

s[3] == None:  True

s[3] == np.nan:  False
s[3] is np.nan:  False
np.nan == np.nan:  False
np.isnan(np.nan):  True
pd.isna(s[3]): True
pd.isna(s):
 0    False
1    False
2    False
3     True
dtype: bool


### DataFrames

In [12]:
ser1 = pd.Series({'A':1, 'B':2, 'C':3})
ser2 = pd.Series({'A':4, 'B':5, 'C':6, 'D':None})
ser3 = pd.Series({'A':7, 'B':8, 'C':9, 'E': 10.0})
pd.DataFrame([ser1, ser2, ser3], index=['Serie 1', 'Serie 2', 'Serie 3'])

Unnamed: 0,A,B,C,D,E
Serie 1,1.0,2.0,3.0,,
Serie 2,4.0,5.0,6.0,,
Serie 3,7.0,8.0,9.0,,10.0


In [13]:
dfalldata = pd.read_csv(csvpath)
dforigin = pd.read_csv(csvpath, usecols=['location', 'date', 'new_cases', 'new_deaths', 'new_tests', 
                                         'population', 'population_density', 'hospital_beds_per_100k'])
print(dforigin.head(2))
print()
print(dforigin.tail(2))
print()
print(dforigin.describe())

  location        date  new_cases  new_deaths  new_tests  population  \
0    Aruba  2020-03-13          2           0        NaN    106766.0   
1    Aruba  2020-03-20          2           0        NaN    106766.0   

   population_density  hospital_beds_per_100k  
0               584.8                     NaN  
1               584.8                     NaN  

            location        date  new_cases  new_deaths  new_tests  \
20764  International  2020-03-02          0           0        NaN   
20765  International  2020-03-10         -9           1        NaN   

       population  population_density  hospital_beds_per_100k  
20764         NaN                 NaN                     NaN  
20765         NaN                 NaN                     NaN  

           new_cases    new_deaths      new_tests    population  \
count   20766.000000  20766.000000    5036.000000  2.070200e+04   
mean      568.223635     35.143119   10984.897736  1.070395e+08   
std      5061.068184    333.06201

In [14]:
df = dforigin.copy()
print(len(df), df.shape)
print('df index: ', df.index)
print('df columns: ', df.columns)
print()
df.set_index('date', inplace=True)
df.sort_index(inplace=True)
print('df index: ', df.index)
print()
print(df.head(2))

20766 (20766, 8)
df index:  RangeIndex(start=0, stop=20766, step=1)
df columns:  Index(['location', 'date', 'new_cases', 'new_deaths', 'new_tests',
       'population', 'population_density', 'hospital_beds_per_100k'],
      dtype='object')

df index:  Index(['2019-12-31', '2019-12-31', '2019-12-31', '2019-12-31', '2019-12-31',
       '2019-12-31', '2019-12-31', '2019-12-31', '2019-12-31', '2019-12-31',
       ...
       '2020-05-30', '2020-05-30', '2020-05-30', '2020-05-30', '2020-05-30',
       '2020-05-30', '2020-05-30', '2020-05-30', '2020-05-30', '2020-05-30'],
      dtype='object', name='date', length=20766)

                      location  new_cases  new_deaths  new_tests  population  \
date                                                                           
2019-12-31  Dominican Republic          0           0        NaN  10847904.0   
2019-12-31              France          0           0        NaN  65273512.0   

            population_density  hospital_beds_per_100k  


In [15]:
# Multi-level index
df = (dforigin[['location', 'date', 'new_cases', 'new_deaths', 'new_tests']]
      .copy()
      .set_index(['location', 'date'])
      .sort_index())
print(df.head(3))
print()

# df['location'] and df[['location', 'date']] --> KeyError
print('df.index.nlevels:', df.index.nlevels)
print('df.index.names:', df.index.names)
print(df.index[:2])
print(df.index.levels[1][:2])
print()
print(df.loc[('Austria', '2020-01-01'):('Austria', '2020-01-03'), ['new_cases', 'new_deaths']])
print(), print()
print(df.loc[('Afghanistan', '2020-01-01'):('Armenia', '2020-01-01'), ['new_cases', 'new_deaths']])
print()

                        new_cases  new_deaths  new_tests
location    date                                        
Afghanistan 2019-12-31          0           0        NaN
            2020-01-01          0           0        NaN
            2020-01-02          0           0        NaN

df.index.nlevels: 2
df.index.names: ['location', 'date']
MultiIndex([('Afghanistan', '2019-12-31'),
            ('Afghanistan', '2020-01-01')],
           names=['location', 'date'])
Index(['2019-12-31', '2020-01-01'], dtype='object', name='date')

                     new_cases  new_deaths
location date                             
Austria  2020-01-01          0           0
         2020-01-02          0           0
         2020-01-03          0           0


                        new_cases  new_deaths
location    date                             
Afghanistan 2020-01-01          0           0
            2020-01-02          0           0
            2020-01-03          0           0
            2020-0

In [16]:
df = dforigin.copy()
df.loc[100000] = None
print('df.loc[100000] = None\n', df.tail(3))
print()
print('df.isna().sum().sum(): ', df.isna().sum().sum())
print()
df.fillna('$$$', inplace=True)
df[100000] = None
print('df[100000] = None !new column is added with title 100000! \n', df.tail(3))
print()
print('df.isna().sum().sum(): ', df.isna().sum().sum())

df.loc[100000] = None
              location        date  new_cases  new_deaths  new_tests  \
20764   International  2020-03-02        0.0         0.0        NaN   
20765   International  2020-03-10       -9.0         1.0        NaN   
100000            NaN         NaN        NaN         NaN        NaN   

        population  population_density  hospital_beds_per_100k  
20764          NaN                 NaN                     NaN  
20765          NaN                 NaN                     NaN  
100000         NaN                 NaN                     NaN  

df.isna().sum().sum():  20161

df[100000] = None !new column is added with title 100000! 
              location        date new_cases new_deaths new_tests population  \
20764   International  2020-03-02         0          0       $$$        $$$   
20765   International  2020-03-10        -9          1       $$$        $$$   
100000            $$$         $$$       $$$        $$$       $$$        $$$   

       population_densi

#### Group by

In [17]:
df = dforigin.copy().set_index('date').sort_index()
group, frame = next(iter(df.groupby('location')))
print(group)
print('type(frame): ', type(frame))
print(frame.head(2))

Afghanistan
type(frame):  <class 'pandas.core.frame.DataFrame'>
               location  new_cases  new_deaths  new_tests  population  \
date                                                                    
2019-12-31  Afghanistan          0           0        NaN  38928341.0   
2020-01-01  Afghanistan          0           0        NaN  38928341.0   

            population_density  hospital_beds_per_100k  
date                                                    
2019-12-31              54.422                     0.5  
2020-01-01              54.422                     0.5  


In [18]:
df = dforigin.copy()
print(df.groupby('location').agg({'new_cases': [np.sum, np.max], 'new_deaths': np.average, 'new_tests': np.sum}).head(3))

            new_cases       new_deaths new_tests
                  sum  amax    average       sum
location                                        
Afghanistan     13659  1063   1.732394       0.0
Albania          1099    34   0.397590       0.0
Algeria          9134   199   4.340136       0.0


In [19]:
df = dforigin.copy().set_index(['location', 'date']).sort_index()
print(df.groupby(level=0)['new_cases', 'new_deaths', 'new_tests'].agg([np.sum,np.max]).head(2))
print()
print(df.groupby(level=1)['new_cases', 'new_deaths', 'new_tests'].agg({'sum': np.sum, 'max': np.max}).tail(2))

            new_cases       new_deaths      new_tests     
                  sum  amax        sum amax       sum amax
location                                                  
Afghanistan     13659  1063        246   32       0.0  NaN
Albania          1099    34         33    3       0.0  NaN

                 sum                             max                     
           new_cases new_deaths  new_tests new_cases new_deaths new_tests
date                                                                     
2020-05-29    239680       9470  1398914.0    119840       4735  492276.0
2020-05-30    244708       9600   749661.0    122354       4800  316139.0


in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


#### Pivot tables

In [20]:
df = dforigin.copy()
df.pivot_table(values= 'new_cases', index='date', columns= 'location', aggfunc=np.sum).sort_index()

location,Afghanistan,Albania,Algeria,Andorra,Angola,Anguilla,Antigua and Barbuda,Argentina,Armenia,Aruba,...,Uruguay,Uzbekistan,Vatican,Venezuela,Vietnam,Western Sahara,World,Yemen,Zambia,Zimbabwe
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-31,0.0,,0.0,,,,,,0.0,,...,,,,,0.0,,27.0,,,
2020-01-01,0.0,,0.0,,,,,,0.0,,...,,,,,0.0,,0.0,,,
2020-01-02,0.0,,0.0,,,,,,0.0,,...,,,,,0.0,,0.0,,,
2020-01-03,0.0,,0.0,,,,,,0.0,,...,,,,,0.0,,17.0,,,
2020-01-04,0.0,,0.0,,,,,,0.0,,...,,,,,0.0,,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-05-26,591.0,6.0,197.0,0.0,0.0,0.0,0.0,552.0,452.0,0.0,...,18.0,97.0,0.0,56.0,1.0,0.0,89096.0,15.0,0.0,0.0
2020-05-27,658.0,25.0,194.0,0.0,2.0,0.0,0.0,600.0,289.0,0.0,...,2.0,72.0,0.0,34.0,1.0,0.0,95870.0,12.0,0.0,0.0
2020-05-28,625.0,21.0,160.0,0.0,0.0,0.0,0.0,705.0,372.0,0.0,...,14.0,0.0,0.0,34.0,0.0,0.0,101548.0,6.0,137.0,76.0
2020-05-29,580.0,26.0,140.0,0.0,2.0,0.0,0.0,769.0,442.0,0.0,...,8.0,111.0,0.0,82.0,0.0,0.0,119840.0,23.0,0.0,17.0


#### Date and time

In [21]:
print(pd.Timestamp('9/1/2016 10:05AM'))
print(pd.Period('1/2016'))
print(pd.to_datetime('4.7.12', dayfirst=True))
print(pd.Timestamp('9/3/2016')-pd.Timestamp('9/1/2016'))
print(pd.Timestamp('9/2/2016 8:10AM') + pd.Timedelta('12D 3H'))

2016-09-01 10:05:00
2016-01
2012-07-04 00:00:00
2 days 00:00:00
2016-09-14 11:10:00


In [22]:
df = dforigin.copy().set_index('date').sort_index()
df.index = pd.to_datetime(df.index)
print(df.index.weekday_name)
print()
df['2020-05-01':'2020-05-03']

Index(['Tuesday', 'Tuesday', 'Tuesday', 'Tuesday', 'Tuesday', 'Tuesday',
       'Tuesday', 'Tuesday', 'Tuesday', 'Tuesday',
       ...
       'Saturday', 'Saturday', 'Saturday', 'Saturday', 'Saturday', 'Saturday',
       'Saturday', 'Saturday', 'Saturday', 'Saturday'],
      dtype='object', name='date', length=20766)



Unnamed: 0_level_0,location,new_cases,new_deaths,new_tests,population,population_density,hospital_beds_per_100k
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-05-01,Uruguay,13,2,731.0,3473727.0,19.751,2.80
2020-05-01,Switzerland,179,15,4470.0,8654618.0,214.243,4.53
2020-05-01,Mexico,1425,127,2955.0,128932753.0,66.444,1.38
2020-05-01,Anguilla,0,0,,15002.0,,
2020-05-01,Western Sahara,0,0,,597330.0,,
...,...,...,...,...,...,...,...
2020-05-03,Somalia,70,3,,15893219.0,23.500,0.90
2020-05-03,Niger,8,2,,24206636.0,16.955,0.30
2020-05-03,Thailand,6,0,,69799978.0,135.132,2.10
2020-05-03,Nigeria,218,17,,206139587.0,209.588,


## Statistics <a class="anchor" id="statistics"></a>

In [23]:
print('binomial: ', np.random.binomial(1, 0.9, 10))
print('uniform : ', np.random.uniform(0, 5, 10))
print('normal  : ', np.random.normal(0, 1, 10))

binomial:  [1 0 1 1 1 1 1 1 0 1]
uniform :  [4.06 2.9  2.31 0.81 4.56 0.62 0.54 2.12 3.35 3.1 ]
normal  :  [-1.68  0.37 -1.34  0.47 -1.01  0.14 -0.3  -1.45 -0.17 -0.51]


**stats.ttest_ind?**

Calculate the T-test for the means of *two independent* samples of scores.

This is a two-sided test for the null hypothesis that 2 independent samples
have identical average (expected) values. This test assumes that the
populations have identical variances by default.

**Hypothesis: Italy and Spain have reported different cumulative death-rate?**

In [24]:
# calculating the average increase rate in new_deaths for each country
alpha = 0.01
country1, country2 = 'Italy', 'Spain'
df = (dfalldata[['location', 'date', 'new_cases', 'new_deaths']]
      .copy())
df['date'] = pd.to_datetime(df['date'])
df = df.set_index(['location', 'date']).sort_index()
print(df.head(3))
print()
c1df = df.loc[country1]
c1df[['new_cases', 'new_deaths']] = (c1df[['new_cases', 'new_deaths']]
                                 .cumsum())
c1df.rename(columns={'new_cases':'cases', 'new_deaths':'deaths'}, inplace=True)
print('country1df:', c1df.head(3))
print()
c2df = df.loc[country2]
c2df[['new_cases', 'new_deaths']] = (c2df[['new_cases', 'new_deaths']]
                                 .cumsum())
c2df.rename(columns={'new_cases':'cases', 'new_deaths':'deaths'}, inplace=True)
print('country2df:', c2df.head(3))
print()
# Use the data after first death reported
c1initdate = c1df.loc[c1df['deaths'] > 0, 'deaths'].idxmin()
c1df = c1df.loc[c1initdate:]
c2initdate = c2df.loc[c2df['deaths'] > 0, 'deaths'].idxmin()
c2df = c2df.loc[c2initdate:]
c1df['death_rate'] = (c1df['deaths'] / c1df['cases']).fillna(0)
c2df['death_rate'] = (c2df['deaths'] / c2df['cases']).fillna(0)
res = stats.ttest_ind(c1df['death_rate'], c2df['death_rate'], equal_var=False)
if res.pvalue < alpha:
    # reject null hypothesis
    print('pvalue', res.pvalue)
    print(f'At {alpha} level of significance, {country1} and {country2} have different death-rates')
else:
    print('pvalue', res.pvalue)
    print(f'At {alpha} level of significance, {country1} and {country2} might have the same death-rates')

                        new_cases  new_deaths
location    date                             
Afghanistan 2019-12-31          0           0
            2020-01-01          0           0
            2020-01-02          0           0

country1df:             cases  deaths
date                     
2019-12-31      0       0
2020-01-01      0       0
2020-01-02      0       0

country2df:             cases  deaths
date                     
2019-12-31      0       0
2020-01-01      0       0
2020-01-02      0       0

pvalue 0.00010974102659470714
At 0.01 level of significance, Italy and Spain have different death-rates


## Plotting <a class="anchor" id="plotting"></a>

In [25]:
%matplotlib notebook

In [26]:
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.get_backend()

'nbAgg'

In [28]:
plt.plot(c1df['death_rate'], '-o', label=country1)
plt.plot(c2df['death_rate'], '-o', label=country2)
plt.xlabel('day')
plt.ylabel('death-rate')
plt.title('Death-rate time series')
# add the legend to loc=4 (the lower right hand corner), also gets rid of the frame and adds a title
plt.legend(loc='upper left', frameon=False, title='Countries')

# get the current axes
ax = plt.gca()
# Set axis properties [xmin, xmax, ymin, ymax]
ax.axis([pd.to_datetime('2020-03-01'), pd.to_datetime('2020-05-01'),0.0,0.2])
plt.xticks(rotation=15);

<IPython.core.display.Javascript object>

In [29]:
plt.figure()
plt.subplot(1,2,1)
plt.plot(c1df['death_rate'], '-', label=country1)
plt.xlabel('day')
plt.ylabel('death-rate')
plt.xticks(rotation=30);

plt.subplot(1,2,2)
plt.plot(c2df['death_rate'], '-', label=country2)
plt.xlabel('day')
plt.xticks(rotation=30);

<IPython.core.display.Javascript object>

### Histograms 

In [30]:
# create 2x2 grid of axis subplots
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, sharex=True)
axs = [ax1, ax2, ax3, ax4, ax5, ax6]

# draw n = 10, 100, 1000, and 10000 samples from the normal distribution and plot corresponding histograms
for n in range(0,len(axs)):
    sample_size = 10**(n+1)
    sample = np.random.normal(loc=0.0, scale=1.0, size=sample_size)
    axs[n].hist(sample)
    axs[n].set_title('n={}'.format(sample_size))

<IPython.core.display.Javascript object>

### Box and Whisker Plots

In [31]:
import pandas as pd
normal_sample = np.random.normal(loc=0.0, scale=1.0, size=10000)
random_sample = np.random.random(size=10000)
gamma_sample = np.random.gamma(2, size=10000)

df = pd.DataFrame({'normal': normal_sample, 
                   'random': random_sample, 
                   'gamma': gamma_sample})

In [32]:
df.describe()

Unnamed: 0,normal,random,gamma
count,10000.0,10000.0,10000.0
mean,0.01134,0.502695,1.996486
std,1.004938,0.289071,1.416175
min,-3.724145,4.9e-05,0.015962
25%,-0.665705,0.250969,0.958414
50%,0.011476,0.504386,1.667854
75%,0.68814,0.751156,2.68498
max,3.603183,0.999902,10.352564


In [33]:
plt.figure()
plt.subplot(1,2,1)
# plot boxplots for all three of df's columns
_ = plt.boxplot([ df['normal'], df['random'], df['gamma'] ], whis='range')

plt.subplot(1,2,2)
# if `whis` argument isn't passed, boxplot defaults to showing 1.5*interquartile (IQR) whiskers with outliers
_ = plt.boxplot([ df['normal'], df['random'], df['gamma'] ])

<IPython.core.display.Javascript object>

### Heatmaps

In [34]:
plt.figure()

Y = np.random.normal(loc=0.0, scale=1.0, size=10000)
X = np.random.random(size=10000)
_ = plt.hist2d(X, Y, bins=25)

# add a colorbar legend
plt.colorbar()

<IPython.core.display.Javascript object>

<matplotlib.colorbar.Colorbar at 0x105ff50>

### Pandas Visualization

In [35]:
%matplotlib notebook
plt.style.use('seaborn-colorblind')

In [37]:
df = dforigin.copy()
df['date'] = pd.to_datetime(df['date'])
df['death-per-100k'] = df['new_deaths'] / df['population'] * 100000
dfpiv = (df.pivot_table(values= 'death-per-100k', index='date', columns= 'location')
      .sort_index()
      .fillna(0))


fig, axes = plt.subplots(nrows=2, ncols=2)
countries = ['United States', 'United Kingdom', 'Germany', 'France', 'Italy']
dfpiv.loc['2020-03-01': '2020-06-01', countries].plot(ax=axes[0,0])
countries = ['Iran', 'Turkey', 'Saudi Arabia', 'Russia', 'Pakistan']
dfpiv.loc['2020-03-01': '2020-06-01', countries].plot(ax=axes[0, 1])
countries = ['China', 'India', 'Indonesia', 'Brazil', 'Nigeria']
dfpiv.loc['2020-03-01': '2020-06-01', countries].plot(ax=axes[1, 0])
countries = np.random.choice(dfpiv.columns, 5, replace=False)
dfpiv.loc['2020-03-01': '2020-06-01', countries].plot(ax=axes[1, 1])

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0xa1dc970>

In [38]:
plt.clf()
countries = ['United States', 'United Kingdom', 'Germany', 'France', 'Italy']
dfpiv.loc['2020-03-01': '2020-06-01', countries].plot.kde()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0xa1f5890>

[Kernel density estimation plots](https://en.wikipedia.org/wiki/Kernel_density_estimation) are useful for deriving a smooth continuous function from a given sample.

In [39]:
df = dforigin.copy().set_index('location')
df['date'] = pd.to_datetime(df['date'])
df = df.loc['Italy'].sort_values('date')
df.plot('new_cases', 'new_deaths', kind='scatter')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0xa1f5510>

In [40]:
df = dforigin.copy().set_index('location')
df = df.loc['Germany'].sort_values('date', ascending=True)
df = df[df['new_cases']> 0]
# create a scatter plot of columns 'new_deaths' and 'new_cases', with changing color (c) and size (s) based on day and ratio
day = np.arange(len(df))
ratio = df['new_cases']/df['new_deaths']
df.plot.scatter('new_deaths', 'new_cases', c=day, s=ratio, colormap='viridis')
ax.set_aspect('equal')

<IPython.core.display.Javascript object>

### Seaborn

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib notebook

In [42]:
df = dforigin[['date', 'location', 'new_cases', 'new_deaths', 'population']].copy().set_index('date').sort_index()
df.index = pd.to_datetime(df.index)
begin = df.index.max() - pd.Timedelta('15 days')
df = df.loc[begin:]
df['death-per-100k'] = df['new_deaths'] / df['population'] * 100000

In [43]:
df = df[df['location'].isin(['United States', 'Italy', 'Germany', 'Canada'])]

plt.figure(figsize= (8, 8))
plt.subplot(211)
sns.swarmplot('location', 'death-per-100k', data=df);
plt.xticks(rotation=20);
plt.subplot(212)
sns.violinplot('location', 'death-per-100k', data=df);
plt.xticks(rotation=20);

<IPython.core.display.Javascript object>

In [44]:
df = (dforigin.loc[dforigin['location'].isin(['United States', 'Germany']), ['location', 'date', 'new_deaths']]
      .copy().set_index('location'))
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')
v1 = df.loc['United States', 'new_deaths']
v2 = df.loc['Germany', 'new_deaths']

In [45]:
plt.clf()
sns.jointplot(v1, v2, alpha=0.4);

<IPython.core.display.Javascript object>

In [46]:
sns.jointplot(v1, v2, kind='hex');

<IPython.core.display.Javascript object>

In [47]:
# set the seaborn style for all the following plots
sns.set_style('white')

sns.jointplot(v1, v2, kind='kde', space=0);

<IPython.core.display.Javascript object>