## Outline:
* [Numpy](#numpy)
* [Pandas](#pandas)
* [Statistics](#statistics)

In [1]:
import numpy as np
import pandas as pd
from scipy as sp

%precision 2

csvpath = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'

## Numpy <a class="anchor" id="numpy"></a>

In [2]:
ar1 = np.arange(1, 11, 2)
ar2 = np.array(list(range(1, 11, 2)))
print('np.arange(1, 11, 2):', ar1)
print('np.array(list(range(1, 11, 2))):', ar2)
print('ar1 == ar2:', ar1 == ar2)
print('np.array_equal(ar1, ar2):', np.array_equal(ar1, ar2))
print()
ar3 = np.arange(1, 11).reshape(2,5)
print('np.arange(1, 11).reshape(2,5):\n', ar3)
print()
print('multi-dimentional:')
ar4 = np.arange(1,9).reshape(2,2,2)
print(ar4)
print()
print('np.linspace:')
ar5 = np.linspace(0, 1, 11)
print(ar5)
print('\nnp.diag')
ar6 = np.diag([1,2,3])
print(ar6)
print('\nnp.repeat')
print(np.array([1,2,3]*3))
print(np.repeat([1,2,3], 3))
print()
print('np.vstack([ar1, ar1**2]):\n', np.vstack([ar1, ar1**2]))
print()
print('np.hstack([ar1, ar1**2]):\n', np.hstack([ar1, ar1**2]))

np.arange(1, 11, 2): [1 3 5 7 9]
np.array(list(range(1, 11, 2))): [1 3 5 7 9]
ar1 == ar2: [ True  True  True  True  True]
np.array_equal(ar1, ar2): True

np.arange(1, 11).reshape(2,5):
 [[ 1  2  3  4  5]
 [ 6  7  8  9 10]]

multi-dimentional:
[[[1 2]
  [3 4]]

 [[5 6]
  [7 8]]]

np.linspace:
[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]

np.diag
[[1 0 0]
 [0 2 0]
 [0 0 3]]

np.repeat
[1 2 3 1 2 3 1 2 3]
[1 1 1 2 2 2 3 3 3]

np.vstack([ar1, ar1**2]):
 [[ 1  3  5  7  9]
 [ 1  9 25 49 81]]

np.hstack([ar1, ar1**2]):
 [ 1  3  5  7  9  1  9 25 49 81]


In [3]:
ar = np.arange(10)
print(ar.reshape(2,5))
print()
print(ar)
print()
print(ar.resize(2,5))
print()
print(ar)

[[0 1 2 3 4]
 [5 6 7 8 9]]

[0 1 2 3 4 5 6 7 8 9]

None

[[0 1 2 3 4]
 [5 6 7 8 9]]


In [4]:
a = np.arange(1, 4)
print('a:', a)
print('np.dot(a,a):', np.dot(a, a))
print('a.dot(a):', a.dot(a))
print('a.dot(a.T):', a.dot(a.T))
print('*'*10)
ar = np.arange(1, 7).reshape(2,3)
print('ar:\n', ar)
print('ar.dot(ar.T):\n', ar.dot(ar.T))
print('np.dot(ar,ar.T):\n', np.dot(ar,ar.T))
print('np.inner(ar, ar):\n', np.inner(ar, ar))
print('np.outer(ar, ar):\n', np.outer(ar, ar))

a: [1 2 3]
np.dot(a,a): 14
a.dot(a): 14
a.dot(a.T): 14
**********
ar:
 [[1 2 3]
 [4 5 6]]
ar.dot(ar.T):
 [[14 32]
 [32 77]]
np.dot(ar,ar.T):
 [[14 32]
 [32 77]]
np.inner(ar, ar):
 [[14 32]
 [32 77]]
np.outer(ar, ar):
 [[ 1  2  3  4  5  6]
 [ 2  4  6  8 10 12]
 [ 3  6  9 12 15 18]
 [ 4  8 12 16 20 24]
 [ 5 10 15 20 25 30]
 [ 6 12 18 24 30 36]]


In [5]:
ar = np.arange(1, 28)
ar.resize(3,3,3)
arorigin = ar.copy()
ar[-2:, -2:, -2:] = 0
print(ar)
print(), print(), print()
print(arorigin)

[[[ 1  2  3]
  [ 4  5  6]
  [ 7  8  9]]

 [[10 11 12]
  [13  0  0]
  [16  0  0]]

 [[19 20 21]
  [22  0  0]
  [25  0  0]]]



[[[ 1  2  3]
  [ 4  5  6]
  [ 7  8  9]]

 [[10 11 12]
  [13 14 15]
  [16 17 18]]

 [[19 20 21]
  [22 23 24]
  [25 26 27]]]


In [6]:
randar = np.random.randint(1, 10, (300,400)) #not inclusive
print(np.sum(randar==1))
print(np.sum(randar==10))
randar

13386
0


array([[7, 2, 3, ..., 7, 9, 7],
       [9, 3, 4, ..., 9, 9, 8],
       [3, 6, 9, ..., 2, 1, 9],
       ...,
       [3, 9, 7, ..., 1, 8, 3],
       [5, 1, 5, ..., 1, 5, 8],
       [1, 5, 5, ..., 1, 3, 4]])

In [7]:
ar = np.arange(4).reshape(2,2)
for i,j in zip(ar, ar):
    print(i)
    print(j)
    print()
    
ar = np.arange(8).reshape((2,2,2))
for i,j in zip(ar, ar):
    print(i)
    print(j)
    print()

[0 1]
[0 1]

[2 3]
[2 3]

[[0 1]
 [2 3]]
[[0 1]
 [2 3]]

[[4 5]
 [6 7]]
[[4 5]
 [6 7]]



## Pandas <a class="anchor" id="pandas"></a>

### Series

In [8]:
ser1 = pd.Series([10,20,30], index= ['A', 'B', 'C'], name='Serie 1')
print(ser1)
print()
ser2 = pd.Series({'A':10, 'B':20, 'C':30}, name='Serie 2')
print(ser2)
print()
print(ser1 == ser2)
print()
print(pd.Series.equals(ser1, ser2))

A    10
B    20
C    30
Name: Serie 1, dtype: int64

A    10
B    20
C    30
Name: Serie 2, dtype: int64

A    True
B    True
C    True
dtype: bool

True


In [9]:
ser1 = pd.Series(data= {'A':10, 'B':20, 'C':None})
print(ser1)
print()
print('index: ', ser1.index)
print()
print('values: ', ser1.values)
print()
print(ser1.dtype)
print(len(ser1), ser1.shape)
print()
print(ser1[0], ser1['B'], ser1['C'])
print(ser1.iloc[0], ser1.iloc[1], ser1.iloc[2])
print(ser1.loc['A'], ser1.loc['B'], ser1.loc['C'])
print()
print(ser1.describe())

A    10.0
B    20.0
C     NaN
dtype: float64

index:  Index(['A', 'B', 'C'], dtype='object')

values:  [10. 20. nan]

float64
3 (3,)

10.0 20.0 nan
10.0 20.0 nan
10.0 20.0 nan

count     2.000000
mean     15.000000
std       7.071068
min      10.000000
25%      12.500000
50%      15.000000
75%      17.500000
max      20.000000
dtype: float64


In [10]:
s = pd.Series([10, 20, 30.001, None])
s1 = s.copy()
s2 = s.copy()
s3 = s1.append(s2)
print('s3:')
print(s3)
print()
print('s3.iloc[0]: ', s3.iloc[0])
print('s3.loc[0]:')
print(s3.loc[0])
print()
print('s3[0]')
print(s3[0])
print()
print('mathematical operations:')
print('s1 * s2')
print(s1 * s2)
print()
print('s1 / s2')
print(s1 / s2)
print()
print('s1 + s2')
print(s1 + s2)

s3:
0    10.000
1    20.000
2    30.001
3       NaN
0    10.000
1    20.000
2    30.001
3       NaN
dtype: float64

s3.iloc[0]:  10.0
s3.loc[0]:
0    10.0
0    10.0
dtype: float64

s3[0]
0    10.0
0    10.0
dtype: float64

mathematical operations:
s1 * s2
0    100.000000
1    400.000000
2    900.060001
3           NaN
dtype: float64

s1 / s2
0    1.0
1    1.0
2    1.0
3    NaN
dtype: float64

s1 + s2
0    20.000
1    40.000
2    60.002
3       NaN
dtype: float64


In [11]:
s = pd.Series(range(10, 13))
s += 1
print(s)
print()
s[3] = None
print(s)
print()
print('s[3] == None: ', s[3] == None)
print()
print('s[3] == np.nan: ', s[3] == np.nan)
print('s[3] is np.nan: ', s[3] is np.nan)
print('np.nan == np.nan: ', np.nan == np.nan)
print('np.isnan(np.nan): ', np.isnan(np.nan))
print('pd.isna(s[3]):', pd.isna(s[3]))
print('pd.isna(s):\n', pd.isna(s))

0    11
1    12
2    13
dtype: int64

0      11
1      12
2      13
3    None
dtype: object

s[3] == None:  True

s[3] == np.nan:  False
s[3] is np.nan:  False
np.nan == np.nan:  False
np.isnan(np.nan):  True
pd.isna(s[3]): True
pd.isna(s):
 0    False
1    False
2    False
3     True
dtype: bool


### DataFrames

In [12]:
ser1 = pd.Series({'A':1, 'B':2, 'C':3})
ser2 = pd.Series({'A':4, 'B':5, 'C':6, 'D':None})
ser3 = pd.Series({'A':7, 'B':8, 'C':9, 'E': 10.0})
pd.DataFrame([ser1, ser2, ser3], index=['Serie 1', 'Serie 2', 'Serie 3'])

Unnamed: 0,A,B,C,D,E
Serie 1,1.0,2.0,3.0,,
Serie 2,4.0,5.0,6.0,,
Serie 3,7.0,8.0,9.0,,10.0


In [13]:
dfalldata = pd.read_csv(csvpath)
dforigin = pd.read_csv(csvpath, usecols=['location', 'date', 'new_cases', 'new_deaths', 'new_tests', 
                                         'population', 'population_density', 'hospital_beds_per_100k'])
print(dforigin.head(2))
print()
print(dforigin.tail(2))
print()
print(dforigin.describe())

  location        date  new_cases  new_deaths  new_tests  population  \
0    Aruba  2020-03-13          2           0        NaN    106766.0   
1    Aruba  2020-03-20          2           0        NaN    106766.0   

   population_density  hospital_beds_per_100k  
0               584.8                     NaN  
1               584.8                     NaN  

            location        date  new_cases  new_deaths  new_tests  \
20126  International  2020-03-02          0           0        NaN   
20127  International  2020-03-10         -9           1        NaN   

       population  population_density  hospital_beds_per_100k  
20126         NaN                 NaN                     NaN  
20127         NaN                 NaN                     NaN  

           new_cases    new_deaths      new_tests    population  \
count   20128.000000  20128.000000    4699.000000  2.006400e+04   
mean      552.037758     34.798490   10382.310917  1.081156e+08   
std      4927.519806    332.29275

In [14]:
df = dforigin.copy()
print(len(df), df.shape)
print('df index: ', df.index)
print('df columns: ', df.columns)
print()
df.set_index('date', inplace=True)
df.sort_index(inplace=True)
print('df index: ', df.index)
print()
print(df.head(2))

20128 (20128, 8)
df index:  RangeIndex(start=0, stop=20128, step=1)
df columns:  Index(['location', 'date', 'new_cases', 'new_deaths', 'new_tests',
       'population', 'population_density', 'hospital_beds_per_100k'],
      dtype='object')

df index:  Index(['2019-12-31', '2019-12-31', '2019-12-31', '2019-12-31', '2019-12-31',
       '2019-12-31', '2019-12-31', '2019-12-31', '2019-12-31', '2019-12-31',
       ...
       '2020-05-27', '2020-05-27', '2020-05-27', '2020-05-27', '2020-05-27',
       '2020-05-27', '2020-05-27', '2020-05-27', '2020-05-27', '2020-05-27'],
      dtype='object', name='date', length=20128)

           location  new_cases  new_deaths  new_tests    population  \
date                                                                  
2019-12-31  Lebanon          0           0        NaN  6.825442e+06   
2019-12-31    World         27           0        NaN  7.794799e+09   

            population_density  hospital_beds_per_100k  
date                                

In [15]:
%%timeit -n 5
df.loc['2019-12-31']['new_tests'] = None

The slowest run took 5.07 times longer than the fastest. This could mean that an intermediate result is being cached.
1.16 ms ± 692 µs per loop (mean ± std. dev. of 7 runs, 5 loops each)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
%%timeit -n 5
df.loc['2019-12-31', 'new_tests'] = None

The slowest run took 4.25 times longer than the fastest. This could mean that an intermediate result is being cached.
970 µs ± 446 µs per loop (mean ± std. dev. of 7 runs, 5 loops each)


In [17]:
df = dforigin.copy().set_index(['location', 'date']).sort_index()
print(df.head(3))
print()
df.reset_index(inplace=True)
print(df.head(3))
print()

                        new_cases  new_deaths  new_tests  population  \
location    date                                                       
Afghanistan 2019-12-31          0           0        NaN  38928341.0   
            2020-01-01          0           0        NaN  38928341.0   
            2020-01-02          0           0        NaN  38928341.0   

                        population_density  hospital_beds_per_100k  
location    date                                                    
Afghanistan 2019-12-31              54.422                     0.5  
            2020-01-01              54.422                     0.5  
            2020-01-02              54.422                     0.5  

      location        date  new_cases  new_deaths  new_tests  population  \
0  Afghanistan  2019-12-31          0           0        NaN  38928341.0   
1  Afghanistan  2020-01-01          0           0        NaN  38928341.0   
2  Afghanistan  2020-01-02          0           0        NaN  389

In [18]:
df = dforigin.copy()
df.loc[100000] = None
print('df.loc[100000] = None\n', df.tail(3))
print()
print('df.isna().sum().sum(): ', df.isna().sum().sum())
print()
df.fillna('$$$', inplace=True)
df[100000] = None
print('df[100000] = None !new column is added with title 100000! \n', df.tail(3))
print()
print('df.isna().sum().sum(): ', df.isna().sum().sum())

df.loc[100000] = None
              location        date  new_cases  new_deaths  new_tests  \
20126   International  2020-03-02        0.0         0.0        NaN   
20127   International  2020-03-10       -9.0         1.0        NaN   
100000            NaN         NaN        NaN         NaN        NaN   

        population  population_density  hospital_beds_per_100k  
20126          NaN                 NaN                     NaN  
20127          NaN                 NaN                     NaN  
100000         NaN                 NaN                     NaN  

df.isna().sum().sum():  19682

df[100000] = None !new column is added with title 100000! 
              location        date new_cases new_deaths new_tests population  \
20126   International  2020-03-02         0          0       $$$        $$$   
20127   International  2020-03-10        -9          1       $$$        $$$   
100000            $$$         $$$       $$$        $$$       $$$        $$$   

       population_densi

#### Group by

In [19]:
df = dforigin.copy().set_index('date').sort_index()
group, frame = next(iter(df.groupby('location')))
print(group)
print('type(frame): ', type(frame))
print(frame.head(2))

Afghanistan
type(frame):  <class 'pandas.core.frame.DataFrame'>
               location  new_cases  new_deaths  new_tests  population  \
date                                                                    
2019-12-31  Afghanistan          0           0        NaN  38928341.0   
2020-01-01  Afghanistan          0           0        NaN  38928341.0   

            population_density  hospital_beds_per_100k  
date                                                    
2019-12-31              54.422                     0.5  
2020-01-01              54.422                     0.5  


In [20]:
df = dforigin.copy()
print(df.groupby('location').agg({'new_cases': [np.sum, np.max], 'new_deaths': np.average, 'new_tests': np.sum}).head(3))

            new_cases       new_deaths new_tests
                  sum  amax    average       sum
location                                        
Afghanistan     11831  1063   1.582734       0.0
Albania          1029    34   0.412500       0.0
Algeria          8697   199   4.284722       0.0


In [21]:
df = dforigin.copy().set_index(['location', 'date']).sort_index()
print(df.groupby(level=0)['new_cases', 'new_deaths', 'new_tests'].agg([np.sum,np.max]).head(2))
print()
print(df.groupby(level=1)['new_cases', 'new_deaths', 'new_tests'].agg({'sum': np.sum, 'max': np.max}).tail(2))

            new_cases       new_deaths      new_tests     
                  sum  amax        sum amax       sum amax
location                                                  
Afghanistan     11831  1063        220   32       0.0  NaN
Albania          1029    34         33    3       0.0  NaN

                 sum                            max                     
           new_cases new_deaths new_tests new_cases new_deaths new_tests
date                                                                    
2020-05-26    178200       6766       0.0     89100       3383       NaN
2020-05-27    190700       7870       0.0     95350       3935       NaN


in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


#### Pivot tables

In [22]:
df = dforigin.copy()
df.pivot_table(values= 'new_cases', index='date', columns= 'location', aggfunc=np.sum).sort_index()

location,Afghanistan,Albania,Algeria,Andorra,Angola,Anguilla,Antigua and Barbuda,Argentina,Armenia,Aruba,...,Uruguay,Uzbekistan,Vatican,Venezuela,Vietnam,Western Sahara,World,Yemen,Zambia,Zimbabwe
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-31,0.0,,0.0,,,,,,0.0,,...,,,,,0.0,,27.0,,,
2020-01-01,0.0,,0.0,,,,,,0.0,,...,,,,,0.0,,0.0,,,
2020-01-02,0.0,,0.0,,,,,,0.0,,...,,,,,0.0,,0.0,,,
2020-01-03,0.0,,0.0,,,,,,0.0,,...,,,,,0.0,,17.0,,,
2020-01-04,0.0,,0.0,,,,,,0.0,,...,,,,,0.0,,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-05-23,540.0,12.0,190.0,0.0,2.0,0.0,0.0,718.0,322.0,0.0,...,4.0,72.0,0.0,62.0,0.0,0.0,106559.0,12.0,54.0,5.0
2020-05-24,782.0,8.0,195.0,0.0,0.0,0.0,0.0,704.0,374.0,0.0,...,11.0,54.0,0.0,0.0,0.0,0.0,101131.0,7.0,0.0,0.0
2020-05-25,584.0,9.0,193.0,1.0,9.0,0.0,0.0,723.0,359.0,0.0,...,5.0,32.0,0.0,177.0,1.0,0.0,94202.0,10.0,0.0,0.0
2020-05-26,591.0,6.0,197.0,0.0,0.0,0.0,0.0,552.0,452.0,0.0,...,18.0,97.0,0.0,56.0,1.0,0.0,89100.0,15.0,0.0,0.0


#### Date and time

In [23]:
print(pd.Timestamp('9/1/2016 10:05AM'))
print(pd.Period('1/2016'))
print(pd.to_datetime('4.7.12', dayfirst=True))
print(pd.Timestamp('9/3/2016')-pd.Timestamp('9/1/2016'))
print(pd.Timestamp('9/2/2016 8:10AM') + pd.Timedelta('12D 3H'))

2016-09-01 10:05:00
2016-01
2012-07-04 00:00:00
2 days 00:00:00
2016-09-14 11:10:00


In [24]:
df = dforigin.copy().set_index('date').sort_index()
df.index = pd.to_datetime(df.index)
print(df.index.weekday_name)
print()
df['2020-05-01':'2020-05-03']

Index(['Tuesday', 'Tuesday', 'Tuesday', 'Tuesday', 'Tuesday', 'Tuesday',
       'Tuesday', 'Tuesday', 'Tuesday', 'Tuesday',
       ...
       'Wednesday', 'Wednesday', 'Wednesday', 'Wednesday', 'Wednesday',
       'Wednesday', 'Wednesday', 'Wednesday', 'Wednesday', 'Wednesday'],
      dtype='object', name='date', length=20128)



Unnamed: 0_level_0,location,new_cases,new_deaths,new_tests,population,population_density,hospital_beds_per_100k
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-05-01,British Virgin Islands,0,0,,30237.0,207.973,
2020-05-01,Cayman Islands,0,0,,65720.0,256.496,
2020-05-01,Maldives,191,1,443.0,540542.0,1454.433,
2020-05-01,South Korea,9,1,3188.0,51269183.0,527.967,12.27
2020-05-01,Uzbekistan,44,0,,33469199.0,76.134,4.00
...,...,...,...,...,...,...,...
2020-05-03,Brazil,4970,421,,212559409.0,25.040,2.20
2020-05-03,Rwanda,6,0,1047.0,12952209.0,494.869,
2020-05-03,Thailand,6,0,,69799978.0,135.132,2.10
2020-05-03,Guernsey,0,0,,67052.0,,


## Statistics <a class="anchor" id="statistics"></a>

In [25]:
print('binomial: ', np.random.binomial(1, 0.9, 10))
print('uniform : ', np.random.uniform(0, 5, 10))
print('normal  : ', np.random.normal(0, 1, 10))

binomial:  [1 1 1 1 1 1 1 1 1 0]
uniform :  [4.98 4.42 3.76 3.77 1.65 3.18 2.32 2.48 3.61 0.1 ]
normal  :  [ 1.12 -0.78 -2.66 -0.81  1.39  1.08  0.2  -1.25  1.05 -0.28]
