# Pandas (Python Data Analysis Library)

__pandas__ is a fast, powerful, flexible, and easy-to-use open-source data analysis and manipulation tool,
built on top of the Python programming language.

---

_"In the future, I think that programming languages are going to diminish in importance relative to data itself and common computational libraries"_

__Wes McKinney__ - Pandas Benevolent Dictator for Life

https://pandas.pydata.org/docs/reference/index.html

In [1]:
# Installation (https://pandas.pydata.org/docs/getting_started/install.html)

#!conda install pandas
# To pin an exact release, give a real pandas version number, e.g.:
#!conda install pandas=1.3.4

In [2]:
# Import the libraries under their conventional aliases (np for numpy, pd for pandas)

import numpy as np
import pandas as pd
# Print the installed pandas version so readers can compare it with their own environment
print(pd.__version__)

1.3.4


---

### Pandas Data Structures

https://pandas.pydata.org/docs/user_guide/dsintro.html#dsintro

In [4]:
# Series of 10 uniform random floats in [0, 1).
# The values come from a seeded Generator so that "Restart Kernel -> Run All"
# reproduces exactly the same numbers every time.
a = pd.Series(np.random.default_rng(seed=42).random(10))
print(a)

0    0.305294
1    0.412992
2    0.679693
3    0.133333
4    0.973997
5    0.810655
6    0.337172
7    0.730173
8    0.386460
9    0.377391
dtype: float64


In [5]:
# Label-based lookup of the element whose index label is 0
a.loc[0]

0.3052937230979882

In [3]:
# Series: one-dimensional labelled arrays built from different inputs

# From a list of Python ints (eight values, 1..8)
s1 = pd.Series(list(range(1, 9)))
print(type(s1), s1, sep='\n')

# From a NumPy range: the stop value is exclusive, so this holds seven values (1..7)
s2 = pd.Series(data=np.arange(1, 8))
print(type(s2), s2, sep='\n')

# From a list of strings
str_lst = ['football', 'rugby', 'beach volley', 'atletism']
s3 = pd.Series(data=str_lst)
print(type(s3), s3, sep='\n')

<class 'pandas.core.series.Series'>
0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64
<class 'pandas.core.series.Series'>
0    1
1    2
2    3
3    4
4    5
5    6
6    7
dtype: int32
<class 'pandas.core.series.Series'>
0        football
1           rugby
2    beach volley
3        atletism
dtype: object


In [15]:
# Series attributes and quick-look methods

# Index labels, element count, shape tuple and number of dimensions of s1
print(s1.index, s1.size, s1.shape, s1.ndim, sep='\n')

# Element dtype and the list of axes of the string Series s3
print(s3.dtypes, s3.axes, sep='\n')

# Memory footprint of s3 (memory_usage() includes the index by default)
print(s3.memory_usage(), 'bytes')

# Summary statistics, then the first five and the last five entries of s1
print(s1.describe(), s1.head(), s1.tail(), sep='\n')

# Notebook display of the whole Series
display(s1)

RangeIndex(start=0, stop=8, step=1)
8
(8,)
1
object
[RangeIndex(start=0, stop=4, step=1)]
160 bytes
count    8.00000
mean     4.50000
std      2.44949
min      1.00000
25%      2.75000
50%      4.50000
75%      6.25000
max      8.00000
dtype: float64
0    1
1    2
2    3
3    4
4    5
dtype: int64
3    4
4    5
5    6
6    7
7    8
dtype: int64


0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64

In [17]:
# Series with explicit string labels instead of the default 0..n-1 index.
# The values are standard-normal draws from a seeded Generator, so re-running
# the notebook reproduces the same numbers.
s = pd.Series(
    np.random.default_rng(seed=42).standard_normal(5),
    index=["a", "b", "c", "d", "e"],
)
s

a   -0.272740
b    0.966670
c    0.568923
d   -0.653188
e    1.465900
dtype: float64

__Pandas Data Types__

![Image](./img/etl_intro_pandas_01.png)

In [18]:
# DataFrames I: a flat input becomes a one-column table

# From a list of Python ints (eight rows)
df1 = pd.DataFrame(data=list(range(1, 9)))
print(type(df1))
display(df1)

# From a NumPy range: the stop value is exclusive, so seven rows (1..7)
df2 = pd.DataFrame(data=np.arange(1, 8))
print(type(df2))
display(df2)

# From a list of strings
str_lst = ['football', 'rugby', 'beach volley', 'atletism']
df3 = pd.DataFrame(data=str_lst)
print(type(df3))
display(df3)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5
5,6
6,7


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,0
0,football
1,rugby
2,beach volley
3,atletism


In [19]:
# DataFrames II: richer constructors

# From a dict: each key becomes a column name, each list that column's values
df_dict = {
    'col1': [1, 2],
    'col2': [3, 4],
    'col3': ['ironhack', 'Madrid'],
    'col4': [True, None],
}
df4 = pd.DataFrame(data=df_dict)
print(type(df4))
display(df4)

# From a 4x3 NumPy array, with explicit column names
df_array = np.arange(12).reshape(4, 3)
df5 = pd.DataFrame(data=df_array, columns=list('abc'))
print(type(df5))
display(df5)

# Cells may hold arbitrary Python objects: dicts, arrays, DataFrames, scalars
df6 = pd.DataFrame(
    {
        'Python Dictionaries': [df_dict, df_dict, df_dict],
        'Numpy Arrays': [df_array * 3, df_array + df_array, np.zeros((5, 5, 5))],
        'Pandas DataFrames': [df3.head(), df4.tail(), df5],
        'Python Variables': [1, True, 'I love Pandas'],
    }
)
print(type(df6))
display(df6)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,col1,col2,col3,col4
0,1,3,ironhack,True
1,2,4,Madrid,


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Python Dictionaries,Numpy Arrays,Pandas DataFrames,Python Variables
0,"{'col1': [1, 2], 'col2': [3, 4], 'col3': ['iro...","[[0, 3, 6], [9, 12, 15], [18, 21, 24], [27, 30...",0 0 football 1 rugb...,1
1,"{'col1': [1, 2], 'col2': [3, 4], 'col3': ['iro...","[[0, 2, 4], [6, 8, 10], [12, 14, 16], [18, 20,...",col1 col2 col3 col4 0 1 3 i...,True
2,"{'col1': [1, 2], 'col2': [3, 4], 'col3': ['iro...","[[[0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0...",a b c 0 0 1 2 1 3 4 5 2 6 ...,I love Pandas


In [27]:
# Rebuild the small example frame and show it in full
df_dict = {
    'col1': [1, 2],
    'col2': [3, 4],
    'col3': ['ironhack', 'Madrid'],
    'col4': [True, None],
}
df4 = pd.DataFrame(data=df_dict)
display(df4)

# Selecting with a list of column names returns a DataFrame holding just those columns
df4.loc[:, ["col1", "col2"]]

Unnamed: 0,col1,col2,col3,col4
0,1,3,ironhack,True
1,2,4,Madrid,


Unnamed: 0,col1,col2
0,1,3
1,2,4


In [29]:
# DataFrame attributes and quick-look methods

# Row index, total number of cells, (rows, columns) shape and number of dimensions
print(df6.index)
print(df6.size)
print(df6.shape)
print(df6.ndim)

# Per-column dtypes and the list of both axes (row index, column index)
print(df6.dtypes)
print(df6.axes)

# On a DataFrame memory_usage() returns a per-column Series, not a single number,
# so the breakdown is printed first and the 'bytes' unit is attached to the total only.
print(df6.memory_usage())
print(df6.memory_usage().sum(), 'bytes')

# Summary statistics, then the first and the last row
print(df6.describe())
display(df6.head(1))
display(df6.tail(1))

print(df6.columns)
# info() prints its report itself and returns None; wrapping it in print()
# would add a stray "None" line after the report.
df6.info()

RangeIndex(start=0, stop=3, step=1)
12
(3, 4)
2
Python Dictionaries    object
Numpy Arrays           object
Pandas DataFrames      object
Python Variables       object
dtype: object
[RangeIndex(start=0, stop=3, step=1), Index(['Python Dictionaries', 'Numpy Arrays', 'Pandas DataFrames',
       'Python Variables'],
      dtype='object')]
Index                  128
Python Dictionaries     24
Numpy Arrays            24
Pandas DataFrames       24
Python Variables        24
dtype: int64 bytes
                                      Python Dictionaries  \
count                                                   3   
unique                                                  1   
top     {'col1': [1, 2], 'col2': [3, 4], 'col3': ['iro...   
freq                                                    3   

                                             Numpy Arrays  \
count                                                   3   
unique                                                  3   
top     [[0, 3, 6],

Unnamed: 0,Python Dictionaries,Numpy Arrays,Pandas DataFrames,Python Variables
0,"{'col1': [1, 2], 'col2': [3, 4], 'col3': ['iro...","[[0, 3, 6], [9, 12, 15], [18, 21, 24], [27, 30...",0 0 football 1 rugb...,1


Unnamed: 0,Python Dictionaries,Numpy Arrays,Pandas DataFrames,Python Variables
2,"{'col1': [1, 2], 'col2': [3, 4], 'col3': ['iro...","[[[0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0...",a b c 0 0 1 2 1 3 4 5 2 6 ...,I love Pandas


Index(['Python Dictionaries', 'Numpy Arrays', 'Pandas DataFrames',
       'Python Variables'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Python Dictionaries  3 non-null      object
 1   Numpy Arrays         3 non-null      object
 2   Pandas DataFrames    3 non-null      object
 3   Python Variables     3 non-null      object
dtypes: object(4)
memory usage: 224.0+ bytes
None


---

In [34]:
# Pandas exploration methods => .loc[], .iloc[]

breeds = {'dogs': ['pug', 'malinois', 'border collie'],
          'cats': ['siamese', 'persian', 'european'],
          'horses': ['arabian', 'andalusian', 'mustang']}
# Row labels are supplied at construction time instead of being assigned afterwards
df7 = pd.DataFrame(breeds, index=['cute', 'strong', 'smart'])
display(df7)

# Printed explicitly: a bare expression in the middle of a cell is evaluated
# and then discarded, so nothing would be shown for it.
print(df7.index)

# .loc addresses a cell by row/column label, .iloc by integer position;
# both lines below read the same cell (row 'cute' / position 0, column 'dogs' / position 0).
print(df7.loc["cute", 'dogs'])
print(df7.iloc[0, 0])

Unnamed: 0,dogs,cats,horses
cute,pug,siamese,arabian
strong,malinois,persian,andalusian
smart,border collie,european,mustang


pug
pug


More about `.loc[]` and `.iloc[]`: https://towardsdatascience.com/how-to-use-loc-and-iloc-for-selecting-data-in-pandas-bd09cb4c3d79

---

In [35]:
# Pandas MORE exploration methods I

# Swap rows and columns (.T is the accessor for transpose())
df8 = df7.T
df8

Unnamed: 0,cute,strong,smart
dogs,pug,malinois,border collie
cats,siamese,persian,european
horses,arabian,andalusian,mustang


---

In [37]:
# Pandas MORE exploration methods II

# Load the dataset used by the following cells. The path is relative to the
# notebook, so ./datasets/mount_everest_deaths.csv must exist next to it.
# The 'No.' column is used as the row index.
df9 = pd.read_csv('./datasets/mount_everest_deaths.csv', index_col='No.')
print(df9.shape)
df9.head()

In [None]:
# Number of rows per distinct value of the 'Nationality' column
# NOTE(review): requires df9 to have been loaded from the CSV in an earlier cell — confirm before running
df9['Nationality'].value_counts()

In [None]:
# Distinct values found in the 'Cause of death' column
# NOTE(review): requires df9 to have been loaded from the CSV in an earlier cell — confirm before running
df9['Cause of death'].unique()

In [None]:
# Rows ordered by the 'Location' column; sort_values returns a new frame and leaves df9 itself unchanged
# NOTE(review): requires df9 to have been loaded from the CSV in an earlier cell — confirm before running
df9.sort_values(by='Location')

---

__To be continued...__