# Statistics Introduction Applied to Data Science
## Bonus : Two - Time Series
### Pandas operations - Review

In [1]:
# import numpy and pandas modules
import numpy as np
import pandas as pd

## Create Pandas Dataframe

In [2]:
# Toy dataset for the examples below: six patients, three measurements each.
# Each keyword becomes a dictionary key (later a column name) mapped to a
# list holding one value per patient.
data = dict(
    age=[23, 17, 5, 12, 41, 33],
    weight=[70.5, 69.2, 31.5, 49.6, 82.1, 70.8],
    height=[5.9, 6.1, 2.5, 4.6, 5.2, 5.5],
)
data

{'age': [23, 17, 5, 12, 41, 33],
 'weight': [70.5, 69.2, 31.5, 49.6, 82.1, 70.8],
 'height': [5.9, 6.1, 2.5, 4.6, 5.2, 5.5]}

In [3]:
# Convert the dictionary to a pandas DataFrame: each key becomes a column and
# the rows get a default integer index starting at 0.
patients = pd.DataFrame(data)
patients

Unnamed: 0,age,weight,height
0,23,70.5,5.9
1,17,69.2,6.1
2,5,31.5,2.5
3,12,49.6,4.6
4,41,82.1,5.2
5,33,70.8,5.5


In [4]:
# Structural summary of the DataFrame: index range, column names,
# non-null counts, dtypes and memory usage.
patients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     6 non-null      int64  
 1   weight  6 non-null      float64
 2   height  6 non-null      float64
dtypes: float64(2), int64(1)
memory usage: 272.0 bytes


In [5]:
# Look at the head of the DataFrame; the argument is the number of rows
# to return (here the first 2).
patients.head(2)

Unnamed: 0,age,weight,height
0,23,70.5,5.9
1,17,69.2,6.1


In [6]:
# Look at the tail of the DataFrame; called without an argument,
# tail() returns the last 5 rows.
patients.tail()

Unnamed: 0,age,weight,height
1,17,69.2,6.1
2,5,31.5,2.5
3,12,49.6,4.6
4,41,82.1,5.2
5,33,70.8,5.5


In [7]:
# Build the same DataFrame again, but label the rows with patient names
# instead of the default integer index.
# `data` is the dictionary of columns; `index` takes a list with one label
# per row, in row order.
idx_names = [
    'Steve Blank',
    'Richard Smith',
    'Kevin Maney',
    'Rhonda Byrne',
    'Wallace Wattles',
    'Joseph Sugarman',
]

patients_indexname = pd.DataFrame(data, index=idx_names)
patients_indexname

Unnamed: 0,age,weight,height
Steve Blank,23,70.5,5.9
Richard Smith,17,69.2,6.1
Kevin Maney,5,31.5,2.5
Rhonda Byrne,12,49.6,4.6
Wallace Wattles,41,82.1,5.2
Joseph Sugarman,33,70.8,5.5


## Working with Rows and Columns 

In [8]:
# Select a single row by its index label with .loc[] (loc = label-based).
# The row comes back as a Series with a single dtype, so the integer age
# is displayed as a float (5.0).
patients_indexname.loc['Kevin Maney']

age        5.0
weight    31.5
height     2.5
Name: Kevin Maney, dtype: float64

In [9]:
# Select a single value with .loc[row_label, column_label].
# Both arguments are labels, not positions.
patients_indexname.loc['Kevin Maney', 'weight']

31.5

In [10]:
# Slice rows by position with a plain Python slice inside [].
# The start is included and the stop is excluded, so 2:5 gives rows 2, 3 and 4.
patients_indexname[2:5]

Unnamed: 0,age,weight,height
Kevin Maney,5,31.5,2.5
Rhonda Byrne,12,49.6,4.6
Wallace Wattles,41,82.1,5.2


In [11]:
# Slice rows by index label (the row names, not their positions).
# Unlike a positional slice, a label slice includes the end label,
# so 'Joseph Sugarman' is part of the result.
patients_indexname['Rhonda Byrne':'Joseph Sugarman']

Unnamed: 0,age,weight,height
Rhonda Byrne,12,49.6,4.6
Wallace Wattles,41,82.1,5.2
Joseph Sugarman,33,70.8,5.5


In [12]:
# Select a column by integer position with .iloc[] (the i stands for integer position).
# ':' keeps all rows; 0 is the first column, so you get back the 'age' column.
# Positions are zero-based: 'age' is 0, 'weight' is 1, 'height' is 2.
patients_indexname.iloc[:, 0]

Steve Blank        23
Richard Smith      17
Kevin Maney         5
Rhonda Byrne       12
Wallace Wattles    41
Joseph Sugarman    33
Name: age, dtype: int64

In [13]:
# Select multiple rows and columns by position: .iloc[row_slice, column_slice].
# Stops are excluded, so this returns rows 2-4 and columns 0-1 ('age', 'weight').
patients_indexname.iloc[2:5, 0:2]

Unnamed: 0,age,weight
Kevin Maney,5,31.5
Rhonda Byrne,12,49.6
Wallace Wattles,41,82.1


In [14]:
# Select a single column with dot (attribute) notation; the result is a Series.
patients_indexname.height

Steve Blank        5.9
Richard Smith      6.1
Kevin Maney        2.5
Rhonda Byrne       4.6
Wallace Wattles    5.2
Joseph Sugarman    5.5
Name: height, dtype: float64

In [15]:
# Select the same column with bracket notation.
# This returns the same Series as the dot form, patients_indexname.height.
patients_indexname['height']

Steve Blank        5.9
Richard Smith      6.1
Kevin Maney        2.5
Rhonda Byrne       4.6
Wallace Wattles    5.2
Joseph Sugarman    5.5
Name: height, dtype: float64

In [16]:
# Select multiple columns by passing a list of names inside the brackets.
# The result is a DataFrame whose columns follow the order of the list.
patients_indexname[['weight', 'age']]

Unnamed: 0,weight,age
Steve Blank,70.5,23
Richard Smith,69.2,17
Kevin Maney,31.5,5
Rhonda Byrne,49.6,12
Wallace Wattles,82.1,41
Joseph Sugarman,70.8,33


In [17]:
# Get the column names of the DataFrame (returned as a pandas Index).
patients_indexname.columns

Index(['age', 'weight', 'height'], dtype='object')

In [18]:
# Sort the rows by the values of one column (ascending by default).
# The sorted frame is only displayed, not assigned, so patients_indexname
# keeps its original row order.
patients_indexname.sort_values(by='weight')

Unnamed: 0,age,weight,height
Kevin Maney,5,31.5,2.5
Rhonda Byrne,12,49.6,4.6
Richard Smith,17,69.2,6.1
Steve Blank,23,70.5,5.9
Joseph Sugarman,33,70.8,5.5
Wallace Wattles,41,82.1,5.2


In [19]:
# Summary statistics for each numeric column:
# count, mean, standard deviation, min, quartiles (25%, 50%, 75%) and max.
patients_indexname.describe()

Unnamed: 0,age,weight,height
count,6.0,6.0,6.0
mean,21.833333,62.283333,4.966667
std,13.392784,18.379708,1.320101
min,5.0,31.5,2.5
25%,13.25,54.5,4.75
50%,20.0,69.85,5.35
75%,30.5,70.725,5.8
max,41.0,82.1,6.1


In [20]:
# Transpose the data: rows become columns and columns become rows.
# Each patient is now a column holding both integer and float values,
# so every value is displayed as a float.
patients_indexname.T

Unnamed: 0,Steve Blank,Richard Smith,Kevin Maney,Rhonda Byrne,Wallace Wattles,Joseph Sugarman
age,23.0,17.0,5.0,12.0,41.0,33.0
weight,70.5,69.2,31.5,49.6,82.1,70.8
height,5.9,6.1,2.5,4.6,5.2,5.5


## Advanced Pandas Operations

In [21]:
# Display the name-indexed DataFrame again as the starting point for this section.
patients_indexname

Unnamed: 0,age,weight,height
Steve Blank,23,70.5,5.9
Richard Smith,17,69.2,6.1
Kevin Maney,5,31.5,2.5
Rhonda Byrne,12,49.6,4.6
Wallace Wattles,41,82.1,5.2
Joseph Sugarman,33,70.8,5.5


In [22]:
# Series.apply with a lambda: call the function on every value of a column.
# Here each of the three columns is tripled, one column at a time.
# NOTE: the columns are overwritten in place, so running this cell a second
# time triples the values again.
for column in ['age', 'weight', 'height']:
    patients_indexname[column] = patients_indexname[column].apply(lambda value: value * 3)
patients_indexname

Unnamed: 0,age,weight,height
Steve Blank,69,211.5,17.7
Richard Smith,51,207.6,18.3
Kevin Maney,15,94.5,7.5
Rhonda Byrne,36,148.8,13.8
Wallace Wattles,123,246.3,15.6
Joseph Sugarman,99,212.4,16.5


In [23]:
# Element-wise transform: apply a function to every single value of the DataFrame.
# Dividing by 3.0 brings the values back to their original scale; because the
# divisor is a float, the integer 'age' column comes back as float.
#
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old name
# now raises a FutureWarning, so use .map when this pandas version has it and
# fall back to .applymap on older versions. Both give the same result.
if hasattr(pd.DataFrame, 'map'):
    patients_indexname = patients_indexname.map(lambda x: x / 3.0)
else:
    patients_indexname = patients_indexname.applymap(lambda x: x / 3.0)
patients_indexname

Unnamed: 0,age,weight,height
Steve Blank,23.0,70.5,5.9
Richard Smith,17.0,69.2,6.1
Kevin Maney,5.0,31.5,2.5
Rhonda Byrne,12.0,49.6,4.6
Wallace Wattles,41.0,82.1,5.2
Joseph Sugarman,33.0,70.8,5.5


## Filters

In [24]:
# Boolean filtering with brackets: first build a True/False mask with one
# entry per row, then keep only the rows where the mask is True.
is_under_70 = patients_indexname['weight'] < 70
patients_filter = patients_indexname[is_under_70]
patients_filter

Unnamed: 0,age,weight,height
Richard Smith,17.0,69.2,6.1
Kevin Maney,5.0,31.5,2.5
Rhonda Byrne,12.0,49.6,4.6


In [25]:
# Make a copy of the integer-indexed DataFrame so that changes made to
# patients_filter2 leave `patients` untouched.
patients_filter2 = patients.copy()

In [26]:
# Filter with the query method: the condition is written as a string and
# column names can be used directly inside it.
# The result is assigned back to the name rather than using inplace=True,
# which keeps the expression chainable and makes the data flow explicit.
patients_filter2 = patients_filter2.query('age > 15')
patients_filter2

Unnamed: 0,age,weight,height
0,23,70.5,5.9
1,17,69.2,6.1
4,41,82.1,5.2
5,33,70.8,5.5


In [27]:
# Mean of a single column: select the 'age' Series and average its values.
age_mean = patients_indexname['age'].mean()
age_mean

21.833333333333332

In [28]:
# Column totals with apply() and axis=0.

# With axis=0, apply() passes each column to np.sum, so the sum runs down
# the rows and yields one total per column (age, weight, height).
df_sum_column = patients_indexname.apply(np.sum, axis=0)
df_sum_column

age       131.0
weight    373.7
height     29.8
dtype: float64

In [29]:
# With axis=1, apply() passes each row to np.sum, giving one total per patient.
# Adding age, weight and height together is not meaningful for this dataset;
# it is done only to illustrate the axis argument.
df_sum_row = patients_indexname.apply(np.sum, axis=1)
df_sum_row

Steve Blank         99.4
Richard Smith       92.3
Kevin Maney         39.0
Rhonda Byrne        66.2
Wallace Wattles    128.3
Joseph Sugarman    109.3
dtype: float64

## Group by

In [30]:
# Values for a fourth column: one condition label per patient, in row order.
new_col = ['sick', 'healthy', 'healthy', 'healthy', 'sick', 'sick']

# Assign the list as a new column named 'condition'.
patients_indexname['condition'] = new_col
patients_indexname

Unnamed: 0,age,weight,height,condition
Steve Blank,23.0,70.5,5.9,sick
Richard Smith,17.0,69.2,6.1,healthy
Kevin Maney,5.0,31.5,2.5,healthy
Rhonda Byrne,12.0,49.6,4.6,healthy
Wallace Wattles,41.0,82.1,5.2,sick
Joseph Sugarman,33.0,70.8,5.5,sick


In [31]:
# groupby + aggregate: split the rows into groups by 'condition', then take
# the mean of every remaining column within each group.
# The result has one row per condition (healthy, sick).
patients_indexname.groupby('condition').mean()

Unnamed: 0_level_0,age,weight,height
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
healthy,11.333333,50.1,4.4
sick,32.333333,74.466667,5.533333


In [32]:
## End