## Introduction to the Dataset

In [15]:
import pandas as pd
# Load the passenger records into a DataFrame.
titanic_survival = pd.read_csv('titanic_survival.csv')
# ⚠️ Two spellings of "missing" to keep apart:
#   None -- Python's "no value" object
#   NaN  -- "not a number", the floating-point marker pandas uses for missing
#           entries (see the `body` field of the first row displayed below)
# Show the first row (index label 0) as a Series to get a feel for the columns.
titanic_survival.loc[0]

pclass                                   1
survived                                 1
name         Allen, Miss. Elisabeth Walton
sex                                 female
age                                     29
sibsp                                    0
parch                                    0
ticket                               24160
fare                               211.338
cabin                                   B5
embarked                                 S
boat                                     2
body                                   NaN
home.dest                     St Louis, MO
Name: 0, dtype: object

## Finding the Missing Data with pd.isnull( )

In [16]:
# ⚠️ pd.isnull() takes a Series and returns a boolean Series of the same length
# (True where the value is missing), so the result can be used directly as a mask.
# Goal: count how many values in the "age" column are null.
age = titanic_survival["age"]
age_null_true = age.loc[pd.isnull(age)]
# The masked selection is itself a Series, so len() gives the number of null ages.
age_null_count = len(age_null_true)

# Series.mean() skips missing values by default, so the column can be averaged as-is.
correct_mean_age = age.mean()
correct_mean_fare = titanic_survival["fare"].mean()

## Drop Missing Values with df.dropna( )

In [17]:
# Drop every column of titanic_survival that contains at least one missing value.
drop_na_columns = titanic_survival.dropna(axis="columns")
# Drop every row in which "age" or "sex" is missing; other columns are not checked.
new_titanic_survival = titanic_survival.dropna(axis="index", subset=["age", "sex"])

# ⚠️ pd.isnull() only *flags* missing values (boolean mask);
#    df.dropna() *removes* the rows/columns that contain them and returns a new frame.
# ⚠️ Remember the signature: df.dropna(axis=..., subset=[...])
#    axis   -- "index"/0 drops rows, "columns"/1 drops columns
#    subset -- restrict the missing-value check to these labels

## Calculating Summary Statistics

In [18]:
# Build fares_by_class: a dict keyed by passenger class (1, 2, 3) whose values
# are the average fare paid in that class.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for pclass in passenger_classes:
    # ⚠️ Comparing a column to a scalar yields a boolean mask -- a powerful way
    # to select just the rows belonging to the current class.
    in_class = titanic_survival['pclass'] == pclass
    mean_fare = titanic_survival.loc[in_class, 'fare'].mean()
    fares_by_class[pclass] = mean_fare
fares_by_class

{1: 87.50899164086687, 2: 21.1791963898917, 3: 13.302888700564957}

## Better Way -- Pivot Table

In [19]:
# ⚠️ pivot_table first groups the rows by `index`, then applies `aggfunc` to the
# `values` column(s) of each group.
import numpy as np  # not needed by this cell any more; kept so cells relying on `np` still run
# The aggregation is passed by name: recent pandas versions emit a FutureWarning
# when a NumPy callable such as np.mean / np.sum is supplied, because pandas
# silently substitutes its own grouped implementation and that aliasing is deprecated.
# The string names select the same grouped mean / sum, so results are unchanged.
passenger_class_fares = titanic_survival.pivot_table(index="pclass", values="fare", aggfunc="mean")
# More complex pivot tables:
# total fares and total number of survivors for each embarkation port.
port_stats = titanic_survival.pivot_table(index="embarked", values=["fare", "survived"], aggfunc="sum")

## Using DataFrame.apply( ) to Iterate over Columns

In [31]:
# A helper that counts the number of null elements in a Series.
def null_counts(df):
    """Return how many elements of ``df`` are null.

    Despite the parameter name, the argument is expected to be a Series
    (e.g. one column handed over by ``DataFrame.apply``).

    Returns:
        int: the length of the selection of ``df`` restricted to its null entries.
    """
    is_missing = pd.isnull(df)
    missing_values = df[is_missing]
    return len(missing_values)
# ⚠️ DataFrame.apply() iterates over columns by default (axis=0): null_counts is
# called once per column, each column being evaluated on its own.
column_null_count = titanic_survival.apply(null_counts, axis=0)
column_null_count

pclass           1
survived         1
name             1
sex              1
age            264
sibsp            1
parch            1
ticket           1
fare             2
cabin         1015
embarked         3
boat           824
body          1189
home.dest      565
age_labels       0
dtype: int64

## Using DataFrame.apply( ) to Iterate over Rows

In [32]:
# ⚠️ Passing axis=1 to DataFrame.apply() iterates over rows instead: the function
# is evaluated separately for every row.
def theage(row):
    """Classify one passenger record by age.

    Args:
        row: a mapping-like record (e.g. a DataFrame row) with an "age" entry.

    Returns:
        str: "unknown" if the age is missing, "minor" if it is below 18,
        otherwise "adult".
    """
    passenger_age = row["age"]
    # Missing ages must be handled first: comparing NaN with 18 is always False,
    # which would otherwise label them "adult".
    if pd.isnull(passenger_age):
        return "unknown"
    return "minor" if passenger_age < 18 else "adult"

# Label every passenger by applying theage row by row.
age_labels = titanic_survival.apply(theage, axis="columns")
# Attach the labels as a new column; note that this modifies titanic_survival in place.
titanic_survival["age_labels"] = age_labels
print(age_labels)

0         adult
1         minor
2         minor
3         adult
4         adult
5         adult
6         adult
7         adult
8         adult
9         adult
10        adult
11        adult
12        adult
13        adult
14        adult
15      unknown
16        adult
17        adult
18        adult
19        adult
20        adult
21        adult
22        adult
23        adult
24        adult
25        adult
26        adult
27        adult
28        adult
29        adult
         ...   
1280      adult
1281      adult
1282    unknown
1283    unknown
1284    unknown
1285      adult
1286      adult
1287      adult
1288      adult
1289      adult
1290      adult
1291    unknown
1292    unknown
1293    unknown
1294      adult
1295      adult
1296      adult
1297    unknown
1298      adult
1299      adult
1300      minor
1301      adult
1302    unknown
1303    unknown
1304      minor
1305    unknown
1306      adult
1307      adult
1308      adult
1309    unknown
Length: 1310, dtype: object

## Calculating Survival Percentage by Age Group

In [30]:
# Mean survival chance ("survived") for each age group ("age_labels").
# NOTE: this relies on the "age_labels" column that is added to titanic_survival
# by the row-wise apply cell, so that cell has to be run first.
import numpy as np  # not needed by this cell any more; kept so cells relying on `np` still run
# The aggregation is named as a string because recent pandas versions emit a
# FutureWarning when a NumPy callable such as np.mean is passed as aggfunc;
# "mean" selects the same grouped mean, so the result is unchanged.
titanic_survival.pivot_table(index="age_labels", values="survived", aggfunc="mean")

Unnamed: 0_level_0,survived
age_labels,Unnamed: 1_level_1
adult,0.387892
minor,0.525974
unknown,0.277567
