# 1. Introduction

In [4]:
import pandas as pd
titanic_survival = pd.read_csv("titanic_survival.csv")

# 2. Finding the missing data

In [6]:
age = titanic_survival["age"]
print(age.loc[10:20])
age_is_null = pd.isnull(age)
age_null_true = age[age_is_null]
age_null_count = len(age_null_true)
print("number of passagers with unknown age: {0}".format(age_null_count))

10    47.0
11    18.0
12    24.0
13    26.0
14    80.0
15     NaN
16    24.0
17    50.0
18    32.0
19    36.0
20    37.0
Name: age, dtype: float64
number of passagers with unknown age: 263


In [7]:
titanic_survival.isnull().any().any()

True

In [8]:
titanic_survival.isnull().sum().sum()

3855

# 3. Whats the big deal with missing data

In [9]:
age_is_null = pd.isnull(titanic_survival["age"])
good_ages = titanic_survival["age"][age_is_null == False]
correct_mean_age = sum(good_ages) / len(good_ages)

In [16]:
age_is_null = titanic_survival["age"].isnull()

# 4. Easier ways to do math

In [17]:
correct_mean_age = titanic_survival["age"].mean()
correct_mean_fare = titanic_survival["fare"].mean()

# 5. Calculating summary statistics

In [18]:
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    pclass_rows = titanic_survival[titanic_survival["pclass"] == this_class]
    pclass_fares = pclass_rows["fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print(fares_by_class)

{1: 87.50899164086687, 2: 21.1791963898917, 3: 13.302888700564957}


# 6. Making pivot tables

In [21]:
passenger_survival = titanic_survival.pivot_table(index="pclass", values="survived", aggfunc="sum")
print(passenger_survival)
passenger_age = titanic_survival.pivot_table(index="pclass", values="age", aggfunc="mean")
print(passenger_age)

pclass
1    200
2    119
3    181
Name: survived, dtype: int64
pclass
1    39.159918
2    29.506705
3    24.816367
Name: age, dtype: float64


In [22]:
import numpy as np
port_stats = titanic_survival.pivot_table(index="embarked", values=["fare","survived"], aggfunc=[np.mean, np.sum])
print(port_stats)

               mean                   sum         
               fare  survived        fare survived
embarked                                          
C         62.336267  0.555556  16830.7922      150
Q         12.409012  0.357724   1526.3085       44
S         27.418824  0.332604  25033.3862      304


# 8. Drop missing values

In [23]:
drop_na_rows = titanic_survival.dropna(axis=0) #column
drop_na_columns = titanic_survival.dropna(axis=1) #row
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["age", "sex"])

# 9: Using Iloc To Access Rows By Position

In [24]:
# We have already sorted new_titanic_survival by age
first_five_rows = new_titanic_survival.iloc[0:5]
first_ten_rows = new_titanic_survival.iloc[0:10]
row_index_25 = new_titanic_survival.loc[25]
row_position_fifth = new_titanic_survival.iloc[4]

# 10. Using column indexes
We can also index columns using both the loc[] and iloc[] methods. With .loc[], we specify the column label strings as we have in the earlier exercises in this missions. With iloc[], we simply use the integer number of the column, starting from the left-most column which is 0. Similar to indexing with NumPy arrays, you separate the row and columns with a comma, and can use a colon to specify a range or as a wildcard.

In [25]:
first_row_first_column = new_titanic_survival.iloc[0,0]
all_rows_first_three_columns = new_titanic_survival.iloc[:,0:3]
row__index_83_age = new_titanic_survival.loc[83,"age"]
row_index_1000_pclass = new_titanic_survival.loc[766,"pclass"]
row_index_1100_age = new_titanic_survival.loc[1100, "age"]
row_index_25_survived = new_titanic_survival.loc[25, "survived"]
five_rows_three_cols = new_titanic_survival.iloc[0:5,0:3]

# 11. Reindexing rows

In [29]:
print(new_titanic_survival.iloc[0:5,0:3])
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.iloc[0:5,0:3])

   pclass  survived                                             name
0       1         1                    Allen, Miss. Elisabeth Walton
1       1         1                   Allison, Master. Hudson Trevor
2       1         0                     Allison, Miss. Helen Loraine
3       1         0             Allison, Mr. Hudson Joshua Creighton
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
   pclass  survived                                             name
0       1         1                    Allen, Miss. Elisabeth Walton
1       1         1                   Allison, Master. Hudson Trevor
2       1         0                     Allison, Miss. Helen Loraine
3       1         0             Allison, Mr. Hudson Joshua Creighton
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)


# 12. Applying functions over a DataFrame

In [32]:
def hundredth_row(column):
    hundredth_item = column.iloc[99]
    return hundredth_item

hundredth_row = titanic_survival.apply(hundredth_row)
print(hundredth_row)

def not_null_count(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)

column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)

pclass                                                       1
survived                                                     1
name         Duff Gordon, Lady. (Lucille Christiana Sutherl...
sex                                                     female
age                                                         48
sibsp                                                        1
parch                                                        0
ticket                                                   11755
fare                                                      39.6
cabin                                                      A16
embarked                                                     C
boat                                                         1
body                                                       NaN
home.dest                                       London / Paris
dtype: object
pclass          0
survived        0
name            0
sex             0
age           263
sibsp         

# 13. Applying a function to a row

In [37]:
def is_minor(row):
    if row["age"] < 18:
        return True
    else:
        return False

minors = titanic_survival.apply(is_minor, axis=1)

def generate_age_label(row):
    age = row["age"]
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"

age_labels = titanic_survival.apply(generate_age_label, axis=1)
titanic_survival["age_labels"] = age_labels

# 14. Calculating survival rate by age group

In [39]:
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="survived", aggfunc="sum")
print(age_group_survival)

age_labels
adult      346
minor       81
unknown     73
Name: survived, dtype: int64
