In [55]:
import pandas as pd
import numpy as np

In [113]:
titanic_survival = pd.read_csv("data/titanic_survival.csv")
titanic_survival.shape

(1310, 14)

In [11]:
data_columns = titanic_survival.columns.tolist()
data_columns

['pclass',
 'survived',
 'name',
 'sex',
 'age',
 'sibsp',
 'parch',
 'ticket',
 'fare',
 'cabin',
 'embarked',
 'boat',
 'body',
 'home.dest']

In [12]:
titanic_survival.head(3)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


**Finding The Missing Data**  
In Python, the None keyword and type indicates no value.  
The Pandas library uses NaN to indicate the missing value.  

In general terms, both NaN and None can be called *null* values.

If we want to see which values are NaN, we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values, the same way that NumPy did when we compared arrays.  

We can use this resultant series to select only the rows that have null values.  

We'll use this structure to look at the the null values for the "age" column.  

In [21]:
age = titanic_survival["age"]
print(age.loc[10:20])

age_is_null = pd.isnull(age)
age_null_true = age[age_is_null]
age_null_count = len(age_null_true)
print(age_null_count)

10    47.0
11    18.0
12    24.0
13    26.0
14    80.0
15     NaN
16    24.0
17    50.0
18    32.0
19    36.0
20    37.0
Name: age, dtype: float64
264


**The problems with missing data**

In [51]:
# gives NaN because of missing data
mean_age = sum(titanic_survival["age"] / len(titanic_survival["age"]))
mean_age

nan

In [36]:
# correct this by filtering out NaNs with pd.isnull()
age_is_null = pd.isnull(titanic_survival["age"])
good_ages = titanic_survival["age"][age_is_null == False]
correct_mean_age = sum(good_ages) / len(good_ages)
correct_mean_age

29.8811345124283

In [37]:
# OR just select where pd.notnull()
age_not_null = pd.notnull(titanic_survival["age"])
correct_mean_age = sum(age[age_not_null]) / len(age[age_not_null])
correct_mean_age

29.8811345124283

In [41]:
# OR (best way: filters out NaNs)
correct_mean_age = titanic_survival["age"].mean()
correct_mean_age
correct_mean_fare = titanic_survival["fare"].mean()
correct_mean_fare

33.29547928134572

In [52]:
# summary statistics
passenger_classes = [1, 2, 3]
fares_by_class = {}
for c in passenger_classes:
    classRows = titanic_survival["pclass"] == c
    fares_by_class[c] = titanic_survival["fare"][classRows].mean()   
fares_by_class  

{1: 87.50899164086687, 2: 21.1791963898917, 3: 13.302888700564957}

In [61]:
# pivot tables (better way to group then calculate)
fares_by_class = titanic_survival.pivot_table(index="pclass", values="fare", aggfunc=np.mean)
# (default aggfunc is actually mean so this could be omitted)
passenger_age = titanic_survival.pivot_table(index="pclass", values="age")
passenger_age

pclass
1.0    39.159918
2.0    29.506705
3.0    24.816367
Name: age, dtype: float64

In [67]:
# more complex pivot tables
port_stats = titanic_survival.pivot_table(index="embarked",values=["fare","survived"],aggfunc=np.sum)
port_stats

Unnamed: 0_level_0,fare,survived
embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,16830.7922,150.0
Q,1526.3085,44.0
S,25033.3862,304.0


In [73]:
# drop rows with missing values using df.dropna()
drop_na_rows = titanic_survival.dropna(axis=0) #axis=0 or axis='index' , axis=1 or axis='columns'
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["age","sex"])
new_titanic_survival

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1.0,1.0,"Anderson, Mr. Harry",male,48.0000,0.0,0.0,19952,26.5500,E12,S,3,,"New York, NY"
6,1.0,1.0,"Andrews, Miss. Kornelia Theodosia",female,63.0000,1.0,0.0,13502,77.9583,D7,S,10,,"Hudson, NY"
7,1.0,0.0,"Andrews, Mr. Thomas Jr",male,39.0000,0.0,0.0,112050,0.0000,A36,S,,,"Belfast, NI"
8,1.0,1.0,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0000,2.0,0.0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1.0,0.0,"Artagaveytia, Mr. Ramon",male,71.0000,0.0,0.0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


In [86]:
# using iloc to access rows by position (when row labels are not numbered or in order)
new_titanic_survival = new_titanic_survival.sort_values(by="age",ascending=0)
first_five_rows = new_titanic_survival.iloc[0:5]
first_ten_rows = new_titanic_survival.iloc[0:10]
row_position_fifth = new_titanic_survival.iloc[4]
row_index_25 = new_titanic_survival.loc[25]

In [91]:
# using column indexes
first_row_first_column = new_titanic_survival.iloc[0,0]
all_rows_first_three_columns = new_titanic_survival.iloc[:,0:3]
row__index_83_age = new_titanic_survival.loc[83,"age"]
row_index_1000_pclass = new_titanic_survival.loc[766,"pclass"]
row_index_1100_age = new_titanic_survival.loc[1100,"age"]
row_index_25_survived = new_titanic_survival.loc[25,"survived"]
five_rows_three_cols = new_titanic_survival.iloc[:5,:3]
five_rows_three_cols

Unnamed: 0,pclass,survived,name
14,1.0,1.0,"Barkworth, Mr. Algernon Henry Wilson"
61,1.0,1.0,"Cavendish, Mrs. Tyrell William (Julia Florence..."
1235,3.0,0.0,"Svensson, Mr. Johan"
9,1.0,0.0,"Artagaveytia, Mr. Ramon"
135,1.0,0.0,"Goldschmidt, Mr. George B"


In [96]:
# reindexing rows using df.reset_index()
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
titanic_reindexed.head(2)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0.0,0.0,27042,30.0,A23,S,B,,"Hessle, Yorks"
1,1.0,1.0,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1.0,0.0,19877,78.85,C46,S,6,,"Little Onn Hall, Staffs"


In [98]:
# apply functions over a dataframe using df.apply()
# by default, .apply will iterate through each column and perform function on each

def hundredth_row(column):
    hundredth_item = column.iloc[99]
    return hundredth_item

hundredth_row = titanic_survival.apply(hundredth_row)


def count_null(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)
    
column_null_count = titanic_survival.apply(count_null)
column_null_count

pclass          1
survived        1
name            1
sex             1
age           264
sibsp           1
parch           1
ticket          1
fare            2
cabin        1015
embarked        3
boat          824
body         1189
home.dest     565
dtype: int64

In [105]:
# apply a function to a row
def is_minor(row):
    if row["age"] < 18:
        return True
    else:
        return False

minors = titanic_survival.apply(is_minor, axis=1)
import pandas as pd

def generate_age_label(row):
    age = row["age"]
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"

age_labels = titanic_survival.apply(generate_age_label, axis=1)
age_labels.head(3)

0    adult
1    minor
2    minor
dtype: object

In [112]:
# calculating survival percentage by age group
titanic_survival["age_labels"] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels",values="survived")
age_group_survival

age_labels
adult      0.387892
minor      0.525974
unknown    0.277567
Name: survived, dtype: float64