In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's 'whitegrid' style for every plot drawn in this notebook.
sns.set_style('whitegrid')

# IPython magics: draw matplotlib figures inline in the notebook and
# request the high-resolution 'retina' format for the inline backend.
%matplotlib inline
%config InlineBackend.figure_format ='retina'

## Merging/Joining

#### Often you'll be in situations where you have two tables that share a common column and would ideally be combined into one table.

In [11]:
# Read just the passenger identifier and the survival outcome from the cleaned file.
path_to_file = './datasets/titanic_clean.csv'
survived = pd.read_csv(
    path_to_file,
    usecols=['PassengerId', 'Survived'],
)
# Peek at the first row to confirm which columns were loaded.
survived.head(1)

Unnamed: 0,PassengerId,Survived
0,1,0


In [13]:
# Read every passenger attribute except the survival outcome.
passengers = pd.read_csv(
    path_to_file,
    usecols=[
        'PassengerId', 'Pclass', 'Name', 'Sex', 'Age',
        'SibSp', 'Parch', 'Fare', 'Embarked',
    ],
)
# Peek at the first row to confirm which columns were loaded.
passengers.head(1)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S


![](./datasets/joins.png)

**left joins preserve each row in the left table and bring over values from the right table where the values match**

In [14]:
# sample from survived so that not all passengers are in table
# A fixed random_state makes the draw repeatable: the same five passengers are
# selected every time the notebook is re-run from a fresh kernel, so the join
# results shown further down stay stable between runs.
survived_sample = survived.sample(n=5, random_state=42)
survived_sample

Unnamed: 0,PassengerId,Survived
275,344,0
410,517,1
358,448,1
556,703,0
29,39,0


**left join `survived_sample` onto `passengers` using the `PassengerId` column**

In [15]:
# Dimensions of the left table before merging.
print(passengers.shape)
# Keep every passenger row; Survived is brought over only where PassengerId
# also appears in survived_sample.
example_join = pd.merge(passengers, survived_sample, how='left', on='PassengerId')
# equivalent method-call syntax:
# passengers.merge(survived_sample, on='PassengerId', how='left')
# Dimensions after merging.
print(example_join.shape)

(712, 9)
(712, 10)


In [16]:
# True for the rows whose Survived value is populated after the left join.
mask = example_join['Survived'].notna()
# The rows that are not flagged are the ones left without a Survived value.
print('rows in example_join df with null Surived values: ', (~mask).sum())
# Display only the rows that have a survival value.
example_join.loc[mask]

rows in example_join df with null Surived values:  707


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
29,39,3,"Vander Planke, Miss. Augusta Maria",female,18.0,2,0,18.0,S,0.0
275,344,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25.0,0,0,13.0,S,0.0
358,448,1,"Seward, Mr. Frederic Kimber",male,34.0,0,0,26.55,S,1.0
410,517,2,"Lemore, Mrs. (Amelia Milley)",female,34.0,0,0,10.5,S,1.0
556,703,3,"Barbara, Miss. Saiide",female,18.0,0,1,14.4542,C,0.0


**inner joins preserve only rows in the left table where the values match in the right table. In the example below, the passengers dataframe changes from 712 rows to 5 rows when inner joined with the survived sample. This can result in unintended information loss but can also be a way for you to filter for observations with complete attributes (e.g. maybe everyone without a survival outcome is irrelevant for training purposes)**

In [17]:
# Inner join: only passengers whose PassengerId also appears in survived_sample are kept.
pd.merge(passengers, survived_sample, how='inner', on='PassengerId')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,39,3,"Vander Planke, Miss. Augusta Maria",female,18.0,2,0,18.0,S,0
1,344,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25.0,0,0,13.0,S,0
2,448,1,"Seward, Mr. Frederic Kimber",male,34.0,0,0,26.55,S,1
3,517,2,"Lemore, Mrs. (Amelia Milley)",female,34.0,0,0,10.5,S,1
4,703,3,"Barbara, Miss. Saiide",female,18.0,0,1,14.4542,C,0


In [18]:
# Dimensions of the passenger table before merging.
print(passengers.shape)
# Left join against the full survived table (not the five-row sample) on PassengerId.
titanic = pd.merge(passengers, survived, how='left', on='PassengerId')
# Dimensions after merging.
print(titanic.shape)
# Show the first merged row.
titanic.head(1)

(712, 9)
(712, 10)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0


## Concatenating

**Suppose our third class passengers are in a separate table but we want to make one table.**

In [19]:
# Split titanic into two independent frames: third-class passengers and everyone else.
# .copy() detaches each piece from the original frame.
third_class = titanic.loc[titanic['Pclass'] == 3].copy()
first_second = titanic.loc[titanic['Pclass'] != 3].copy()

In [20]:
# Show the first two rows of the third-class table.
third_class.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1


In [14]:
# Show the first two rows of the first/second-class table.
first_second.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S


**The two tables have the same columns. What is your intuition about how to combine them into one table?**

**We'll sometimes receive two datasets with common index values: one table with the target variable and another with our feature set. In those cases we want to combine the tables horizontally (side by side, column-wise).**

In [8]:
# Feature table: every column of titanic except the Survived target.
X = titanic.drop(columns='Survived').copy()
# Target table: the Survived column alone, kept as a one-column DataFrame.
y = titanic.loc[:, ['Survived']].copy()

NameError: name 'titanic' is not defined

In [20]:
# Print the dimensions of the feature table, then show its first row.
print(X.shape)
X.head(1)

(712, 9)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S


In [21]:
# Print the dimensions of the target table, then show its first row.
print(y.shape)
y.head(1)

(712, 1)


Unnamed: 0,Survived
0,0


**check the documentation for concat at https://pandas.pydata.org/docs/reference/api/pandas.concat.html and write your solution to concatenate X and y below**

In [3]:
# Place X and y side by side, aligning rows on their shared index.
pd.concat([X, y], axis='columns')

NameError: name 'pd' is not defined

**what differentiates joins from concatenating dataframes?**

## Grouping - class exercise

**Intuitively we can appreciate that the rate of something can be predictive. Write python code (or pseudo code) that takes the titanic dataframe as an input and outputs the survival rate of each passenger class, replicating the output below.**

In [26]:
# clue
# Hand-rolled tally: for each passenger class, count the passengers and the
# survivors, then print the counts followed by the survival rate per class.
class1total = 0
class2total = 0
class3total = 0
class1survived = 0
class2survived = 0
class3survived = 0

# itertuples walks the rows themselves, so the loop does not depend on the
# index labels happening to equal the row positions (as .iloc[label] did).
for row in titanic[['Pclass', 'Survived']].itertuples(index=False):
    # A passenger belongs to exactly one class, hence the elif chain.
    if row.Pclass == 1:
        class1total += 1
        if row.Survived == 1:
            class1survived += 1
    elif row.Pclass == 2:
        class2total += 1
        if row.Survived == 1:
            class2survived += 1
    elif row.Pclass == 3:
        class3total += 1
        if row.Survived == 1:
            class3survived += 1

# Totals for classes 1-3, then survivors for classes 1-3.
print(class1total,
      class2total,
      class3total,
      class1survived,
      class2survived,
      class3survived)

# Survival rate = survivors / passengers, one line per class (1, 2, 3).
print(class1survived/class1total)
print(class2survived/class2total)
print(class3survived/class3total)

184 173 355 120 83 85
0.6521739130434783
0.4797687861271676
0.23943661971830985


**We all saw the movie Titanic and the women got on the boats first. See if there are different survival rates within each fare class for each gender (e.g. the survival rate of women in first class)**

In [None]:
# Hand-rolled tally: for every (passenger class, sex) combination, count the
# passengers and the survivors, then print the counts and the survival rates.
class1_female_total = 0
class1_male_total = 0
class1_female_survived_total = 0
class1_male_survived_total = 0
class2_female_total = 0
class2_male_total = 0
class2_female_survived_total = 0
class2_male_survived_total = 0
class3_female_total = 0
class3_male_total = 0
class3_female_survived_total = 0
class3_male_survived_total = 0

# itertuples walks the rows themselves, so the loop does not depend on the
# index labels happening to equal the row positions.
for row in titanic[['Pclass', 'Sex', 'Survived']].itertuples(index=False):
    # Use `and`, not `&`: `&` binds tighter than `==`, so
    # `row.Pclass == 1 & row.Sex == "male"` evaluates `1 & row.Sex` first
    # and fails on the int/str operands.
    if row.Pclass == 1 and row.Sex == "male":
        class1_male_total += 1
        if row.Survived == 1:
            class1_male_survived_total += 1
    elif row.Pclass == 1 and row.Sex == "female":
        class1_female_total += 1
        if row.Survived == 1:
            class1_female_survived_total += 1
    elif row.Pclass == 2 and row.Sex == "male":
        class2_male_total += 1
        if row.Survived == 1:
            class2_male_survived_total += 1
    elif row.Pclass == 2 and row.Sex == "female":
        class2_female_total += 1
        if row.Survived == 1:
            class2_female_survived_total += 1
    elif row.Pclass == 3 and row.Sex == "male":
        class3_male_total += 1
        if row.Survived == 1:
            class3_male_survived_total += 1
    elif row.Pclass == 3 and row.Sex == "female":
        class3_female_total += 1
        if row.Survived == 1:
            class3_female_survived_total += 1

# Raw counts, class by class: female total, male total, female survivors, male survivors.
print(class1_female_total,
      class1_male_total,
      class1_female_survived_total,
      class1_male_survived_total,
      class2_female_total,
      class2_male_total,
      class2_female_survived_total,
      class2_male_survived_total,
      class3_female_total,
      class3_male_total,
      class3_female_survived_total,
      class3_male_survived_total)

# Survival rate = survivors / passengers for each (class, sex) group.
print('class 1 female:', class1_female_survived_total/class1_female_total)
print('class 1 male:', class1_male_survived_total/class1_male_total)
print('class 2 female:', class2_female_survived_total/class2_female_total)
print('class 2 male:', class2_male_survived_total/class2_male_total)
print('class 3 female:', class3_female_survived_total/class3_female_total)
print('class 3 male:', class3_male_survived_total/class3_male_total)

**If you got an answer, or if you didn't and skipped down here, what can you foresee as the problems with writing your own functions to approach these problems?**