In [1]:
import numpy as np
import pandas as pd

import sklearn.impute
import sklearn.model_selection
import sklearn.preprocessing

import acquire as ac
import prepare as pr

import warnings
warnings.filterwarnings("ignore")

---
### 1. Iris Data

In [2]:
# a. Use the function defined in acquire.py to load the iris data.
iris = ac.get_iris_data()
iris

Unnamed: 0,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_id,species_name
0,1,5.1,3.5,1.4,0.2,1,setosa
1,2,4.9,3.0,1.4,0.2,1,setosa
2,3,4.7,3.2,1.3,0.2,1,setosa
3,4,4.6,3.1,1.5,0.2,1,setosa
4,5,5.0,3.6,1.4,0.2,1,setosa
...,...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,3,virginica
146,147,6.3,2.5,5.0,1.9,3,virginica
147,148,6.5,3.0,5.2,2.0,3,virginica
148,149,6.2,3.4,5.4,2.3,3,virginica


In [3]:
# b. Drop the species_id and measurement_id columns.
# Rebind `iris` to the trimmed frame instead of mutating it in place.
iris = iris.drop(columns=["species_id", "measurement_id"])

In [4]:
# c. Rename the species_name column to just species.
# Rebind `iris` to the renamed frame instead of mutating it in place.
iris = iris.rename(columns={"species_name": "species"})

In [5]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
iris.species.value_counts()

versicolor    50
virginica     50
setosa        50
Name: species, dtype: int64

In [7]:
# d. Encode the species name using a sklearn label encoder.
# fit() returns the encoder itself, so construction and fitting can be chained;
# the three labels are the ones reported by iris.species.value_counts().
encoder = sklearn.preprocessing.LabelEncoder().fit(
    ["versicolor", "virginica", "setosa"]
)
encoder

LabelEncoder()

In [8]:
encoder.classes_

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [9]:
# LabelEncoder operates on a 1-D target, so pass the species Series itself.
# A one-column DataFrame (iris[["species"]]) only works because sklearn
# flattens it, and the warning it raises for doing so is hidden by the
# notebook-wide warnings filter.
encoder.transform(iris["species"])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [10]:
# d. Research the inverse_transform method of the label encoder.
# How might this be useful?

# inverse_transform maps the integer codes back to the original string labels.
# That may be useful when receiving work on a project from a colleague, in
# order to see what each encoded value actually stands for.
#
# Round-trip the real column rather than pasting the 150 codes by hand, so the
# cell stays in sync with the data it is demonstrating.
species_codes = encoder.transform(iris["species"])
encoder.inverse_transform(species_codes)

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolo

In [11]:
df = ac.get_iris_data()
df

Unnamed: 0,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_id,species_name
0,1,5.1,3.5,1.4,0.2,1,setosa
1,2,4.9,3.0,1.4,0.2,1,setosa
2,3,4.7,3.2,1.3,0.2,1,setosa
3,4,4.6,3.1,1.5,0.2,1,setosa
4,5,5.0,3.6,1.4,0.2,1,setosa
...,...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,3,virginica
146,147,6.3,2.5,5.0,1.9,3,virginica
147,148,6.5,3.0,5.2,2.0,3,virginica
148,149,6.2,3.4,5.4,2.3,3,virginica


In [12]:
# e. Create a function named prep_iris that accepts the
# untransformed iris data, and returns the data with the
# transformations above applied.

# def prep_iris(df):
#     df.drop(columns=["species_id", "measurement_id"], inplace=True)
#     df.rename(columns={"species_name": "species"}, inplace=True)
#     encoder = sklearn.preprocessing.LabelEncoder()
#     encoder.fit(["versicolor", "virginica", "setosa"])
#     df.species = encoder.transform(df[["species"]])
#     return df

In [13]:
# def drop_iris_columns(df):
#     return df.drop(columns=["species_id", "measurement_id"], inplace=True)


In [14]:
# drop_iris_columns(df) 

In [15]:
# df

In [16]:
# def rename_iris_columns(df):
#     return df.rename(columns={"species_name": "species"}, inplace=True)

In [17]:
# rename_iris_columns(df)

In [18]:
# df

In [19]:
# def encode_species(train, test):
#     encoder = sklearn.preprocessing.LabelEncoder()
#     encoder.fit(["versicolor", "virginica", "setosa"])
#     train.species = encoder.transform(train[["species"]])
#     test.species = encoder.transform(test[["species"]])
#     return train, test

In [20]:
# Try out prep_iris from prepare.py on the freshly reloaded, untransformed
# frame; it hands back two frames, unpacked here as train and test.
train, test = pr.prep_iris(df)

In [21]:
train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
112,6.8,3.0,5.5,2.1,2
101,5.8,2.7,5.1,1.9,2
72,6.3,2.5,4.9,1.5,1
40,5.0,3.5,1.3,0.3,0
38,4.4,3.0,1.3,0.2,0
...,...,...,...,...,...
87,6.3,2.3,4.4,1.3,1
142,5.8,2.7,5.1,1.9,2
122,7.7,2.8,6.7,2.0,2
143,6.8,3.2,5.9,2.3,2


In [22]:
test

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
137,6.4,3.1,5.5,1.8,2
98,5.1,2.5,3.0,1.1,1
77,6.7,3.0,5.0,1.7,1
141,6.9,3.1,5.1,2.3,2
120,6.9,3.2,5.7,2.3,2
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
35,5.0,3.2,1.2,0.2,0
12,4.8,3.0,1.4,0.1,0
127,6.1,3.0,4.9,1.8,2


---
### 2. Titanic Data

In [23]:
# a. Use the function you defined in acquire.py to load the
# titanic data set.
titanic = ac.get_titanic_data()
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [24]:
# b. Handle the missing values in the embark_town and embarked
# columns.
titanic.embark_town.value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [25]:
titanic.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [26]:
# Fill the missing towns with the most common one (Southampton, 644 rows).
titanic["embark_town"] = titanic["embark_town"].fillna("Southampton")

In [27]:
titanic.embark_town.value_counts()

Southampton    646
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [28]:
# Fill the missing port codes with the most common one ("S", 644 rows).
titanic["embarked"] = titanic["embarked"].fillna("S")

In [29]:
titanic.embarked.value_counts()

S    646
C    168
Q     77
Name: embarked, dtype: int64

In [30]:
# c. Remove the deck column.
# Rebind `titanic` to the trimmed frame instead of mutating it in place.
titanic = titanic.drop(columns="deck")
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,Cherbourg,1


In [31]:
# d. Use a label encoder to transform the embarked column.
titanic.embarked.value_counts()

S    646
C    168
Q     77
Name: embarked, dtype: int64

In [32]:
# fit() returns the encoder itself, so construction and fitting can be chained;
# the three codes are the ones reported by titanic.embarked.value_counts().
encoder = sklearn.preprocessing.LabelEncoder().fit(["S", "C", "Q"])
encoder

LabelEncoder()

In [33]:
encoder.classes_

array(['C', 'Q', 'S'], dtype='<U1')

In [34]:
# LabelEncoder operates on a 1-D target, so pass the embarked Series itself
# rather than a one-column DataFrame that sklearn has to flatten (with a
# warning that the notebook-wide warnings filter hides).
titanic["embarked"] = encoder.transform(titanic["embarked"])

In [35]:
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,2,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,0,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,2,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,2,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,2,Third,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,2,Second,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,2,First,Southampton,1
888,888,0,3,female,,1,2,23.4500,2,Third,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,0,First,Cherbourg,1


In [36]:
# f. Fill the missing values in age. The way you fill these
# values is up to you. Consider the tradeoffs of different
# methods.
titanic.age.isna().sum()

177

In [37]:
# Replace the missing ages with the column median, learning and applying the
# median in one fit_transform call.
# NOTE(review): the median is learned from the whole frame, before any
# train/test split — consider fitting on the training rows only.
imputer = sklearn.impute.SimpleImputer(strategy="median")
titanic["age"] = imputer.fit_transform(titanic[["age"]])
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,2,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,0,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,2,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,2,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,2,Third,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,2,Second,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,2,First,Southampton,1
888,888,0,3,female,28.0,1,2,23.4500,2,Third,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,0,First,Cherbourg,1


In [38]:
titanic.age.isna().sum()

0

In [39]:
# e. Scale the age and fare columns using a min max scaler.
# Why might this be beneficial?
# When might you not want to do this?

# Learn each column's min/max and rescale both columns in one step.
# NOTE(review): the min/max are learned from the whole frame, before any
# train/test split — consider fitting on the training rows only.
scaled_columns = ["age", "fare"]
scaler = sklearn.preprocessing.MinMaxScaler()
titanic[scaled_columns] = scaler.fit_transform(titanic[scaled_columns])
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,0.271174,1,0,0.014151,2,Third,Southampton,0
1,1,1,1,female,0.472229,1,0,0.139136,0,First,Cherbourg,0
2,2,1,3,female,0.321438,0,0,0.015469,2,Third,Southampton,1
3,3,1,1,female,0.434531,1,0,0.103644,2,First,Southampton,0
4,4,0,3,male,0.434531,0,0,0.015713,2,Third,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,0.334004,0,0,0.025374,2,Second,Southampton,1
887,887,1,1,female,0.233476,0,0,0.058556,2,First,Southampton,1
888,888,0,3,female,0.346569,1,2,0.045771,2,Third,Southampton,0
889,889,1,1,male,0.321438,0,0,0.058556,0,First,Cherbourg,1


In [40]:
# g. Create a function named prep_titanic that accepts the
# untransformed titanic data, and returns the data with the
# transformations above applied.
# def prep_titanic(df):
#     df.embark_town = df.embark_town.fillna("Southampton")
#     df.embarked = df.embarked.fillna("S")
    
#     df.drop(columns="deck", inplace=True)
    
#     encoder = sklearn.preprocessing.LabelEncoder()
#     encoder.fit(["S", "C", "Q"])
#     df.embarked = encoder.transform(df[["embarked"]])
    
#     imputer = sklearn.impute.SimpleImputer(strategy="median")
#     imputer.fit(df[["age"]])
#     df.age = imputer.transform(df[["age"]])
    
#     scaler = sklearn.preprocessing.MinMaxScaler()
#     scaler.fit(df[["age", "fare"]])
#     df[["age", "fare"]] = scaler.transform(df[["age", "fare"]])
    
#     return df
    
    

In [41]:
df = ac.get_titanic_data()
df

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [42]:
train, test = pr.prep_titanic(df)

In [43]:
train

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
226,226,1,2,male,0.233476,0,0,0.020495,2,Second,Southampton,1
278,278,0,3,male,0.082684,4,1,0.056848,1,Third,Queenstown,0
31,31,1,1,female,0.346569,1,0,0.285990,0,First,Cherbourg,0
449,449,1,1,male,0.648153,0,0,0.059532,2,First,Southampton,1
632,632,1,1,male,0.396833,0,0,0.059532,0,First,Cherbourg,1
...,...,...,...,...,...,...,...,...,...,...,...,...
418,418,0,2,male,0.371701,0,0,0.025374,2,Second,Southampton,1
192,192,1,3,female,0.233476,1,0,0.015330,2,Third,Southampton,0
399,399,1,2,female,0.346569,0,0,0.024691,2,Second,Southampton,1
484,484,1,1,male,0.308872,1,0,0.177775,0,First,Cherbourg,0


In [44]:
test

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
665,665,0,2,male,0.396833,2,0,0.143462,2,Second,Southampton,0
336,336,0,1,male,0.359135,1,0,0.129995,2,First,Southampton,0
524,524,0,3,male,0.346569,0,0,0.014110,0,Third,Cherbourg,1
635,635,1,2,female,0.346569,0,0,0.025374,2,Second,Southampton,1
452,452,0,1,male,0.371701,0,0,0.054164,0,First,Cherbourg,1
...,...,...,...,...,...,...,...,...,...,...,...,...
621,621,1,1,male,0.522493,1,0,0.102579,2,First,Southampton,0
240,240,0,3,female,0.346569,1,0,0.028213,0,Third,Cherbourg,0
27,27,0,1,male,0.233476,3,2,0.513342,2,First,Southampton,0
686,686,0,3,male,0.170646,4,1,0.077465,2,Third,Southampton,0
