In [1]:
import numpy as np
import pandas as pd

import sklearn.impute
import sklearn.model_selection
import sklearn.preprocessing

import acquire as ac
import prepare as pr

import warnings
# NOTE(review): blanket filter -- with no category given, every warning
# raised by later cells is hidden, including any pandas/sklearn warnings
# about how columns are assigned or passed to the encoders. Consider
# narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings("ignore")

---
### 1. Iris Data

In [2]:
# a. Use the function defined in acquire.py to load the iris data.
# The bare `iris` on the last line renders the frame as the cell output.
# NOTE(review): iris.head() would give a preview without relying on pandas'
# row truncation.
iris = ac.get_iris_data()
iris

Unnamed: 0,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_id,species_name
0,1,5.1,3.5,1.4,0.2,1,setosa
1,2,4.9,3.0,1.4,0.2,1,setosa
2,3,4.7,3.2,1.3,0.2,1,setosa
3,4,4.6,3.1,1.5,0.2,1,setosa
4,5,5.0,3.6,1.4,0.2,1,setosa
...,...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,3,virginica
146,147,6.3,2.5,5.0,1.9,3,virginica
147,148,6.5,3.0,5.2,2.0,3,virginica
148,149,6.2,3.4,5.4,2.3,3,virginica


In [3]:
# b. Drop the species_id and measurement_id columns.
# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
iris = iris.drop(columns=["species_id", "measurement_id"], errors="ignore")

In [4]:
# c. Rename the species_name column to just species.
# rename() hands back a new frame; rebinding `iris` keeps the cell free of
# in-place mutation.
iris = iris.rename(columns={"species_name": "species"})

In [5]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
iris.species.value_counts()

versicolor    50
virginica     50
setosa        50
Name: species, dtype: int64

In [7]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so
# the split is reproducible.
train, test = sklearn.model_selection.train_test_split(
    iris, train_size=0.8, random_state=56
)
# Later cells assign to train.species / test.species. Take explicit copies so
# those writes land on independent frames rather than on objects pandas may
# still treat as derived from `iris` (the SettingWithCopy situation).
train, test = train.copy(), test.copy()

In [8]:
# d. Encode the species name using a sklearn label encoder.
# The encoder learns its label set from the training split only. fit()
# returns the encoder itself, which is what the cell displays.
encoder = sklearn.preprocessing.LabelEncoder()
encoder.fit(train.species)

LabelEncoder()

In [9]:
encoder.classes_

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [10]:
# Swap the string labels for their integer codes in both splits, using the
# encoder that was fitted on the training data.
train["species"] = encoder.transform(train["species"])
test["species"] = encoder.transform(test["species"])

In [11]:
train.species

112    2
101    2
72     1
40     0
38     0
      ..
87     1
142    2
122    2
143    2
85     1
Name: species, Length: 120, dtype: int64

In [12]:
test.species

137    2
98     1
77     1
141    2
120    2
145    2
146    2
35     0
12     0
127    2
71     1
4      0
0      0
18     0
83     1
6      0
7      0
9      0
103    2
81     1
42     0
108    2
57     1
68     1
30     0
104    2
105    2
63     1
73     1
53     1
Name: species, dtype: int64

In [13]:
# d. Research the inverse_transform method of the label encoder.
# How might this be useful?

# inverse_transform maps the integer codes back to the original string
# labels, e.g. to present encoded values (or model predictions) as readable
# species names. Here it restores the labels in both splits.
train["species"] = encoder.inverse_transform(train["species"])
test["species"] = encoder.inverse_transform(test["species"])

In [14]:
train.species

112     virginica
101     virginica
72     versicolor
40         setosa
38         setosa
          ...    
87     versicolor
142     virginica
122     virginica
143     virginica
85     versicolor
Name: species, Length: 120, dtype: object

In [15]:
test.species

137     virginica
98     versicolor
77     versicolor
141     virginica
120     virginica
145     virginica
146     virginica
35         setosa
12         setosa
127     virginica
71     versicolor
4          setosa
0          setosa
18         setosa
83     versicolor
6          setosa
7          setosa
9          setosa
103     virginica
81     versicolor
42         setosa
108     virginica
57     versicolor
68     versicolor
30         setosa
104     virginica
105     virginica
63     versicolor
73     versicolor
53     versicolor
Name: species, dtype: object

In [16]:
# Reload the raw iris data: `iris` was transformed by the cells above, and
# pr.prep_iris is exercised below on an untransformed frame.
df = ac.get_iris_data()
df

Unnamed: 0,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_id,species_name
0,1,5.1,3.5,1.4,0.2,1,setosa
1,2,4.9,3.0,1.4,0.2,1,setosa
2,3,4.7,3.2,1.3,0.2,1,setosa
3,4,4.6,3.1,1.5,0.2,1,setosa
4,5,5.0,3.6,1.4,0.2,1,setosa
...,...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,3,virginica
146,147,6.3,2.5,5.0,1.9,3,virginica
147,148,6.5,3.0,5.2,2.0,3,virginica
148,149,6.2,3.4,5.4,2.3,3,virginica


In [17]:
# e. Create a function named prep_iris that accepts the
# untransformed iris data, and returns the data with the
# transformations above applied.

# def prep_iris(df):
#     df.drop(columns=["species_id", "measurement_id"], inplace=True)
#     df.rename(columns={"species_name": "species"}, inplace=True)
#     encoder = sklearn.preprocessing.LabelEncoder()
#     encoder.fit(["versicolor", "virginica", "setosa"])
#     df.species = encoder.transform(df[["species"]])
#     return df

In [18]:
# def drop_iris_columns(df):
#     return df.drop(columns=["species_id", "measurement_id"], inplace=True)


In [19]:
# drop_iris_columns(df) 

In [20]:
# df

In [21]:
# def rename_iris_columns(df):
#     return df.rename(columns={"species_name": "species"}, inplace=True)

In [22]:
# rename_iris_columns(df)

In [23]:
# df

In [24]:
# def encode_species(train, test):
#     encoder = sklearn.preprocessing.LabelEncoder()
#     encoder.fit(["versicolor", "virginica", "setosa"])
#     train.species = encoder.transform(train[["species"]])
#     test.species = encoder.transform(test[["species"]])
#     return train, test

In [25]:
# testing
# pr.prep_iris is unpacked into a (train, test) pair here, whereas the
# commented-out draft above returned a single frame -- presumably prepare.py
# holds a newer version that also splits the data; verify against prepare.py.
train, test = pr.prep_iris(df)

In [26]:
train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
8,4.4,2.9,1.4,0.2,0
73,6.1,2.8,4.7,1.2,1
113,5.7,2.5,5.0,2.0,2
31,5.4,3.4,1.5,0.4,0
29,4.7,3.2,1.6,0.2,0
...,...,...,...,...,...
96,5.7,2.9,4.2,1.3,1
57,4.9,2.4,3.3,1.0,1
58,6.6,2.9,4.6,1.3,1
40,5.0,3.5,1.3,0.3,0


In [27]:
test

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
60,5.0,2.0,3.5,1.0,1
36,5.5,3.5,1.3,0.2,0
127,6.1,3.0,4.9,1.8,2
145,6.7,3.0,5.2,2.3,2
41,4.5,2.3,1.3,0.3,0
94,5.6,2.7,4.2,1.3,1
34,4.9,3.1,1.5,0.2,0
0,5.1,3.5,1.4,0.2,0
149,5.9,3.0,5.1,1.8,2
14,5.8,4.0,1.2,0.2,0


---
### 2. Titanic Data

In [28]:
# a. Use the function you defined in acquire.py to load the
# titanic data set.
# The bare `titanic` on the last line renders the frame as the cell output.
# NOTE(review): titanic.head() would give a preview without relying on
# pandas' row truncation.
titanic = ac.get_titanic_data()
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [29]:
# b. Handle the missing values in the embark_town and embarked
# columns.
# NOTE(review): value_counts() leaves NaN out by default, so the missing
# entries are not listed here; value_counts(dropna=False) would show them.
titanic.embark_town.value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [30]:
titanic.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [31]:
# Fill the gaps with the most frequent town according to the counts above.
titanic["embark_town"] = titanic["embark_town"].fillna("Southampton")

In [32]:
titanic.embark_town.value_counts()

Southampton    646
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [33]:
# Same treatment for the port code: fill with "S", the most frequent value.
titanic["embarked"] = titanic["embarked"].fillna("S")

In [34]:
titanic.embarked.value_counts()

S    646
C    168
Q     77
Name: embarked, dtype: int64

In [35]:
# c. Remove the deck column.
# Rebind rather than drop in place, and ignore an already-missing column, so
# re-running the cell does not raise KeyError.
titanic = titanic.drop(columns="deck", errors="ignore")
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,Cherbourg,1


In [36]:
# d. Use a label encoder to transform the embarked column.
titanic.embarked.value_counts()

S    646
C    168
Q     77
Name: embarked, dtype: int64

In [37]:
# Fit on the three known port codes. The fitted classes come out sorted, so
# the codes are C -> 0, Q -> 1, S -> 2 (see encoder.classes_ in the next cell).
encoder = sklearn.preprocessing.LabelEncoder()
encoder.fit(["S", "C", "Q"])

LabelEncoder()

In [38]:
encoder.classes_

array(['C', 'Q', 'S'], dtype='<U1')

In [39]:
# LabelEncoder works on a 1-D label vector. Pass the column as a Series
# rather than a one-column DataFrame, which sklearn only accepts by
# flattening it (emitting a DataConversionWarning).
titanic["embarked"] = encoder.transform(titanic["embarked"])

In [40]:
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,2,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,0,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,2,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,2,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,2,Third,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,2,Second,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,2,First,Southampton,1
888,888,0,3,female,,1,2,23.4500,2,Third,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,0,First,Cherbourg,1


In [41]:
# f. Fill the missing values in age. The way you fill these
# values is up to you. Consider the tradeoffs of different
# methods.
titanic.age.isna().sum()

177

In [42]:
# Replace the missing ages with the median of the observed ages; fitting and
# transforming happen in a single call on the same column.
imputer = sklearn.impute.SimpleImputer(strategy="median")
titanic["age"] = imputer.fit_transform(titanic[["age"]])
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,2,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,0,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,2,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,2,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,2,Third,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,2,Second,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,2,First,Southampton,1
888,888,0,3,female,28.0,1,2,23.4500,2,Third,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,0,First,Cherbourg,1


In [43]:
titanic.age.isna().sum()

0

In [44]:
# e. Scale the age and fare columns using a min max scaler.
# Why might this be beneficial?
# When might you not want to do this?
# Learn each column's min and max from the data, then rescale both columns
# with those statistics in one step.
scaler = sklearn.preprocessing.MinMaxScaler()
titanic[["age", "fare"]] = scaler.fit_transform(titanic[["age", "fare"]])
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,0.271174,1,0,0.014151,2,Third,Southampton,0
1,1,1,1,female,0.472229,1,0,0.139136,0,First,Cherbourg,0
2,2,1,3,female,0.321438,0,0,0.015469,2,Third,Southampton,1
3,3,1,1,female,0.434531,1,0,0.103644,2,First,Southampton,0
4,4,0,3,male,0.434531,0,0,0.015713,2,Third,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,0.334004,0,0,0.025374,2,Second,Southampton,1
887,887,1,1,female,0.233476,0,0,0.058556,2,First,Southampton,1
888,888,0,3,female,0.346569,1,2,0.045771,2,Third,Southampton,0
889,889,1,1,male,0.321438,0,0,0.058556,0,First,Cherbourg,1


In [45]:
# g. Create a function named prep_titanic that accepts the
# untransformed titanic data, and returns the data with the
# transformations above applied.
# def prep_titanic(df):
#     df.embark_town = df.embark_town.fillna("Southampton")
#     df.embarked = df.embarked.fillna("S")
    
#     df.drop(columns="deck", inplace=True)
    
#     encoder = sklearn.preprocessing.LabelEncoder()
#     encoder.fit(["S", "C", "Q"])
#     df.embarked = encoder.transform(df[["embarked"]])
    
#     imputer = sklearn.impute.SimpleImputer(strategy="median")
#     imputer.fit(df[["age"]])
#     df.age = imputer.transform(df[["age"]])
    
#     scaler = sklearn.preprocessing.MinMaxScaler()
#     scaler.fit(df[["age", "fare"]])
#     df[["age", "fare"]] = scaler.transform(df[["age", "fare"]])
    
#     return df
    
    

In [46]:
# Reload the raw titanic data: `titanic` was transformed by the cells above,
# and pr.prep_titanic is exercised below on an untransformed frame.
df = ac.get_titanic_data()
df

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [47]:
# NOTE(review): the frames displayed below still contain the `deck` column,
# so pr.prep_titanic apparently does not apply step c (dropping deck) --
# verify against prepare.py.
train, test = pr.prep_titanic(df)

In [48]:
train

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
212,212,0,3,male,0.271174,0,0,0.014151,2,Third,,Southampton,1
222,222,0,3,male,0.635587,0,0,0.015713,2,Third,,Southampton,1
775,775,0,3,male,0.220910,0,0,0.015127,2,Third,,Southampton,1
229,229,0,3,female,0.346569,3,1,0.049708,2,Third,,Southampton,0
751,751,1,3,male,0.070118,0,1,0.024350,2,Third,E,Southampton,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
561,561,0,3,male,0.497361,0,0,0.015412,2,Third,,Southampton,1
394,394,1,3,female,0.296306,0,2,0.032596,2,Third,G,Southampton,0
648,648,0,3,male,0.346569,0,0,0.014737,2,Third,,Southampton,1
525,525,0,3,male,0.503644,0,0,0.015127,1,Third,,Queenstown,1


In [49]:
test

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
784,784,0,3,male,0.308872,0,0,0.013761,2,Third,,Southampton,1
382,382,0,3,male,0.396833,0,0,0.015469,2,Third,,Southampton,1
263,263,0,1,male,0.497361,0,0,0.000000,2,First,B,Southampton,1
647,647,1,1,male,0.698417,0,0,0.069291,0,First,A,Cherbourg,1
238,238,0,2,male,0.233476,0,0,0.020495,2,Second,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,384,0,3,male,0.346569,0,0,0.015412,2,Third,,Southampton,1
94,94,0,3,male,0.736115,0,0,0.014151,2,Third,,Southampton,1
610,610,0,3,female,0.484795,1,5,0.061045,2,Third,,Southampton,0
560,560,0,3,male,0.346569,0,0,0.015127,1,Third,,Queenstown,1
