In [1]:
import pandas as pd
import numpy as np
import env
from util import get_db_url
import acquire 

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

#### Titanic Data

- Use the function you defined in acquire.py to load the titanic data set.
- Handle the missing values in the embark_town and embarked columns.
- Remove the deck column.
- Use a label encoder to transform the embarked column.
- Scale the age and fare columns using a min max scaler. Why might this be beneficial? When might you not want to do this?
- Create a function named prep_titanic that accepts the untransformed titanic data, and returns the data with the transformations above applied.

In [2]:
df_titanic = acquire.get_titanic_data()

In [3]:
df_titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [4]:
df_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
passenger_id    891 non-null int64
survived        891 non-null int64
pclass          891 non-null int64
sex             891 non-null object
age             714 non-null float64
sibsp           891 non-null int64
parch           891 non-null int64
fare            891 non-null float64
embarked        889 non-null object
class           891 non-null object
deck            203 non-null object
embark_town     889 non-null object
alone           891 non-null int64
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [5]:
df_titanic.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
passenger_id,891.0,445.0,257.353842,0.0,222.5,445.0,667.5,890.0
survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
sibsp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292
alone,891.0,0.602694,0.489615,0.0,0.0,1.0,1.0,1.0


In [6]:
df_titanic.shape

(891, 13)

In [7]:
df_titanic.isnull().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          2
class             0
deck            688
embark_town       2
alone             0
dtype: int64

In [8]:
df_titanic.embark_town.value_counts(dropna=False)

Southampton    644
Cherbourg      168
Queenstown      77
NaN              2
Name: embark_town, dtype: int64

In [9]:
# Fill the 2 missing embark_town values with the placeholder "Other".
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the attribute-selected Series: that chained in-place
# call acts on a Series selected from the frame, which under pandas
# Copy-on-Write never updates df_titanic (pandas 2.x emits a FutureWarning).
df_titanic['embark_town'] = df_titanic['embark_town'].fillna("Other")
df_titanic.embark_town.value_counts(dropna=False)

Southampton    644
Cherbourg      168
Queenstown      77
Other            2
Name: embark_town, dtype: int64

In [10]:
# Remove the deck column (688 of its 891 values are missing).
# Rebinding the result instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution no longer
# raises KeyError because 'deck' has already been removed.
df_titanic = df_titanic.drop(columns=['deck'], errors='ignore')
df_titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1


Use a label encoder to transform the embarked column.

In [11]:
df_titanic.embarked.head()

0    S
1    C
2    S
3    S
4    S
Name: embarked, dtype: object

In [12]:
df_titanic.embarked.value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: embarked, dtype: int64

In [13]:
# Fill the 2 missing embarked codes with the placeholder 'unknown' so the label
# encoder below receives no NaN values.
# The result is assigned back to the frame rather than calling
# fillna(inplace=True) on the attribute-selected Series, which under pandas
# Copy-on-Write would leave df_titanic unchanged (FutureWarning in pandas 2.x).
df_titanic['embarked'] = df_titanic['embarked'].fillna('unknown')
df_titanic.embarked.value_counts(dropna=False)

S          644
C          168
Q           77
unknown      2
Name: embarked, dtype: int64

In [14]:
# Swap the embarked labels for integer codes. The fitted encoder stays bound to
# `encoder` so the label <-> code mapping remains available afterwards.
encoder = LabelEncoder()
df_titanic['embarked'] = encoder.fit_transform(df_titanic['embarked'])
df_titanic['embarked'].head()

0    2
1    0
2    2
3    2
4    2
Name: embarked, dtype: int64

In [15]:
df_titanic.isnull().sum()

passenger_id      0
survived          0
pclass            0
sex               0
age             177
sibsp             0
parch             0
fare              0
embarked          0
class             0
embark_town       0
alone             0
dtype: int64

In [16]:
# NOTE(review): this replaces missing values with np.nan, which appears to leave
# the data unchanged -- the 177 missing ages reported by isnull().sum() would
# still be NaN afterwards. Confirm whether an imputation (e.g. with the imported
# SimpleImputer) was intended here.
df_titanic.fillna(np.nan, inplace=True)

In [17]:
# train, test = train_test_split(df_titanic, train_size=.8, random_state=123)
# train.head()

In [18]:
# Min-max scale the age column in place of its raw values.
# NOTE(review): the scaler is fit on the whole frame (the train/test split is
# currently commented out); if a split is introduced, fit on the training
# portion only.
scaler = MinMaxScaler().fit(df_titanic[['age']])
df_titanic['age'] = scaler.transform(df_titanic[['age']])

In [20]:
df_titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,0.271174,1,0,7.25,2,Third,Southampton,0
1,1,1,1,female,0.472229,1,0,71.2833,0,First,Cherbourg,0
2,2,1,3,female,0.321438,0,0,7.925,2,Third,Southampton,1
3,3,1,1,female,0.434531,1,0,53.1,2,First,Southampton,0
4,4,0,3,male,0.434531,0,0,8.05,2,Third,Southampton,1


In [21]:
# Min-max scale the fare column in place of its raw values. `scaler` is rebound
# to a new MinMaxScaler fitted on fare.
# NOTE(review): the scaler is fit on the whole frame (the train/test split is
# currently commented out); if a split is introduced, fit on the training
# portion only.
scaler = MinMaxScaler().fit(df_titanic[['fare']])
df_titanic['fare'] = scaler.transform(df_titanic[['fare']])

In [22]:
df_titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,0.271174,1,0,0.014151,2,Third,Southampton,0
1,1,1,1,female,0.472229,1,0,0.139136,0,First,Cherbourg,0
2,2,1,3,female,0.321438,0,0,0.015469,2,Third,Southampton,1
3,3,1,1,female,0.434531,1,0,0.103644,2,First,Southampton,0
4,4,0,3,male,0.434531,0,0,0.015713,2,Third,Southampton,1


In [23]:
df_titanic[['age', 'fare']].head()

Unnamed: 0,age,fare
0,0.271174,0.014151
1,0.472229,0.139136
2,0.321438,0.015469
3,0.434531,0.103644
4,0.434531,0.015713


To do: create the prep_titanic function that applies the data prep steps above.

#### Iris Data

- Use the function defined in acquire.py to load the iris data.
- Drop the species_id and measurement_id columns.
- Rename the species_name column to just species.
- Encode the species name using a sklearn label encoder. Research the inverse_transform method of the label encoder. How might this be useful?
- Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [24]:
df_iris = acquire.get_iris_data()
df_iris.head()

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


In [25]:
# Remove the two identifier columns in a single call.
# Rebinding the result instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the columns have already been removed.
df_iris = df_iris.drop(columns=['species_id', 'measurement_id'], errors='ignore')
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_name
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [26]:
# Shorten the species_name column label to species.
df_iris = df_iris.rename({'species_name': 'species'}, axis='columns')
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [27]:
# Swap the species names for integer codes. The fitted encoder stays bound to
# `encoder` so the name <-> code mapping remains available afterwards.
encoder = LabelEncoder()
df_iris['species'] = encoder.fit_transform(df_iris['species'])
df_iris['species'].head()

0    0
1    0
2    0
3    0
4    0
Name: species, dtype: int64

In [28]:
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


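To do:
- Demonstrate inverse_transform to recover the original species names from the encoded labels.
- Create the prep_iris function that applies the transformations above.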