# Data preparation
The end product of this exercise should be the specified functions in a Python script named `prepare.py`. Work through them in your `classification_exercises.ipynb` first, then transfer them to the `prepare.py` file.

In [23]:
import pandas as pd
from acquire import get_iris_data, get_titanic_data

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

## Exercise 1 
Iris Data

- Use the function defined in `acquire.py` to load the iris data.
- Drop the species_id and measurement_id columns.
- Rename the species_name column to just species.
- Encode the species name using a sklearn label encoder. Research the inverse_transform method of the label encoder. How might this be useful?
- Create a function named `prep_iris` that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [3]:
# Pull the raw iris table through the acquire.py helper and preview the first rows.
iris = get_iris_data()
iris.head(5)

Unnamed: 0,species_id,measurement_id,sepal_length,sepal_width,petal_length,petal_width,species_name
0,1,1,5.1,3.5,1.4,0.2,setosa
1,1,2,4.9,3.0,1.4,0.2,setosa
2,1,3,4.7,3.2,1.3,0.2,setosa
3,1,4,4.6,3.1,1.5,0.2,setosa
4,1,5,5.0,3.6,1.4,0.2,setosa


In [3]:
# Discard the two identifier columns (species_id and measurement_id).
iris = iris.drop(['species_id', 'measurement_id'], axis='columns')
iris.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_name
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
# Shorten the species_name column label to just species.
iris = iris.rename({'species_name': 'species'}, axis='columns')
iris.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
# Count the rows per species to see which labels the encoder will have to handle.
iris['species'].value_counts()

virginica     50
versicolor    50
setosa        50
Name: species, dtype: int64

In [11]:
#Encode the species name using a sklearn label encoder. 
encoder = LabelEncoder()
encoder.fit(iris.species)
iris["species_encoded"]= encoder.transform(iris.species)
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_encoded
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


In [12]:
# Research the inverse_transform method of the label encoder.
# How might this be useful?
#   inverse_transform maps the integer codes back to the original string labels,
#   so encoded values (for example model predictions) can be reported as species names.
encoder.inverse_transform(iris['species_encoded'])

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolo

In [13]:
#Create a function named prep_iris that accepts the untransformed iris data, 
#and returns the data with the transformations above applied.
def prep_iris(iris):
    """
    Return a prepared copy of the untransformed iris data.

    The species_id and measurement_id columns are dropped, species_name is
    renamed to species, and a species_encoded column holding the sklearn
    LabelEncoder codes for species is appended. The frame passed in is left
    as it was, because drop/rename produce a new DataFrame.
    """
    prepared = (
        iris
        .drop(columns=['species_id', 'measurement_id'])
        .rename(columns={'species_name': 'species'})
    )
    # fit_transform learns the labels and encodes them in a single step.
    prepared['species_encoded'] = LabelEncoder().fit_transform(prepared['species'])
    return prepared


In [5]:
# Validate the function: import the prepare.py version and run it on freshly
# acquired data. The `iris` frame built in the cells above has already had
# species_id / measurement_id dropped, so passing it back in would raise a
# KeyError when the notebook is run top to bottom.
from prepare import prep_iris

iris = prep_iris(get_iris_data())
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_encoded
0,5.1,3.5,1.4,0.2,setosa,0
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,0
4,5.0,3.6,1.4,0.2,setosa,0


## Exercise 2
Titanic Data

- Use the function you defined in acquire.py to load the titanic data set.
- Handle the missing values in the embark_town and embarked columns.
- Remove the deck column.
- Use a label encoder to transform the embarked column.
- Scale the age and fare columns using a min max scaler. Why might this be beneficial? When might you not want to do this?
- Fill the missing values in age. The way you fill these values is up to you. Consider the tradeoffs of different methods.
- Create a function named `prep_titanic` that accepts the untransformed titanic data, and returns the data with the transformations above applied.

In [61]:
# Load the raw titanic data set through the acquire.py helper and preview it.
titanic = get_titanic_data()
titanic.head(5)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [50]:
# Handle the missing values in the embark_town and embarked columns.
# Step 1: count how many values are missing in each of the two columns.
titanic[['embark_town', 'embarked']].isnull().sum()

embark_town    2
embarked       2
dtype: int64

In [49]:
# Look at the town frequencies to pick a fill value for the missing rows.
titanic['embark_town'].value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [14]:
# Southampton is the most frequent town, so use it for the missing rows.
titanic['embark_town'] = titanic['embark_town'].fillna('Southampton')
titanic['embark_town'].value_counts()

Southampton    646
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [15]:
# Same check for the one-letter embarked code.
titanic['embarked'].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [17]:
# 'S' is the most frequent code, so use it for the missing rows.
titanic['embarked'] = titanic['embarked'].fillna('S')
titanic['embarked'].value_counts()

S    646
C    168
Q     77
Name: embarked, dtype: int64

In [19]:
# Remove the deck column.
titanic = titanic.drop('deck', axis='columns')
titanic.head(5)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1


In [21]:
# Use a label encoder to transform the embarked column:
# fit it on the embarked codes and keep the integer codes in a new column.
encoder = LabelEncoder()
titanic['embarked_encoded'] = encoder.fit_transform(titanic['embarked'])
titanic.head(5)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encoded
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,2
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,2
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,2
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,2


In [56]:
# NOTE(review): `test` is first assigned by the train_test_split cell further
# down, so unless it is defined somewhere not shown here this cell raises a
# NameError on a fresh top-to-bottom run — move it below the split cell.
test.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
172,172,1,3,female,1.0,1,1,11.1333,S,Third,,Southampton,0
524,524,0,3,male,,0,0,7.2292,C,Third,,Cherbourg,1
452,452,0,1,male,30.0,0,0,27.75,C,First,C,Cherbourg,1
170,170,0,1,male,61.0,0,0,33.5,S,First,B,Southampton,1
620,620,0,3,male,27.0,1,0,14.4542,C,Third,,Cherbourg,0


In [51]:
# Split the dataset: 80% of the rows go to train, the rest to test.
# The fixed random_state keeps the split reproducible.
train, test = train_test_split(
    titanic,
    train_size=0.8,
    random_state=123,
)

In [57]:
# Scale the age and fare columns using a min max scaler.
# The scaler is fit on the training split only and then reused for the test
# split, so no information from test leaks into the scaling parameters.
# NOTE(review): age still contains missing values at this point, and they come
# out as NaN in age_scaled (see row 524 in the output below).
scaler = MinMaxScaler()
scaler.fit(train[['age', 'fare']])
train_scaled = pd.DataFrame(
    scaler.transform(train[['age', 'fare']]),
    index=train.index,
    columns=['age_scaled', 'fare_scaled'],
)
test_scaled = pd.DataFrame(
    scaler.transform(test[['age', 'fare']]),
    index=test.index,
    columns=['age_scaled', 'fare_scaled'],
)
test_scaled.head(5)

Unnamed: 0,age_scaled,fare_scaled
172,0.007288,0.021731
524,,0.01411
452,0.371701,0.054164
170,0.761247,0.065388
620,0.334004,0.028213


In [58]:
# Attach the scaled columns to each split; the frames share an index, so the
# column-wise concat lines the rows up.
train = pd.concat((train, train_scaled), axis='columns')
test = pd.concat((test, test_scaled), axis='columns')
test.head(5)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,age_scaled,fare_scaled
172,172,1,3,female,1.0,1,1,11.1333,S,Third,,Southampton,0,0.007288,0.021731
524,524,0,3,male,,0,0,7.2292,C,Third,,Cherbourg,1,,0.01411
452,452,0,1,male,30.0,0,0,27.75,C,First,C,Cherbourg,1,0.371701,0.054164
170,170,0,1,male,61.0,0,0,33.5,S,First,B,Southampton,1,0.761247,0.065388
620,620,0,3,male,27.0,1,0,14.4542,C,Third,,Cherbourg,0,0.334004,0.028213


In [22]:
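#Why might this be beneficial? 
#  Min-max scaling puts age and fare on the same 0-1 range, so the feature with
#  the larger numeric range (fare) does not dominate distance- or gradient-based models.
#When might you not want to do this?
#  When the model does not depend on feature scale (e.g. tree-based models), when
#  the values must stay in their original units for interpretation, or when
#  outliers would squash the remaining values into a narrow band.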

In [59]:
# Fill the missing values in age.
# The way you fill these values is up to you.
# Consider the tradeoffs of different methods.
# Here the mean age is used; it is learned from the training split only and
# then applied to both splits.
imputer = SimpleImputer(strategy='mean')
imputer.fit(train[['age']])
train['age'] = imputer.transform(train[['age']])
test['age'] = imputer.transform(test[['age']])
test['age']

172     1.000000
524    29.832908
452    30.000000
170    61.000000
620    27.000000
         ...    
388    29.832908
338    45.000000
827     1.000000
773    29.832908
221    27.000000
Name: age, Length: 179, dtype: float64

In [60]:
#Create a function named prep_titanic that accepts the untransformed titanic data,
#and returns the data with the transformations above applied.
def prep_titanic(titanic):
    """
    Clean, encode, split, impute and scale the untransformed titanic data.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Raw titanic data containing at least the embark_town, embarked, deck,
        age and fare columns. The frame is not modified.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        An 80/20 split (random_state=123) in which deck has been removed,
        missing embark_town / embarked / age values have been filled, and the
        embarked_encoded, age_scaled and fare_scaled columns have been added.
    """
    # Handle the missing values in the embark_town and embarked columns and
    # remove the deck column. assign() builds a new frame, so the caller's
    # DataFrame is left untouched (assigning onto the argument would change it).
    titanic = titanic.assign(
        embark_town=titanic.embark_town.fillna('Southampton'),
        embarked=titanic.embarked.fillna('S'),
    ).drop(columns='deck')

    # Use a label encoder to transform the embarked column.
    encoder = LabelEncoder()
    titanic['embarked_encoded'] = encoder.fit_transform(titanic.embarked)

    # Split the dataset before fitting the imputer and scaler so both learn
    # their parameters from the training rows only.
    train, test = train_test_split(titanic, train_size=.8, random_state=123)

    # Fill the missing values in age BEFORE scaling; scaling first would carry
    # every missing age through as NaN in age_scaled. ravel() flattens the
    # (n, 1) array the imputer returns into a single column of values.
    imputer = SimpleImputer(strategy='mean')
    train = train.assign(age=imputer.fit_transform(train[['age']]).ravel())
    test = test.assign(age=imputer.transform(test[['age']]).ravel())

    # Scale the age and fare columns using a min max scaler.
    scaler = MinMaxScaler()
    train_scaled = pd.DataFrame(scaler.fit_transform(train[['age', 'fare']]),
                                columns=['age_scaled', 'fare_scaled'],
                                index=train.index)
    test_scaled = pd.DataFrame(scaler.transform(test[['age', 'fare']]),
                               columns=['age_scaled', 'fare_scaled'],
                               index=test.index)
    train = pd.concat([train, train_scaled], axis=1)
    test = pd.concat([test, test_scaled], axis=1)
    return train, test

In [62]:
# Run the full preparation on the titanic data and preview the training split.
train, test = prep_titanic(titanic)
train.head(5)

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encoded,age_scaled,fare_scaled
329,329,1,1,female,16.0,0,1,57.9792,C,First,Cherbourg,0,0,0.195778,0.113168
749,749,0,3,male,31.0,0,0,7.75,Q,Third,Queenstown,1,1,0.384267,0.015127
203,203,0,3,male,45.5,0,0,7.225,C,Third,Cherbourg,1,0,0.566474,0.014102
421,421,0,3,male,21.0,0,0,7.7333,Q,Third,Queenstown,1,1,0.258608,0.015094
97,97,1,1,male,23.0,0,1,63.3583,C,First,Cherbourg,0,0,0.28374,0.123667
