In [1]:
import pandas as pd
import numpy as np
import sqlite3
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.externals import joblib

pd.set_option('display.max_columns', None)



# 1.1 Pipelines with the standard library

## 1.1.1 Review: ` reduce()`

In [2]:
from functools import reduce

reduce(lambda a, b: a + b, [x for x in range(1, 6)]) #lambda is a name function

15

We can use `reduce()` to build a custom `pipe()` function:

In [3]:
def pipe(obj, *fns): #* takes more than one argument
    return reduce(lambda x , y: y(x), [obj] + list(fns))

In [4]:
def get_consonants(str_):
    vowels = list('aeiou')
    return ''.join([char for char in str_ if char not in vowels])

test = 'Python'

pipe(test,
    lambda x: x.lower(),
    get_consonants
    )

'pythn'

# Pipelines with Pandas: `.pipe()`

In [5]:
import seaborn as sns
df = sns.load_dataset('titanic')
titanic = df[['pclass', 'age', 'sex', 'survived']]

In [6]:
connection = sqlite3.connect('../../../data-sets/titanic.db')

In [7]:
df = pd.read_sql_query(
    """
    SELECT O.pclass, O.age, S.sex, O.survived
    FROM Observation AS O
    JOIN Sex AS S ON S.sex_id = O.sex_id
    """,
    connection
)

df.head()

Unnamed: 0,pclass,age,sex,survived
0,3,22.0,male,0
1,1,38.0,female,1
2,3,26.0,female,1
3,1,35.0,female,1
4,3,35.0,male,0


In [8]:
def drop_nuls(df):
    no_nulls = df.dropna(how = 'any').reset_index(drop = True)
    n_dropped = df.shape[0] - no_nulls.shape[0]
    print(f"{n_dropped} of {df.shape[0]} ({round(n_dropped / df.shape[0] * 100, 2)}%) rows dropped")
    return no_nulls

In [9]:
drop_nuls(titanic).head()

177 of 891 (19.87%) rows dropped


Unnamed: 0,pclass,age,sex,survived
0,3,22.0,male,0
1,1,38.0,female,1
2,3,26.0,female,1
3,1,35.0,female,1
4,3,35.0,male,0


In [10]:
def rescale_numbers(df, scaler):
    for col in df:
        if df[col].dtype in ['int64', 'float64']:
            numbers = df[col].astype(float).values.reshape(-1,1)
            df[col] = scaler().fit_transform(numbers)
    return df

In [11]:
rescale_numbers(titanic, MinMaxScaler).head()

Unnamed: 0,pclass,age,sex,survived
0,1.0,0.271174,male,0.0
1,0.0,0.472229,female,1.0
2,1.0,0.321438,female,1.0
3,0.0,0.434531,female,1.0
4,1.0,0.434531,male,0.0


In [12]:
def onehot_encode(df):
    return pd.get_dummies(df, columns = [col for col in df if df[col].dtype == 'object'])

In [13]:
def preprocess(df):
    return (df
           .pipe(drop_nuls)
           .pipe(rescale_numbers, MinMaxScaler)
           .pipe(onehot_encode)
           )

preprocess(titanic).head()

177 of 891 (19.87%) rows dropped


Unnamed: 0,pclass,age,survived,sex_female,sex_male
0,1.0,0.271174,0.0,0,1
1,0.0,0.472229,1.0,1,0
2,1.0,0.321438,1.0,1,0
3,0.0,0.434531,1.0,1,0
4,1.0,0.434531,0.0,0,1


# 1.3 Cross-Validation

In [None]:
# 

Provides an estimate of how accurately a model will perform when given new data

## 1.3.1 Review: `train_test_split`

In [14]:
def train_test(df, target):
    return train_test_split(
    df[[col for col in df if col != target]],
    df[target],
    test_size = .2,
    random_state = 42
    )

In [15]:
def evaluate_model(algorithm, train_test):
    train_X, test_X, train_y, test_y = train_test
    model = algorithm().fit(train_X, train_y)
    score = model.score(test_X, test_y)
    print(f"Accuracy: {round(score, 2)}")
    return model , score

In [16]:
evaluate_model(
    GaussianNB,
    train_test(
        preprocess(titanic),
        target = 'survived'
    )
)

177 of 891 (19.87%) rows dropped
Accuracy: 0.73


(GaussianNB(priors=None, var_smoothing=1e-09), 0.7342657342657343)

## 1.3.2 Cross-Validation with `KFold`

In [17]:
kf = KFold(
    n_splits = 5,
    random_state = 42  
)

for train_index, test_index in kf.split(titanic.head()):
    print(train_index, test_index)

[1 2 3 4] [0]
[0 2 3 4] [1]
[0 1 3 4] [2]
[0 1 2 4] [3]
[0 1 2 3] [4]


In [18]:
for train_index, _ in kf.split(titanic.head()):
    print(f"{titanic.head().iloc[train_index]}\n")

   pclass       age     sex  survived
1     0.0  0.472229  female       1.0
2     1.0  0.321438  female       1.0
3     0.0  0.434531  female       1.0
4     1.0  0.434531    male       0.0

   pclass       age     sex  survived
0     1.0  0.271174    male       0.0
2     1.0  0.321438  female       1.0
3     0.0  0.434531  female       1.0
4     1.0  0.434531    male       0.0

   pclass       age     sex  survived
0     1.0  0.271174    male       0.0
1     0.0  0.472229  female       1.0
3     0.0  0.434531  female       1.0
4     1.0  0.434531    male       0.0

   pclass       age     sex  survived
0     1.0  0.271174    male       0.0
1     0.0  0.472229  female       1.0
2     1.0  0.321438  female       1.0
4     1.0  0.434531    male       0.0

   pclass       age     sex  survived
0     1.0  0.271174    male       0.0
1     0.0  0.472229  female       1.0
2     1.0  0.321438  female       1.0
3     0.0  0.434531  female       1.0



In [25]:
def k_fold(df, target):
    scores = []
    features = df[[col for col in df if col != target]]
    target = df[target]
    kf = KFold(n_splits = 5, random_state = 42)
    for train_i, test_i in kf.split(df):
        scores.append(
            evaluate_model(
                GaussianNB,
                (features.iloc[train_i],
                 features.iloc[test_i],
                 target.iloc[train_i],
                 target.iloc[test_i]
                )[1]
        )
    return sum(scores) / len(scores)

In [26]:
k_fold(preprocess(titanic), target = 'survived')

177 of 891 (19.87%) rows dropped
Accuracy: 0.81
Accuracy: 0.8
Accuracy: 0.77
Accuracy: 0.73
Accuracy: 0.8


0.7801339505564858

# Saved trained models with `joblib`

In [27]:
final_model = evaluate_model(
    GaussianNB,
    train_test(
        preprocess(titanic),
        target = 'survived'
    )
)[0]

final_model

177 of 891 (19.87%) rows dropped
Accuracy: 0.73


GaussianNB(priors=None, var_smoothing=1e-09)

Save trainde model:

In [28]:
filename = 'final_model.sav'
joblib.dump(final_model, filename)

['final_model.sav']

Load saved model:

In [29]:
joblib.load(filename)

GaussianNB(priors=None, var_smoothing=1e-09)

In [35]:
model = joblib.load(filename)
# model.predict() #use saved model and call it into necessary 