# Pickles and Pipelines

This notebook covers the basics of writing and reading pickles, then walks through an example of using the `Pipeline` class from `scikit-learn`.

## Pickles

In [None]:
import pickle

import pandas as pd

In [None]:
# Read the raw diamonds CSV into a DataFrame (path is relative to the working directory)
diamonds_csv_path = "data/diamonds.csv"
my_diamonds = pd.read_csv(diamonds_csv_path)

In [None]:
# Serialize the DataFrame to disk; pickle streams are bytes, so the file is opened in 'wb' mode
with open('data/pickled_diamonds.pkl', 'wb') as out_handle:
    pickle.dump(my_diamonds, out_handle)

In [None]:
# Shell escape: long-format listing of data/ (including hidden files),
# e.g. to check that pickled_diamonds.pkl now exists
!ls -la data/

In [None]:
# Try opening data/pickled_diamonds.pkl in vim to see what a pickle looks like on disk. (You can also try this with an .ipynb file.)

In [None]:
# Read the pickle back in binary mode ('rb') and rebuild the saved object.
# This file was written earlier in this notebook, so it is trusted; calling
# pickle.load on a file from an unknown source can execute arbitrary code.
with open('data/pickled_diamonds.pkl', 'rb') as in_handle:
    loaded_in_data = pickle.load(in_handle)

In [None]:
# Should you ever open a pickle from an untrusted source?

In [None]:
# No, never! Unpickling can execute arbitrary code, so only load pickles from sources you trust.

## Pipeline 

The first thing we need to do is import our libraries.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

import warnings
# Suppress every warning for the rest of the session (presumably to keep cell output tidy).
# NOTE(review): this also hides useful library warnings (deprecations, data or
# convergence issues); consider narrowing the filter by category or module.
warnings.filterwarnings('ignore')

In [None]:
# Use the DataFrame restored from the pickle as the working dataset
# (this binds a second name to the same object; no copy is made)
df = loaded_in_data

# Preview the first five rows
df.head(n=5)

In [None]:
# Summary statistics for the dataset's columns
summary_stats = df.describe()
summary_stats

In [None]:
# Target: the column we want to predict
y = df['price']

# Features: everything except the target, the x/y/z columns and the
# cut/color/clarity columns
excluded_columns = ['price', 'x', 'y', 'z', 'cut', 'color', 'clarity']
X = df.drop(columns=excluded_columns)

# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Learn the scaling statistics from the training split only
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted scaling to both splits
scaled_data_train = scaler.transform(X_train)
scaled_data_test = scaler.transform(X_test)

# Wrap the scaled training data in a DataFrame with the original column names and preview it
scaled_df_train = pd.DataFrame(scaled_data_train, columns=X_train.columns)
scaled_df_train.head()

In [None]:
# Instantiate a k-nearest-neighbours *regressor*: `price` is a continuous quantity,
# and a classifier would treat every distinct price as its own class label.
# The variable keeps the name `clf` because a later cell refers to it.
clf = KNeighborsRegressor()

# Fit the model on the scaled training features
clf.fit(scaled_data_train, y_train)

In [None]:
# NOTE(review): this looks like the repr that fitting a KNeighborsClassifier prints,
# pasted into a code cell. Running it only builds and displays a new, unfitted estimator.
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
# Evaluate on the held-out test set. `score` is the estimator's default metric:
# mean accuracy for a classifier, R^2 for a regressor.
test_score = clf.score(scaled_data_test, y_test)
test_score

In [None]:
# Build a pipeline with StandardScaler and KNeighborsRegressor.
# A regressor is used because `price` is a continuous quantity, not a class label.
# Bundling the scaler into the pipeline means it is fitted only on the data
# passed to the pipeline's fit().
scaled_pipeline_1 = Pipeline([('ss', StandardScaler()),
                              ('knn', KNeighborsRegressor())])

In [None]:
# Fit the pipeline on the training data, then evaluate it on the test data.
# fit() returns the fitted pipeline, so score() can be chained onto it;
# score() reports the final step's default metric.
scaled_pipeline_1.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# Build a pipeline with StandardScaler and RandomForestClassifier
scaled_pipeline_2 = Pipeline([('ss', StandardScaler()), 
                              ('RF', RandomForestClassifier(random_state=123))])

In [None]:
# Define the grid
grid = [{'RF__max_depth': [4, 5, 6], 
         'RF__min_samples_split': [2, 5, 10], 
         'RF__min_samples_leaf': [1, 3, 5]}]

In [None]:
# Define a grid search
gridsearch = GridSearchCV(estimator=scaled_pipeline_2, 
                          param_grid=grid, 
                          scoring='accuracy', 
                          cv=5)

In [None]:
# Run the search on the training data, then evaluate on the test set.
# fit() returns the fitted search object, so score() can be chained onto it;
# score() uses the metric configured on the grid search.
gridsearch.fit(X_train, y_train).score(X_test, y_test)