# Pickles and Pipelines

This notebook goes over basics of how to write and read pickles then goes over an example of how to use the `Pipeline()` function from `scikitlearn`.

## Pickles

In [1]:
import pickle

import pandas as pd

In [2]:
my_diamonds = pd.read_csv("data/diamonds.csv")

In [3]:
with open('data/pickled_diamonds.pkl', 'wb') as pickle_file:
    pickle.dump(my_diamonds, pickle_file)

In [4]:
!ls -la data/

total 11896
drwxr-xr-x  4 dbaker  staff      128  6 Apr 13:39 [34m.[m[m
drwxr-xr-x  8 dbaker  staff      256  8 Apr 09:53 [34m..[m[m
-rw-r--r--  1 dbaker  staff  2448483  6 Apr 13:30 diamonds.csv
-rw-r--r--  1 dbaker  staff  3346044  8 Apr 09:54 pickled_diamonds.pkl


In [5]:
# Check it out with vim! (also ipynb...?)

In [6]:
with open('data/pickled_diamonds.pkl','rb') as pickle_file:
    loaded_in_data = pickle.load(pickle_file)

In [7]:
# Should you ever open a pickle from an untrusted source???

In [8]:
# Helllll naawwwwwww

## Pipeline 

First thing we need to do is import in our libraries!! 

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [10]:
# Import the diamonds data 

df = loaded_in_data

# Print the first five rows
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [None]:
df.describe()

In [13]:
# Split the predictor and target variables
y = df['price']
X = df.drop(['price','x','y','z','cut','color','clarity'], axis=1)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [14]:
# Instantiate StandardScaler
scaler = StandardScaler()

# Transform the training and test sets
scaled_data_train = scaler.fit_transform(X_train)
scaled_data_test = scaler.transform(X_test)

# Convert into a DataFrame
scaled_df_train = pd.DataFrame(scaled_data_train, columns=X_train.columns)
scaled_df_train.head()

Unnamed: 0,carat,depth,table
0,-1.156665,2.207837,0.242414
1,0.086917,0.038517,-0.654923
2,0.529547,-0.451329,0.242414
3,0.466314,-0.731242,-0.654923
4,-0.397869,0.038517,-0.206254


In [15]:
# Instantiate KNeighborsClassifier
clf = KNeighborsClassifier()

# Fit the classifier
clf.fit(scaled_data_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [16]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [17]:
# Print the accuracy on test set
clf.score(scaled_data_test, y_test)

0.005116796440489433

In [18]:
# Build a pipeline with StandardScaler and KNeighborsClassifier
scaled_pipeline_1 = Pipeline([('ss', StandardScaler()), 
                              ('knn', KNeighborsClassifier())])

In [19]:
# Fit the training data to pipeline
scaled_pipeline_1.fit(X_train, y_train)

# Print the accuracy on test set
scaled_pipeline_1.score(X_test, y_test)

0.005116796440489433

In [20]:
# Build a pipeline with StandardScaler and RandomForestClassifier
scaled_pipeline_2 = Pipeline([('ss', StandardScaler()), 
                              ('RF', RandomForestClassifier(random_state=123))])

In [21]:
# Define the grid
grid = [{'RF__max_depth': [4, 5, 6], 
         'RF__min_samples_split': [2, 5, 10], 
         'RF__min_samples_leaf': [1, 3, 5]}]

In [22]:
# Define a grid search
gridsearch = GridSearchCV(estimator=scaled_pipeline_2, 
                          param_grid=grid, 
                          scoring='accuracy', 
                          cv=5)

In [23]:
# Fit the training data
gridsearch.fit(X_train, y_train)

# Print the accuracy on test set
gridsearch.score(X_test, y_test)

KeyboardInterrupt: 