# Too Fast. Too Furious. (AKA This Week.)

**Goal:** Build a classification model to classify unseen faces to 'match' those of characters from the beloved _**Fast and Furious**_ movie franchise.


In [None]:
# # Required installations (run once)
# !brew install wget  # Added by Miles
# !pip install --upgrade --ignore-installed wrapt  # Added by Miles
# !pip install tensorflow==2.0.0-beta0  # Edited by Miles (switch to CPU version)
# !pip install tensorflow_datasets  # Added by Miles

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2  # 1 would be where you need to specify the files
#%aimport helper
#%aimport image_feature_extractor


## Import Libraries

In [2]:
import os 
import numpy as np
from io import BytesIO

# Visualization
import matplotlib.pyplot as plt

# ML
import tensorflow as tf
keras = tf.keras
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix
                            , classification_report)
from sklearn.pipeline import Pipeline
import pickle
import dill
import joblib

# Image handling
from PIL import Image

# Data storage
import boto3    # AWS
import pymongo  # MongoDB

# Custom
import helper  # Helper functions 
from image_feature_extractor import ImageFeatureExtractor

## Data Procurement

To set up our data ingestion process, we centralized all our images to a local folder. They are also stored on an **AWS S3 bucket**.

#### Future Improvement
- Revise to pull directly from s3 bucket.

In [3]:
image_paths = []
target_labels = []

for dirpath, dirnames, filenames in os.walk('downloads/all_photos'):
#for dirpath, dirnames, filenames in os.walk('test_data'):
    for ff in filenames:
        if ff[:1] != '.':
            curr_path = os.path.join('.',dirpath, ff)
            temp_name = dirpath[dirpath.rfind('/') + 1:]
            target_labels.append(temp_name)
            image_paths.append(curr_path)

## Machine Learning Pipeline

We constructed a pipeline that consists of the following:
- We created a class that uses a pre-trained model (MobileNetV2) to extract features.
- A Random Forest classifier built on top of those extracted features. 

In [4]:
# Instantiate our feature extractor
extractor = ImageFeatureExtractor()

# Instantiate our chosen classification model
#forest = RandomForestClassifier(n_estimators=10, n_jobs=-1)
forest = RandomForestClassifier(n_estimators=10)

In [5]:
# Create the ML pipeline
pipe = Pipeline([
    ('extract_deep_features', extractor),
    ('classify', forest)
])

In [6]:
# Create train/test split
X_train, X_test, y_train, y_test = train_test_split(image_paths 
                                                    ,target_labels
                                                    ,random_state=57
                                                    ,stratify=target_labels)

In [7]:
# Fit on training data!
pipe.fit(X_train, y_train)

  ' expressed in bytes should be converted ' +
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


Pipeline(memory=None,
     steps=[('extract_deep_features', ImageFeatureExtractor(height=160,
           model=<tensorflow.python.keras.engine.sequential.Sequential object at 0x1a39057da0>,
           width=160)), ('classify', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=No...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [8]:
pipe.predict(X_test)

  ' expressed in bytes should be converted ' +


array(['brixton', 'one_77', 'tej', ..., 'gisele', 'letty', 'brixton'],
      dtype='<U7')

In [9]:
pipe.score(X_test, y_test)

  ' expressed in bytes should be converted ' +


0.3504823151125402

In [15]:
pipe.score(X_train, y_train)

  ' expressed in bytes should be converted ' +
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


0.9957127545551983

### Test URL Inputs

In [16]:
# Random Testing

vinny = ['http://www.sosia.biz/files/immagini/1289210942-DSCF0851.JPG']
laura = ['https://cdn-images-1.medium.com/max/1200/1*jM7PrjvG20306cXwjgN6' + 
         'hA@2x.jpeg']
mia = ['https://media.licdn.com/dms/image/C4E03AQEUu7pgy0zqrw/profile-' + 
       'displayphoto-shrink_200_200/0?e=1563408000&v=beta&t=IQESr0Ho16othge' + 
       'TRgp0nrGlXRkv6c-WiSHf_nCzRlk']
werlindo = ['https://cdn-images-1.medium.com/max/1200/2*T33SKqm3ldv2QkT' + 
            'E3QQ0Dw.jpeg']


In [17]:
pipe.predict(vinny)

array(['brixton'], dtype='<U7')

In [18]:
pipe.predict(laura)

array(['gisele'], dtype='<U7')

In [19]:
pipe.predict(mia)

array(['mia'], dtype='<U7')

In [20]:
pipe.predict(werlindo)

array(['brixton'], dtype='<U7')

### In A Pickle.

So it appears at this moment can't **pickle** a **pipeline** with a **Keras model** embedded in it, so will adjust to just **pickle the Random Forest**.

In [21]:
# Extract the random forest
model = pipe.named_steps['classify']

In [22]:
type(model)

sklearn.ensemble.forest.RandomForestClassifier

In [23]:
pkl_model_filenm = 'model.pkl' 

Create the pickled file:

In [24]:
with open(pkl_model_filenm, 'wb') as file:  
    pickle.dump(model, file)

Testing loading of the pickled file:

In [25]:
# Load from file
with open(pkl_model_filenm, 'rb') as file:  
    pickle_model = pickle.load(file)

In [26]:
type(pickle_model)

sklearn.ensemble.forest.RandomForestClassifier

In [27]:
features_for_web = extractor.transform(werlindo)

# Issues with Pickling the Pipeline

### Encountered Errors:
- can't pickle _thread.RLock objects
- can't pickle SwigPyObject objects

### Info?
- [Keras models not pickle-able?](https://github.com/keras-team/keras/issues/10528)


In [None]:
# Save to file in the current working directory
pkl_model_filenm = "model.pkl"  

### Try 1 - Pickle
https://pypi.org/project/dill/

In [None]:
with open(pkl_model_filenm, 'wb') as file:  
    pickle.dump(pipe, file)

In [None]:
# Load from file
with open(pkl_model_filenm, 'rb') as file:  
    pickle_model = pickle.load(file)

##### Calculate the accuracy score and predict target values
score = pickle_model.score(X_test, y_test)  
print("Test score: {0:.2f} %".format(100 * score))  
predict = pickle_model.predict(X_test)  

### Try 2 - Dill
https://pypi.org/project/dill/

In [None]:
with open(pkl_model_filenm, "wb") as dill_file:
    dill.dump(pipe, dill_file)

### Try 3 - joblib
https://scikit-learn.org/stable/modules/model_persistence.html

In [None]:
# Export the classifier to a file
joblib.dump(pipe, 'model.joblib')

### Try 4 - joblib 'hack'?
https://stackoverflow.com/questions/37984304/how-to-save-a-scikit-learn-pipline-with-keras-regressor-inside-to-disk



In [None]:
# Save the Keras model first:
# pipeline.named_steps['estimator'].model.save('keras_model.h5')
pipe.named_steps['extract_deep_features'].model.save('deep_feat')

# This hack allows us to save the sklearn pipeline:
pipe.named_steps['classify'].model = None

# Finally, save the pipeline:
joblib.dump(pipe, 'model.pkl')

---

---

# Appendix

## Development on MongoDB storage.

Not currently implemented. Intended to eventually be integrated into the **ImageFeatureExtractor()** class.

In [None]:
# Instantiate Class
extractor = ImageFeatureExtractor()

In [None]:
# Store Features - list of arrays
features = extractor.transform(image_paths)

In [None]:
# Turn into list of lists because easier with MongoDB
features_list = [feature.tolist() for feature in features]

In [None]:
# Zip them so can iterate through them
zipped_imgs = list(zip(image_paths,features_list))

In [None]:
# Create list of dictionaries; so can be ingested by MongoDB
list_of_dicts = [{'url': img[0], 'features':img[1]} for img in zipped_imgs]

### Upload results to MongoDB

In [None]:
# Define path to secret
#secret_path = os.path.join(os.environ['HOME'], '.secret', 'mongo.json')

In [None]:
# keys = helper.get_keys(secret_path)
# mongo_user = keys['user_id']
# mongo_pw = keys['password']

In [None]:
# Instantiate client
client = pymongo.MongoClient("mongodb+srv://" + mongo_user + ":" 
                         + mongo_pw 
                         + "@dsaf-oy1s0.mongodb.net/test?retryWrites=true")


In [None]:
# Get DB, Collection
db = client['furious']
coll = db['images']

In [None]:
# Wipe collection to start fresh
coll.delete_many({})

In [None]:
# Insert Results
coll.insert_many(list_of_dicts)

### Testing getting the features back

In [None]:
features_returned = [np.array(x['features']) for x in coll.find()]

In [None]:
features_returned

---

## Development: Classification Models

In [None]:
target = target_labels

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    target, 
                                                    test_size = 0.25,
                                                    random_state=123)

### Development: Decision Trees

In [None]:
tree_clf = DecisionTreeClassifier(criterion = "gini", max_depth = 5) 
tree_clf.fit(X_train, y_train)

In [None]:
tree_clf.feature_importances_

In [None]:
pred = tree_clf.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

### Random Forest

In [None]:
forest = RandomForestClassifier(n_estimators=20)
forest.fit(X_train, y_train)

In [None]:
forest.score(X_train, y_train)

In [None]:
forest.score(X_test, y_test)

In [None]:
forest.predict(X_train)