# Too Fast. Too Furryious. (Working Title)

**Goal:** Build a classification model to classify unseen faces to 'match' those of characters from the beloved _**Fast and Furious**_ movie franchise.


In [1]:
# # Required installations (run once)
# !brew install wget  # Added by Miles
# !pip install --upgrade --ignore-installed wrapt  # Added by Miles
# !pip install tensorflow==2.0.0-beta0  # Edited by Miles (switch to CPU version)
# !pip install tensorflow_datasets  # Added by Miles

In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
#%aimport helper
#%aimport image_feature_extractor


## Import Libraries

In [3]:
import os 
import numpy as np
from io import BytesIO

# Visualization
import matplotlib.pyplot as plt

# ML
import tensorflow as tf
keras = tf.keras
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix
                            , classification_report)
from sklearn.pipeline import Pipeline
import pickle
import dill
import joblib

# Image handling
from PIL import Image

# Data storage
import boto3    # AWS
import pymongo  # MongoDB

# Custom
import helper  # Helper functions 
from image_feature_extractor import ImageFeatureExtractor

## Data Procurement

To set up our data ingestion process, we centralized all our images to a local folder. They are also stored on an **AWS S3 bucket**.

In [4]:
image_paths = []
target_labels = []

#for dirpath, dirnames, filenames in os.walk('downloads/all_photos'):
for dirpath, dirnames, filenames in os.walk('test_data'):
    for ff in filenames:
        if ff[:1] != '.':
            curr_path = os.path.join('.',dirpath, ff)
            temp_name = dirpath[dirpath.rfind('/') + 1:]
            target_labels.append(temp_name)
            image_paths.append(curr_path)

## Machine Learning Pipeline

We constructed a pipeline that consists of the following:
- We created a class that uses a pre-trained model (MobileNetV2) to extract features.
- A Random Forest classifier built on top of those extracted features. 

In [20]:
# Instantiate our feature extractor
extractor = ImageFeatureExtractor()

# Instantiate our chosen classification model
forest = RandomForestClassifier(n_estimators=10, n_jobs=-1)
#forest = RandomForestClassifier(n_estimators=10)

In [21]:
# Create the ML pipeline
pipe = Pipeline([
    ('extract_deep_features', extractor),
    ('classify', forest)
])

In [22]:
# Create train/test split
X_train, X_test, y_train, y_test = train_test_split(image_paths 
                                                    ,target_labels
                                                    ,random_state=41916
                                                    ,stratify=target_labels)

In [23]:
# Fit on training data!
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('extract_deep_features', ImageFeatureExtractor(height=160,
           model=<tensorflow.python.keras.engine.sequential.Sequential object at 0x14431b048>,
           width=160)), ('classify', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=Non..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [24]:
pipe.predict(X_test)

array(['brian', 'brian', 'brian', 'hobbs', 'brian', 'brian', 'letty',
       'hobbs', 'letty', 'hobbs', 'letty', 'brian', 'letty', 'letty',
       'letty', 'brian', 'hobbs', 'brian', 'hobbs', 'brian', 'brian',
       'hobbs', 'letty', 'brian', 'letty', 'letty', 'letty', 'brian',
       'brian', 'hobbs', 'brian', 'hobbs', 'letty', 'hobbs', 'brian',
       'letty', 'hobbs', 'brian'], dtype='<U5')

In [25]:
pipe.score(X_test, y_test)

0.8157894736842105

### Test URL Inputs

In [26]:
# Random Testing

vinny = ['http://www.sosia.biz/files/immagini/1289210942-DSCF0851.JPG']
laura = ['https://cdn-images-1.medium.com/max/1200/1*jM7PrjvG20306cXwjgN6' + 
         'hA@2x.jpeg']
mia = ['https://media.licdn.com/dms/image/C4E03AQEUu7pgy0zqrw/profile-' + 
       'displayphoto-shrink_200_200/0?e=1563408000&v=beta&t=IQESr0Ho16othge' + 
       'TRgp0nrGlXRkv6c-WiSHf_nCzRlk']
werlindo = ['https://cdn-images-1.medium.com/max/1200/2*T33SKqm3ldv2QkT' + 
            'E3QQ0Dw.jpeg']


In [27]:
pipe.predict(vinny)

array(['hobbs'], dtype='<U5')

In [28]:
pipe.predict(laura)

array(['letty'], dtype='<U5')

In [29]:
pipe.predict(mia)

array(['letty'], dtype='<U5')

In [30]:
pipe.predict(werlindo)

array(['brian'], dtype='<U5')

![](https://images.unsplash.com/photo-1462536546956-beef6399d8cf?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=1350&q=80)

### Encountered Errors:
- can't pickle _thread.RLock objects
- can't pickle SwigPyObject objects

### Info?
- [Keras models not pickle-able?](https://github.com/keras-team/keras/issues/10528)


In [33]:
# Save to file in the current working directory
pkl_model_filenm = "model.pkl"  

### Try 1 - Pickle
https://pypi.org/project/dill/

In [35]:
with open(pkl_model_filenm, 'wb') as file:  
    pickle.dump(pipe, file)

TypeError: can't pickle _thread.RLock objects

In [None]:
# Load from file
with open(pkl_model_filenm, 'rb') as file:  
    pickle_model = pickle.load(file)

##### Calculate the accuracy score and predict target values
score = pickle_model.score(X_test, y_test)  
print("Test score: {0:.2f} %".format(100 * score))  
predict = pickle_model.predict(X_test)  

### Try 2 - Dill
https://pypi.org/project/dill/

In [36]:
with open(pkl_model_filenm, "wb") as dill_file:
    dill.dump(pipe, dill_file)

TypeError: can't pickle SwigPyObject objects

### Try 3 - joblib
https://scikit-learn.org/stable/modules/model_persistence.html

In [40]:
# Export the classifier to a file
joblib.dump(pipe, 'model.joblib')

TypeError: can't pickle SwigPyObject objects

### Try 4 - joblib 'hack'?
https://stackoverflow.com/questions/37984304/how-to-save-a-scikit-learn-pipline-with-keras-regressor-inside-to-disk



In [39]:
# Save the Keras model first:
# pipeline.named_steps['estimator'].model.save('keras_model.h5')
pipe.named_steps['extract_deep_features'].model.save('deep_feat')

# This hack allows us to save the sklearn pipeline:
pipe.named_steps['classify'].model = None

# Finally, save the pipeline:
joblib.dump(pipe, 'model.pkl')

W0621 00:44:41.702590 4626376128 saved_model.py:722] Skipping full serialization of object <tensorflow.python.keras.layers.normalization.BatchNormalization object at 0x13da68710>, because an error occurred while tracing layer functions. Error message: Expected Operation, Variable, or Tensor, got None
W0621 00:44:41.762170 4626376128 saved_model.py:722] Skipping full serialization of object <tensorflow.python.keras.layers.normalization.BatchNormalization object at 0x13d9b6470>, because an error occurred while tracing layer functions. Error message: Expected Operation, Variable, or Tensor, got None
W0621 00:44:41.817981 4626376128 saved_model.py:722] Skipping full serialization of object <tensorflow.python.keras.layers.normalization.BatchNormalization object at 0x13d7e3080>, because an error occurred while tracing layer functions. Error message: Expected Operation, Variable, or Tensor, got None
W0621 00:44:41.872256 4626376128 saved_model.py:722] Skipping full serialization of object <te

W0621 00:44:43.333414 4626376128 saved_model.py:722] Skipping full serialization of object <tensorflow.python.keras.layers.normalization.BatchNormalization object at 0x14328e898>, because an error occurred while tracing layer functions. Error message: Expected Operation, Variable, or Tensor, got None
W0621 00:44:43.386149 4626376128 saved_model.py:722] Skipping full serialization of object <tensorflow.python.keras.layers.normalization.BatchNormalization object at 0x1401d7748>, because an error occurred while tracing layer functions. Error message: Expected Operation, Variable, or Tensor, got None
W0621 00:44:43.438484 4626376128 saved_model.py:722] Skipping full serialization of object <tensorflow.python.keras.layers.normalization.BatchNormalization object at 0x142e22e80>, because an error occurred while tracing layer functions. Error message: Expected Operation, Variable, or Tensor, got None
W0621 00:44:43.495869 4626376128 saved_model.py:722] Skipping full serialization of object <te

TypeError: can't pickle SwigPyObject objects

---

---

# Appendix

## Development on MongoDB storage.

Not currently implemented. Intended to eventually be integrated into the **ImageFeatureExtractor()** class.

In [None]:
# Instantiate Class
extractor = ImageFeatureExtractor()

In [None]:
# Store Features - list of arrays
features = extractor.transform(image_paths)

In [None]:
# Turn into list of lists because easier with MongoDB
features_list = [feature.tolist() for feature in features]

In [None]:
# Zip them so can iterate through them
zipped_imgs = list(zip(image_paths,features_list))

In [None]:
# Create list of dictionaries; so can be ingested by MongoDB
list_of_dicts = [{'url': img[0], 'features':img[1]} for img in zipped_imgs]

### Upload results to MongoDB

In [None]:
# Define path to secret
#secret_path = os.path.join(os.environ['HOME'], '.secret', 'mongo.json')

In [None]:
# keys = helper.get_keys(secret_path)
# mongo_user = keys['user_id']
# mongo_pw = keys['password']

In [None]:
# Instantiate client
client = pymongo.MongoClient("mongodb+srv://" + mongo_user + ":" 
                         + mongo_pw 
                         + "@dsaf-oy1s0.mongodb.net/test?retryWrites=true")


In [None]:
# Get DB, Collection
db = client['furious']
coll = db['images']

In [None]:
# Wipe collection to start fresh
coll.delete_many({})

In [None]:
# Insert Results
coll.insert_many(list_of_dicts)

### Testing getting the features back

In [None]:
features_returned = [np.array(x['features']) for x in coll.find()]

In [None]:
features_returned

---

## Development: Classification Models

In [None]:
target = target_labels

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    target, 
                                                    test_size = 0.25,
                                                    random_state=123)

### Development: Decision Trees

In [None]:
tree_clf = DecisionTreeClassifier(criterion = "gini", max_depth = 5) 
tree_clf.fit(X_train, y_train)

In [None]:
tree_clf.feature_importances_

In [None]:
pred = tree_clf.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

### Random Forest

In [None]:
forest = RandomForestClassifier(n_estimators=20)
forest.fit(X_train, y_train)

In [None]:
forest.score(X_train, y_train)

In [None]:
forest.score(X_test, y_test)

In [None]:
forest.predict(X_train)