In [None]:
# # Required installations (run once)
# !brew install wget  # Added by Miles
# !pip install --upgrade --ignore-installed wrapt  # Added by Miles
# !pip install tensorflow==2.0.0-beta0  # Edited by Miles (switch to CPU version)
# !pip install tensorflow_datasets  # Added by Miles

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 1
%aimport helper

## Import Libraries

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
keras = tf.keras

import tensorflow_datasets as tfds
tfds.disable_progress_bar()

import pymongo        # MongoDB

from PIL import Image

import helper

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from io import BytesIO
import boto3

from image_feature_extractor import ImageFeatureExtractor


### Mongo Keys

In [3]:
# Define path to secret
secret_path = os.path.join(os.environ['HOME'], '.secret', 'mongo.json')

In [4]:
# keys = helper.get_keys(secret_path)
# mongo_user = keys['user_id']
# mongo_pw = keys['password']

## Data download

In [5]:
# from os import walk

image_paths = []
target_labels = []
#for dirpath, dirnames, filenames in os.walk('downloads/all_photos'):
for dirpath, dirnames, filenames in os.walk('test_data'):
    for ff in filenames:
        if ff[:1] != '.':
            curr_path = os.path.join('.',dirpath, ff)
            temp_name = dirpath[dirpath.rfind('/') + 1:]
#             print(curr_path)  
            target_labels.append(temp_name)
            image_paths.append(curr_path)
        
target_labels[:20]

['brian',
 'brian',
 'brian',
 'brian',
 'brian',
 'brian',
 'brian',
 'brian',
 'brian',
 'brian',
 'hobbs',
 'hobbs',
 'hobbs',
 'hobbs',
 'hobbs',
 'hobbs',
 'hobbs',
 'hobbs',
 'hobbs']

In [6]:
image_paths[:20]

['./test_data/brian/6.paul-walker-medium.jpg',
 './test_data/brian/11.2.37830931.jpg',
 './test_data/brian/2.paul-walker-21044993-1-402.jpg',
 './test_data/brian/9.66892393.jpg',
 './test_data/brian/1.maxresdefault.jpg',
 './test_data/brian/12.gettyimages-168243951.jpg',
 './test_data/brian/5._102791014_gettyimages-164559639.jpg',
 './test_data/brian/8.paul-walker.jpg',
 './test_data/brian/3.220px-PaulWalkerEdit-1.jpg',
 './test_data/brian/4.MV5BMjIwODc0OTk2Nl5BMl5BanBnXkFtZTcwOTQ5MDA0Mg@@._V1_UX214_CR0,0,214,317_AL_.jpg',
 './test_data/hobbs/3.dwayne-johnson-11818916-1-402.jpg',
 './test_data/hobbs/5.416x416.jpg',
 './test_data/hobbs/2.MV5BMTkyNDQ3NzAxM15BMl5BanBnXkFtZTgwODIwMTQ0NTE@._V1_.jpg',
 './test_data/hobbs/4.5b462c57f4af9c1a008b45eb-750-563.jpg',
 './test_data/hobbs/8.the-rock-1.jpg',
 './test_data/hobbs/7.190127_3871676_Dwayne__The_Rock__Johnson_Wants_To_Be_In__Cr_800x450_1432791619872.jpg',
 './test_data/hobbs/1.220px-Dwayne_Johnson_2%2C_2013.jpg',
 './test_data/hobbs/9.MV5B

IMG_SHAPE = (IMG_SIZE, IMG_SIZE, 3)

# Create the base model from the pre-trained model MobileNet V2
base_model = tf.keras.applications.MobileNetV2(input_shape=IMG_SHAPE,
                                               include_top=False,
                                               weights='imagenet')

global_average_layer = tf.keras.layers.GlobalAveragePooling2D()

model = tf.keras.Sequential([
  base_model,
  global_average_layer,
])

def extract_features(image):
    """Return a vector of 1280 deep features for image."""
    image_resized = prepare_image(image)
    image_np = image_resized.numpy()
    images_np = np.expand_dims(image_np, axis=0)
    image_np.shape, images_np.shape
    deep_features = model.predict(images_np)
    return deep_features[0]

extract_features(image)

### Get Features

In [None]:
#%pdb

In [7]:
forest = RandomForestClassifier(n_estimators=10, n_jobs=-1)
extractor = ImageFeatureExtractor()

In [8]:
pipe = Pipeline([
    ('extract_deep_features', extractor),
    ('classify', forest)
])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(image_paths, target_labels)

In [10]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('extract_deep_features', ImageFeatureExtractor(height=160,
           model=<tensorflow.python.keras.engine.sequential.Sequential object at 0x1a3aaf3d30>,
           width=160)), ('classify', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=No..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [11]:
pipe.predict(X_test)

array(['brian', 'brian', 'brian', 'hobbs', 'brian'], dtype='<U5')

In [12]:
pipe.score(X_test, y_test)

0.6

In [13]:
#import webbrowser

# !cd downloads/public_photos
# !wget http://www.sosia.biz/files/immagini/1289210942-DSCF0851.JPG
# !ls

vinny = ['http://www.sosia.biz/files/immagini/1289210942-DSCF0851.JPG']

In [14]:
vinny

['http://www.sosia.biz/files/immagini/1289210942-DSCF0851.JPG']

In [15]:
pipe.predict(vinny)

array(['brian'], dtype='<U5')

In [None]:
# Instantiate Class
extractor = ImageFeatureExtractor()

In [None]:
# Store Features - list of arrays
features = extractor.transform(image_paths)

In [None]:
# Turn into list of lists because easier with MongoDB
features_list = [feature.tolist() for feature in features]

In [None]:
# Zip them so can iterate through them
zipped_imgs = list(zip(image_paths,features_list))

In [None]:
# Create list of dictionaries; so can be ingested by MongoDB
list_of_dicts = [{'url': img[0], 'features':img[1]} for img in zipped_imgs]

### Upload results to MongoDB

In [None]:
# Instantiate client
client = pymongo.MongoClient("mongodb+srv://" + mongo_user + ":" 
                         + mongo_pw 
                         + "@dsaf-oy1s0.mongodb.net/test?retryWrites=true")


In [None]:
# Get DB, Collection
db = client['furious']
coll = db['images']

In [None]:
# Wipe collection to start fresh
coll.delete_many({})

In [None]:
# Insert Results
coll.insert_many(list_of_dicts)

In [None]:
features_returned = [np.array(x['features']) for x in coll.find()]

In [None]:
features_returned

# LSM Random Forests

In [None]:
target = target_labels

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    target, 
                                                    test_size = 0.25,
                                                    random_state=123)

### Regular Tree

In [None]:
tree_clf = DecisionTreeClassifier(criterion = "gini", max_depth = 5) 
tree_clf.fit(X_train, y_train)

In [None]:
tree_clf.feature_importances_

In [None]:
pred = tree_clf.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

### Random Forest

In [None]:
forest = RandomForestClassifier(n_estimators=20)
forest.fit(X_train, y_train)

In [None]:
forest.score(X_train, y_train)

In [None]:
forest.score(X_test, y_test)

In [None]:
forest.predict(X_train)

#### This is likely not right.... 

In [None]:
len(features_returned)

![](https://images.unsplash.com/photo-1518137569197-67c41a95d8de?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=2560&q=80)

# Graveyard

----
### _We can do train / test split before we push through classification models?_

In [None]:
train_images = [(img.numpy(), label.numpy())
                for (img, label) in raw_train.take(10000)]

In [None]:
test_images = [(img.numpy(), label.numpy())
               for (img, label) in raw_test.take(1000)]

In [None]:
plt.imshow(train_images[0][0])
plt.title(get_label_name(train_images[0][1]))

In [None]:
train_X_img, train_y = zip(*train_images)
test_X_img, test_y = zip(*test_images)

# Extract features from some images

In [None]:
TRAIN_LIMIT = 1000
TEST_LIMIT = 200

In [None]:
train_X_small = [extract_features(img)
                 for img in train_X_img[:TRAIN_LIMIT]]
test_X_small = [extract_features(img)
                for img in test_X_img[:TEST_LIMIT]]

In [None]:
train_y_small = train_y[:TRAIN_LIMIT]
test_y_small = test_y[:TEST_LIMIT]

# Logistic Regression on Deep Features

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs')
lr.fit(train_X_small, train_y_small)

In [None]:
train_preds = lr.predict(train_X_small)
(sum(train_y_small) / len(train_y_small),
 sum(train_preds == train_y_small) / len(train_y_small))

In [None]:
test_preds = lr.predict(test_X_small)
(sum(test_y_small) / len(test_y_small),
 sum(test_preds == test_y_small) / len(test_y_small))

In [None]:
idxs=np.argsort(np.abs(lr.coef_[0]))[::-1]

In [None]:
plt.hist(lr.coef_[0])

In [None]:
import pandas as pd
pd.DataFrame({'values': lr.coef_[0][idxs], 'idx': idxs})

# Random Forest on Deep Features

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
rfc.fit(train_X_small, train_y_small)

In [None]:
train_preds = rfc.predict(train_X_small)
(sum(train_y_small) / len(train_y_small),
 sum(train_preds == train_y_small) / len(train_y_small))

In [None]:
test_df = pd.DataFrame(test_X_small)

In [None]:
test_df.head()

In [None]:
test_preds = rfc.predict(test_X_small)
(sum(test_y_small) / len(test_y_small),
 sum(test_preds == test_y_small) / len(test_y_small))

In [None]:
rf_idxs = np.argsort(rfc.feature_importances_)[::-1]

In [None]:
rf_idxs

In [None]:
rfc.feature_importances_[rf_idxs]

In [None]:
permuted_test_df = test_df.copy()
idx_list = rf_idxs[:10]
for idx in idx_list:
    important_feature = test_df.loc[:, idx].copy().values
    np.random.shuffle(important_feature)
    permuted_test_df.loc[:, idx] = important_feature

In [None]:
test_preds = rfc.predict(permuted_test_df)
(sum(test_y_small) / len(test_y_small),
 sum(test_preds == test_y_small) / len(test_y_small))

##### Look at DB names
cur = client.list_databases()

for item in cur:
    print(item)

###### Look at everything in our collection!
cur = coll.find({})

for item in cur:
    print(item)