In [1]:
# # Required installations (run once)
# !brew install wget  # Added by Miles
# !pip install --upgrade --ignore-installed wrapt  # Added by Miles
# !pip install tensorflow==2.0.0-beta0  # Edited by Miles (switch to CPU version)
# !pip install tensorflow_datasets  # Added by Miles

In [88]:
%matplotlib inline
%load_ext autoreload
%autoreload 1
%aimport helper

## Import Libraries

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
keras = tf.keras

import tensorflow_datasets as tfds
tfds.disable_progress_bar()

import pymongo        # MongoDB

from PIL import Image

import helper

### Mongo Keys

In [3]:
# Define path to secret
secret_path = os.path.join(os.environ['HOME'], '.secret', 'mongo.json')

In [4]:
keys = helper.get_keys(secret_path)
mongo_user = keys['user_id']
mongo_pw = keys['password']

## Data download

In [7]:
# from os import walk

image_paths = []
for dirpath, dirnames, filenames in os.walk('test_data'):
    
    for ff in filenames:
        if ff[:1] != '.':
            curr_path = os.path.join('.',dirpath, ff)
#             print(curr_path)  
            image_paths.append(curr_path)
    
image_paths

['./test_data/brian/6.paul-walker-medium.jpg',
 './test_data/brian/11.2.37830931.jpg',
 './test_data/brian/2.paul-walker-21044993-1-402.jpg',
 './test_data/brian/9.66892393.jpg',
 './test_data/brian/1.maxresdefault.jpg',
 './test_data/brian/12.gettyimages-168243951.jpg',
 './test_data/brian/5._102791014_gettyimages-164559639.jpg',
 './test_data/brian/8.paul-walker.jpg',
 './test_data/brian/3.220px-PaulWalkerEdit-1.jpg',
 './test_data/brian/4.MV5BMjIwODc0OTk2Nl5BMl5BanBnXkFtZTcwOTQ5MDA0Mg@@._V1_UX214_CR0,0,214,317_AL_.jpg',
 './test_data/hobbs/3.dwayne-johnson-11818916-1-402.jpg',
 './test_data/hobbs/5.416x416.jpg',
 './test_data/hobbs/2.MV5BMTkyNDQ3NzAxM15BMl5BanBnXkFtZTgwODIwMTQ0NTE@._V1_.jpg',
 './test_data/hobbs/4.5b462c57f4af9c1a008b45eb-750-563.jpg',
 './test_data/hobbs/8.the-rock-1.jpg',
 './test_data/hobbs/7.190127_3871676_Dwayne__The_Rock__Johnson_Wants_To_Be_In__Cr_800x450_1432791619872.jpg',
 './test_data/hobbs/1.220px-Dwayne_Johnson_2%2C_2013.jpg',
 './test_data/hobbs/9.MV5B

IMG_SHAPE = (IMG_SIZE, IMG_SIZE, 3)

# Create the base model from the pre-trained model MobileNet V2
base_model = tf.keras.applications.MobileNetV2(input_shape=IMG_SHAPE,
                                               include_top=False,
                                               weights='imagenet')

global_average_layer = tf.keras.layers.GlobalAveragePooling2D()

model = tf.keras.Sequential([
  base_model,
  global_average_layer,
])

def extract_features(image):
    """Return a vector of 1280 deep features for image."""
    image_resized = prepare_image(image)
    image_np = image_resized.numpy()
    images_np = np.expand_dims(image_np, axis=0)
    image_np.shape, images_np.shape
    deep_features = model.predict(images_np)
    return deep_features[0]

extract_features(image)

## Create Class

In [133]:
from sklearn.base import BaseEstimator, TransformerMixin

class ImageFeatureExtractor(BaseEstimator, TransformerMixin):
    """Extracts deep features from images."""
    
    def __init__(self, model="MobileNetV2", height=160, width=160):
        """Creates an ImageFeatureExtractor using the specified model."""
        self.height, self.width = height, width
        if model == "MobileNetV2":
            base_model = tf.keras.applications.MobileNetV2(
                input_shape=(height, width, 3),
               include_top=False,
               weights='imagenet'
            )
        else:
            raise Exception("Model unknown")
        global_average_layer = tf.keras.layers.GlobalAveragePooling2D()
        self.model = tf.keras.Sequential([base_model, global_average_layer])
    
    def fit(self, X, y):
        """We're using a pre-trained model, so there's nothing to fit."""
        pass
    
    def transform(self, X):
        """Transforms image file paths into Numpy arrays of deep features."""
        result = []
        for image_pathname in X:
            result.append(self._transform_one(image_pathname))
#         print(len(result))
#         return np.array(result)
        return result
    
    def _transform_one(self, image_pathname):
        """Transforms a single image pathname into deep features."""
        img = Image.open(image_pathname)
        img.load()
        image = np.asarray(img)
        return self._extract_features(image)
    
    def _extract_features(self, image):
        """Return a vector of 1280 deep features for image."""
        image_resized = prepare_image(image)
        image_np = image_resized.numpy()
        images_np = np.expand_dims(image_np, axis=0)
        image_np.shape, images_np.shape
        deep_features = self.model.predict(images_np)
        return deep_features[0]
    
    def _prepare_image(self, image):
        """Converts an image to the expected format for prediction."""
        image = tf.cast(image, tf.float32)
        image = (image/127.5) - 1
        image = tf.image.resize(image, (self.height, self.width))
        return image

    def fetch_image_from_s3(self, bucket, key):             #Anna's Code
        """Fetches an image from S3 and returns a numpy array."""
        s3 = boto3.client('s3')
        response = s3.get_object(Bucket=bucket, Key=key)
        body = response['Body']
        data = body.read()    
        f = BytesIO(data)
        image = Image.open(f)   
        image_data = np.asarray(image)
        return image_data

### Get Features

In [134]:
# Instantiate Class
extractor = ImageFeatureExtractor()

In [135]:
# Store Features - list of arrays
features = extractor.transform(image_paths)

In [136]:
# Turn into list of lists because easier with MongoDB
features_list = [feature.tolist() for feature in features]

In [137]:
# Zip them so can iterate through them
zipped_imgs = list(zip(image_paths,features_list))

In [138]:
# Create list of dictionaries; so can be ingested by MongoDB
list_of_dicts = [{'url': img[0], 'features':img[1]} for img in zipped_imgs]

### Upload results to MongoDB

In [139]:
# Instantiate client
client = pymongo.MongoClient("mongodb+srv://" + mongo_user + ":" 
                         + mongo_pw 
                         + "@dsaf-oy1s0.mongodb.net/test?retryWrites=true")


In [140]:
# Get DB, Collection
db = client['furious']
coll = db['images']

In [141]:
# Wipe collection to start fresh
coll.delete_many({})

<pymongo.results.DeleteResult at 0x157cb6b08>

In [142]:
# Insert Results
coll.insert_many(list_of_dicts)

<pymongo.results.InsertManyResult at 0x1502fbd48>

In [143]:
features_returned = [np.array(x['features']) for x in coll.find()]

In [144]:
features_returned

[array([0.45510256, 0.        , 0.        , ..., 0.41784862, 0.        ,
        0.01271284]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0.07392223, 0.        , 0.00697432, ..., 0.        , 0.        ,
        0.        ]),
 array([0.2498211 , 0.        , 0.17990871, ..., 0.        , 0.        ,
        0.        ]),
 array([0.93050528, 0.        , 0.83294296, ..., 0.09675258, 0.        ,
        0.        ]),
 array([0.16764668, 0.        , 0.        , ..., 0.00537357, 0.        ,
        0.        ]),
 array([0.01346662, 0.        , 0.39356804, ..., 0.        , 0.        ,
        0.        ]),
 array([0.93678182, 0.2793051 , 0.        , ..., 0.        , 0.        ,
        0.        ]),
 array([0.13647854, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]),
 array([0.12628074, 0.        , 0.00428738, ..., 0.        , 0.        ,
        0.        ]),
 array([0.04088522, 0.        , 0.23422989, ..., 0.        , 0.        ,
        0.        ]),
 array([0.5

# LSM Random Forests

In [53]:
target = ['brian', 'brian', 'brian', 'brian', 'brian', 'brian', 'brian', 'brian', 'brian', 'brian', 
          'hobbs', 'hobbs', 'hobbs', 'hobbs', 'hobbs', 'hobbs', 'hobbs', 'hobbs', 'hobbs']

In [54]:
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    target, 
                                                    test_size = 0.25,
                                                    random_state=123)

### Regular Tree

In [65]:
tree_clf = DecisionTreeClassifier(criterion = "gini", max_depth = 5) 
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [58]:
tree_clf.feature_importances_

array([0., 0., 0., ..., 0., 0., 0.])

In [64]:
pred = tree_clf.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[1 2]
 [0 2]]
              precision    recall  f1-score   support

       brian       1.00      0.33      0.50         3
       hobbs       0.50      1.00      0.67         2

   micro avg       0.60      0.60      0.60         5
   macro avg       0.75      0.67      0.58         5
weighted avg       0.80      0.60      0.57         5



### Random Forest

In [75]:
forest = RandomForestClassifier(n_estimators=20, max_depth= 5)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [76]:
forest.score(X_train, y_train)

1.0

In [78]:
forest.score(X_test, y_test)

1.0

#### This is likely not right.... 

In [145]:
len(features_returned)

19

![](https://images.unsplash.com/photo-1518137569197-67c41a95d8de?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9&auto=format&fit=crop&w=2560&q=80)

# Graveyard

----
### _We can do train / test split before we push through classification models?_

In [None]:
train_images = [(img.numpy(), label.numpy())
                for (img, label) in raw_train.take(10000)]

In [None]:
test_images = [(img.numpy(), label.numpy())
               for (img, label) in raw_test.take(1000)]

In [None]:
plt.imshow(train_images[0][0])
plt.title(get_label_name(train_images[0][1]))

In [None]:
train_X_img, train_y = zip(*train_images)
test_X_img, test_y = zip(*test_images)

# Extract features from some images

In [None]:
TRAIN_LIMIT = 1000
TEST_LIMIT = 200

In [None]:
train_X_small = [extract_features(img)
                 for img in train_X_img[:TRAIN_LIMIT]]
test_X_small = [extract_features(img)
                for img in test_X_img[:TEST_LIMIT]]

In [None]:
train_y_small = train_y[:TRAIN_LIMIT]
test_y_small = test_y[:TEST_LIMIT]

# Logistic Regression on Deep Features

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs')
lr.fit(train_X_small, train_y_small)

In [None]:
train_preds = lr.predict(train_X_small)
(sum(train_y_small) / len(train_y_small),
 sum(train_preds == train_y_small) / len(train_y_small))

In [None]:
test_preds = lr.predict(test_X_small)
(sum(test_y_small) / len(test_y_small),
 sum(test_preds == test_y_small) / len(test_y_small))

In [None]:
idxs=np.argsort(np.abs(lr.coef_[0]))[::-1]

In [None]:
plt.hist(lr.coef_[0])

In [None]:
import pandas as pd
pd.DataFrame({'values': lr.coef_[0][idxs], 'idx': idxs})

# Random Forest on Deep Features

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
rfc.fit(train_X_small, train_y_small)

In [None]:
train_preds = rfc.predict(train_X_small)
(sum(train_y_small) / len(train_y_small),
 sum(train_preds == train_y_small) / len(train_y_small))

In [None]:
test_df = pd.DataFrame(test_X_small)

In [None]:
test_df.head()

In [None]:
test_preds = rfc.predict(test_X_small)
(sum(test_y_small) / len(test_y_small),
 sum(test_preds == test_y_small) / len(test_y_small))

In [None]:
rf_idxs = np.argsort(rfc.feature_importances_)[::-1]

In [None]:
rf_idxs

In [None]:
rfc.feature_importances_[rf_idxs]

In [None]:
permuted_test_df = test_df.copy()
idx_list = rf_idxs[:10]
for idx in idx_list:
    important_feature = test_df.loc[:, idx].copy().values
    np.random.shuffle(important_feature)
    permuted_test_df.loc[:, idx] = important_feature

In [None]:
test_preds = rfc.predict(permuted_test_df)
(sum(test_y_small) / len(test_y_small),
 sum(test_preds == test_y_small) / len(test_y_small))

##### Look at DB names
cur = client.list_databases()

for item in cur:
    print(item)

###### Look at everything in our collection!
cur = coll.find({})

for item in cur:
    print(item)