In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [6]:
hotdog_data = pd.read_csv('hotdog.csv', sep=',', engine='c', dtype='uint8')
not_hotdog_data = pd.read_csv('not_hotdog.csv', sep=',', engine='c', dtype='uint8')

In [7]:
import img_processing

In [8]:
def get_hotdog_class_dirs(root_dir):
    return os.path.join(root_dir, 'hotdog'), os.path.join(root_dir, 'not_hotdog')

In [11]:
import os
dataset_folder = os.path.join(os.environ['HOME'], 'hse/data/hotdogs_dataset')
test_folder = os.path.join(dataset_folder, 'test')
train_folder = os.path.join(dataset_folder, 'train')
hotdogs_folder, not_hotdogs_folder = get_hotdog_class_dirs(train_folder)

In [13]:
def to_array_multi(images):
    """convert each image into an array and return a list of those"""
    names = []
    data = []
    for name, img in images:
        img.load()
        arr = np.asarray(img, dtype="uint8")
        arr = arr.reshape(-1)
        names.append(name)
        data.append(arr)
    return names, data

In [14]:
hotdog_images, hotdog_errors = img_processing.preprocess_images(hotdogs_folder, (224, 224))
not_hotdog_images, not_hotdog_errors = img_processing.preprocess_images(not_hotdogs_folder, (224, 224))

hotdog_names, hotdog_arrays = to_array_multi(hotdog_images)
not_hotdog_names, not_hotdog_arrays = to_array_multi(not_hotdog_images)

In [27]:
hotdog_labels = pd.Series([1]*len(hotdog_arrays), name='label')
not_hotdog_labels = pd.Series([0]*len(not_hotdog_arrays), name='label')

In [39]:
# hotdogs_df = pd.concat([
#         pd.Series(hotdog_names),
#         hotdog_labels,
#         hotdog_arrays
#     ],
#     sort=False,
#     axis=1
# )

# not_hotdogs_df = pd.concat([
#         pd.Series(not_hotdog_names),
#         not_hotdog_labels,
#         not_hotdog_arrays
#     ],
#     sort=False,
#     axis=1
# )
h_names = pd.Series(hotdog_names, name='name')
nh_names = pd.Series(not_hotdog_names, name='name')
# hotdogs_df = hotdogs_df.append(hotdog_labels)
# hotdogs_df.append([pd.Series(hotdog_names), hotdog_labels, hotdog_arrays])

# not_hotdogs_df = pd.DataFrame()
# not_hotdogs_df.append([pd.Series(not_hotdog_names), not_hotdog_labels, not_hotdog_arrays])

In [40]:
hotdog_df = pd.concat([hotdog_labels, h_names, pd.DataFrame(hotdog_arrays)], sort=False, axis=1)
hotdog_df.head()

Unnamed: 0,label,name,0,1,2,3,4,5,6,7,...,150518,150519,150520,150521,150522,150523,150524,150525,150526,150527
0,1,00651.jpg,18,18,26,15,15,23,15,16,...,0,0,0,0,0,0,0,0,0,0
1,1,00337.jpg,37,13,0,42,15,2,41,13,...,0,0,0,0,0,0,0,0,0,0
2,1,00287.jpg,8,8,8,8,8,8,9,9,...,0,0,0,0,0,0,0,0,0,0
3,1,00506.jpg,247,224,244,245,223,242,244,223,...,0,0,0,0,0,0,0,0,0,0
4,1,00064.jpg,214,136,139,195,115,118,195,115,...,0,0,0,0,0,0,0,0,0,0


In [41]:
not_hotdog_df = pd.concat([not_hotdog_labels, nh_names, pd.DataFrame(not_hotdog_arrays)], sort=False, axis=1)
not_hotdog_df.head()

Unnamed: 0,label,name,0,1,2,3,4,5,6,7,...,150518,150519,150520,150521,150522,150523,150524,150525,150526,150527
0,0,00651.jpg,113,108,73,91,85,45,84,54,...,0,0,0,0,0,0,0,0,0,0
1,0,00825.jpg,73,66,41,66,54,34,59,44,...,0,0,0,0,0,0,0,0,0,0
2,0,00337.jpg,2,1,0,2,1,0,3,1,...,0,0,0,0,0,0,0,0,0,0
3,0,00287.jpg,194,205,247,193,207,249,194,207,...,0,0,0,0,0,0,0,0,0,0
4,0,00506.jpg,255,255,255,255,255,255,255,255,...,0,0,0,0,0,0,0,0,0,0


In [42]:
hotdog_df.to_csv('hotdog_df.csv', sep=',')
not_hotdog_df.to_csv('not_hotdog_df.csv', sep=',')

In [43]:
train_df = pd.concat([hotdog_df, not_hotdog_df])
train_df.head()

In [2]:
train_df = pd.read_csv('hotdog_train.csv', sep=',', engine='c', dtype=np.int8)
# train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df.to_csv('hotdog_train.csv', sep=',')
train_df.head()

Unnamed: 0.1,Unnamed: 0,label,name,0,1,2,3,4,5,6,...,150518,150519,150520,150521,150522,150523,150524,150525,150526,150527
0,943,0,00298.jpg,255,255,255,255,255,255,255,...,0,0,0,0,0,0,0,0,0,0
1,516,1,00100.jpg,1,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,0,0
2,220,0,00948.jpg,72,57,41,68,59,40,70,...,0,0,0,0,0,0,0,0,0,0
3,669,0,00672.jpg,42,61,32,41,60,30,41,...,0,0,0,0,0,0,0,0,0,0
4,626,1,00691.jpg,182,159,121,189,168,131,197,...,0,0,0,0,0,0,0,0,0,0


In [12]:
not_needed_data = pd.DataFrame(train_df[['label', 'name']])
not_needed_data.head()

Unnamed: 0,label,name
0,0,00298.jpg
1,1,00100.jpg
2,0,00948.jpg
3,0,00672.jpg
4,1,00691.jpg


In [13]:
train_df.drop(['label', 'name'], axis=1, inplace=True)
train_df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,150518,150519,150520,150521,150522,150523,150524,150525,150526,150527
0,943,255,255,255,255,255,255,255,255,255,...,0,0,0,0,0,0,0,0,0,0
1,516,1,1,1,1,1,1,2,0,1,...,0,0,0,0,0,0,0,0,0,0
2,220,72,57,41,68,59,40,70,58,41,...,0,0,0,0,0,0,0,0,0,0
3,669,42,61,32,41,60,30,41,60,30,...,0,0,0,0,0,0,0,0,0,0
4,626,182,159,121,189,168,131,197,175,144,...,0,0,0,0,0,0,0,0,0,0


In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
import scipy as sp

In [15]:
clf = SGDClassifier()
search = RandomizedSearchCV(
    clf,
    param_distributions={
        'alpha': sp.stats.uniform(loc=0.0001, scale=1),
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        'penalty': [None, 'l2', 'l1', 'elasticnet']
    },
    cv=None,
#     n_jobs=3
)

In [16]:
search.fit(train_df, not_needed_data['label'])





RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc52cf44438>, 'penalty': [None, 'l2', 'l1', 'elasticnet'], 'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [19]:
best_cv_err = 1 - search.best_score_
print(best_cv_err)
print(search.best_params_)

0.2716935966487134
{'alpha': 0.00539279331424104, 'penalty': None, 'loss': 'squared_hinge'}


In [18]:
print(search.best_params_)

{'alpha': 0.00539279331424104, 'penalty': None, 'loss': 'squared_hinge'}
