# Hotdog classification: Machine Learning methods

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
import img_processing

In [3]:
def get_hotdog_class_dirs(root_dir):
    return os.path.join(root_dir, 'hotdog'), os.path.join(root_dir, 'not_hotdog')

In [7]:
import sys
sys.platform

'win32'

In [8]:
import os
dataset_folder = ''
if sys.platform == 'win32':
    dataset_folder = '_data\hotdogs_dataset'
else:
    dataset_folder = os.path.join(os.environ['HOME'], 'hse/data/hotdogs_dataset')
test_folder = os.path.join(dataset_folder, 'test')
train_folder = os.path.join(dataset_folder, 'train')
hotdogs_folder, not_hotdogs_folder = get_hotdog_class_dirs(train_folder)

In [9]:
def to_array_multi(images):
    """convert each image into an array and return a list of those"""
    names = []
    data = []
    for name, img in images:
        img.load()
        arr = np.asarray(img, dtype="uint8")
        arr = arr.reshape(-1)
        names.append(name)
        data.append(arr)
    return names, data

In [10]:
crop_size = (50, 50)

### Image processing

Используется resize изображения с сохранением aspect ratio, затем вырезается центральная часть изображения

In [11]:
hotdog_images, hotdog_errors = img_processing.preprocess_images(hotdogs_folder, (50, 50))
not_hotdog_images, not_hotdog_errors = img_processing.preprocess_images(not_hotdogs_folder, (50, 50))

hotdog_names, hotdog_arrays = to_array_multi(hotdog_images)
not_hotdog_names, not_hotdog_arrays = to_array_multi(not_hotdog_images)

In [12]:
hotdog_labels = pd.Series([1]*len(hotdog_arrays), name='label')
not_hotdog_labels = pd.Series([0]*len(not_hotdog_arrays), name='label')

In [13]:
# hotdogs_df = pd.concat([
#         pd.Series(hotdog_names),
#         hotdog_labels,
#         hotdog_arrays
#     ],
#     sort=False,
#     axis=1
# )

# not_hotdogs_df = pd.concat([
#         pd.Series(not_hotdog_names),
#         not_hotdog_labels,
#         not_hotdog_arrays
#     ],
#     sort=False,
#     axis=1
# )
h_names = pd.Series(hotdog_names, name='name')
nh_names = pd.Series(not_hotdog_names, name='name')
# hotdogs_df = hotdogs_df.append(hotdog_labels)
# hotdogs_df.append([pd.Series(hotdog_names), hotdog_labels, hotdog_arrays])

# not_hotdogs_df = pd.DataFrame()
# not_hotdogs_df.append([pd.Series(not_hotdog_names), not_hotdog_labels, not_hotdog_arrays])

#### Image size - 224x224

In [40]:
hotdog_df = pd.concat([hotdog_labels, h_names, pd.DataFrame(hotdog_arrays)], sort=False, axis=1)
hotdog_df.head()

Unnamed: 0,label,name,0,1,2,3,4,5,6,7,...,150518,150519,150520,150521,150522,150523,150524,150525,150526,150527
0,1,00651.jpg,18,18,26,15,15,23,15,16,...,0,0,0,0,0,0,0,0,0,0
1,1,00337.jpg,37,13,0,42,15,2,41,13,...,0,0,0,0,0,0,0,0,0,0
2,1,00287.jpg,8,8,8,8,8,8,9,9,...,0,0,0,0,0,0,0,0,0,0
3,1,00506.jpg,247,224,244,245,223,242,244,223,...,0,0,0,0,0,0,0,0,0,0
4,1,00064.jpg,214,136,139,195,115,118,195,115,...,0,0,0,0,0,0,0,0,0,0


In [41]:
not_hotdog_df = pd.concat([not_hotdog_labels, nh_names, pd.DataFrame(not_hotdog_arrays)], sort=False, axis=1)
not_hotdog_df.head()

Unnamed: 0,label,name,0,1,2,3,4,5,6,7,...,150518,150519,150520,150521,150522,150523,150524,150525,150526,150527
0,0,00651.jpg,113,108,73,91,85,45,84,54,...,0,0,0,0,0,0,0,0,0,0
1,0,00825.jpg,73,66,41,66,54,34,59,44,...,0,0,0,0,0,0,0,0,0,0
2,0,00337.jpg,2,1,0,2,1,0,3,1,...,0,0,0,0,0,0,0,0,0,0
3,0,00287.jpg,194,205,247,193,207,249,194,207,...,0,0,0,0,0,0,0,0,0,0
4,0,00506.jpg,255,255,255,255,255,255,255,255,...,0,0,0,0,0,0,0,0,0,0


#### Image size - 50x50

In [18]:
hotdog_df = pd.concat([hotdog_labels, h_names, pd.DataFrame(hotdog_arrays)], sort=False, axis=1)
hotdog_df.head()

Unnamed: 0,label,name,0,1,2,3,4,5,6,7,...,7490,7491,7492,7493,7494,7495,7496,7497,7498,7499
0,1,00000.jpg,109,32,1,114,35,2,120,38,...,0,0,0,0,0,0,0,0,0,0
1,1,00001.jpg,142,156,168,159,170,178,168,178,...,0,0,0,0,0,0,0,0,0,0
2,1,00002.jpg,79,81,86,82,84,89,94,96,...,0,0,0,0,0,0,0,0,0,0
3,1,00003.jpg,80,114,131,118,90,53,146,137,...,0,0,0,0,0,0,0,0,0,0
4,1,00004.jpg,138,163,212,120,142,179,124,152,...,0,0,0,0,0,0,0,0,0,0


In [19]:
not_hotdog_df = pd.concat([not_hotdog_labels, nh_names, pd.DataFrame(not_hotdog_arrays)], sort=False, axis=1)
not_hotdog_df.head()

Unnamed: 0,label,name,0,1,2,3,4,5,6,7,...,7490,7491,7492,7493,7494,7495,7496,7497,7498,7499
0,0,00000.jpg,147,133,119,135,120,111,142,125,...,0,0,0,0,0,0,0,0,0,0
1,0,00001.jpg,101,110,102,165,164,156,182,184,...,0,0,0,0,0,0,0,0,0,0
2,0,00002.jpg,28,31,22,20,22,17,28,29,...,0,0,0,0,0,0,0,0,0,0
3,0,00003.jpg,20,30,35,36,55,60,34,48,...,0,0,0,0,0,0,0,0,0,0
4,0,00004.jpg,28,59,11,46,76,28,31,58,...,0,0,0,0,0,0,0,0,0,0


In [20]:
hotdog_df.to_csv('hotdog_df.csv', sep=',')
not_hotdog_df.to_csv('not_hotdog_df.csv', sep=',')

In [21]:
train_df = pd.concat([hotdog_df, not_hotdog_df])
train_df.head()

Unnamed: 0,label,name,0,1,2,3,4,5,6,7,...,7490,7491,7492,7493,7494,7495,7496,7497,7498,7499
0,1,00000.jpg,109,32,1,114,35,2,120,38,...,0,0,0,0,0,0,0,0,0,0
1,1,00001.jpg,142,156,168,159,170,178,168,178,...,0,0,0,0,0,0,0,0,0,0
2,1,00002.jpg,79,81,86,82,84,89,94,96,...,0,0,0,0,0,0,0,0,0,0
3,1,00003.jpg,80,114,131,118,90,53,146,137,...,0,0,0,0,0,0,0,0,0,0
4,1,00004.jpg,138,163,212,120,142,179,124,152,...,0,0,0,0,0,0,0,0,0,0


#### Image size - 224x224

In [2]:
train_df = pd.read_csv('hotdog_train.csv', sep=',', engine='c', dtype=np.int8)
# train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df.to_csv('hotdog_train.csv', sep=',')
train_df.head()

Unnamed: 0.1,Unnamed: 0,label,name,0,1,2,3,4,5,6,...,150518,150519,150520,150521,150522,150523,150524,150525,150526,150527
0,943,0,00298.jpg,255,255,255,255,255,255,255,...,0,0,0,0,0,0,0,0,0,0
1,516,1,00100.jpg,1,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,0,0
2,220,0,00948.jpg,72,57,41,68,59,40,70,...,0,0,0,0,0,0,0,0,0,0
3,669,0,00672.jpg,42,61,32,41,60,30,41,...,0,0,0,0,0,0,0,0,0,0
4,626,1,00691.jpg,182,159,121,189,168,131,197,...,0,0,0,0,0,0,0,0,0,0


#### Image size - 50x50

In [23]:
# train_df = pd.read_csv('hotdog_train.csv', sep=',', engine='c', dtype=np.int8)
train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df.to_csv('hotdog_train.csv', sep=',')
train_df.head()

Unnamed: 0,label,name,0,1,2,3,4,5,6,7,...,7490,7491,7492,7493,7494,7495,7496,7497,7498,7499
0,0,00868.jpg,155,177,211,100,145,201,99,142,...,0,0,0,0,0,0,0,0,0,0
1,1,00345.jpg,168,163,149,239,233,207,181,110,...,0,0,0,0,0,0,0,0,0,0
2,1,00456.jpg,136,115,95,156,133,110,175,147,...,0,0,0,0,0,0,0,0,0,0
3,0,00379.jpg,73,78,69,83,96,83,106,109,...,0,0,0,0,0,0,0,0,0,0
4,0,00951.jpg,136,141,136,155,169,157,134,158,...,0,0,0,0,0,0,0,0,0,0


In [24]:
not_needed_data = pd.DataFrame(train_df[['label', 'name']])
not_needed_data.head()

Unnamed: 0,label,name
0,0,00868.jpg
1,1,00345.jpg
2,1,00456.jpg
3,0,00379.jpg
4,0,00951.jpg


In [25]:
train_df.drop(['label', 'name'], axis=1, inplace=True)
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7490,7491,7492,7493,7494,7495,7496,7497,7498,7499
0,155,177,211,100,145,201,99,142,200,91,...,0,0,0,0,0,0,0,0,0,0
1,168,163,149,239,233,207,181,110,70,248,...,0,0,0,0,0,0,0,0,0,0
2,136,115,95,156,133,110,175,147,119,183,...,0,0,0,0,0,0,0,0,0,0
3,73,78,69,83,96,83,106,109,95,122,...,0,0,0,0,0,0,0,0,0,0
4,136,141,136,155,169,157,134,158,137,132,...,0,0,0,0,0,0,0,0,0,0


In [105]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import SGDClassifier
import scipy as sp

## Classifier - SGD

In [145]:
def random_search_cv():
    return RandomizedSearchCV(
        SGDClassifier(),
        param_distributions = {
            'alpha': sp.stats.uniform(loc=0.0001, scale=1),
            'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
            'penalty': [None, 'l2', 'l1', 'elasticnet']
        },
        cv=None,
    #     n_jobs=3
    )
def grid_search_cv():
    return GridSearchCV(
        SGDClassifier(),
        param_grid = {
            'alpha': np.arange(0.1, 1, 0.05),
            'loss': ['squared_hinge', 'log', 'hinge'],
            'penalty': [None, 'elasticnet']
        }
    )

#### Image size - 224x224

In [16]:
search = random_search_cv()
search.fit(train_df, not_needed_data['label'])





RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc52cf44438>, 'penalty': [None, 'l2', 'l1', 'elasticnet'], 'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [19]:
best_cv_err = 1 - search.best_score_
print(best_cv_err)
print(search.best_params_)

0.2716935966487134
{'alpha': 0.00539279331424104, 'penalty': None, 'loss': 'squared_hinge'}


In [18]:
print(search.best_params_)

{'alpha': 0.00539279331424104, 'penalty': None, 'loss': 'squared_hinge'}


#### Image size - 50x50

In [146]:
search = grid_search_cv()
search.fit(train_df, not_needed_data['label'])







































GridSearchCV(cv=None, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': array([0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55, 0.6 ,
       0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95]), 'loss': ['squared_hinge', 'log', 'hinge'], 'penalty': [None, 'elasticnet']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [147]:
best_cv_err = 1 - search.best_score_
print(best_cv_err)
print(search.best_params_)

0.25249169435215946
{'alpha': 0.1, 'loss': 'log', 'penalty': 'elasticnet'}


In [148]:
print(search.best_params_)

{'alpha': 0.1, 'loss': 'log', 'penalty': 'elasticnet'}


#### 224x224 vs 50x50

Можно заметить, что ошибка на кросс-валидации изменилась не сильно, будем использовать картинки размером 50х50

### PCA

In [149]:
from sklearn.decomposition import PCA
image_reducer = PCA()

In [150]:
transformed_train_df = pd.DataFrame(image_reducer.fit_transform(train_df))
transformed_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1495,1496,1497,1498,1499,1500,1501,1502,1503,1504
0,-1000.319381,-104.35701,-775.078442,19.533003,380.114107,137.821136,-1102.142154,-456.246727,463.448389,-197.22076,...,-0.022179,-0.01021,0.015557,-0.011226,0.007567,9.335445e-13,5.175268e-16,2.682945e-15,-1.03061e-16,5.311793e-15
1,-1854.850627,854.493658,-53.246104,-828.472543,-1181.339175,355.195868,-314.67648,-272.358877,-440.783663,-237.924586,...,-0.057086,-0.118309,0.089992,0.071827,0.081032,9.335445e-13,5.175268e-16,2.682945e-15,-1.03061e-16,5.311793e-15
2,-1468.972998,-144.412262,-624.076097,36.258713,353.180214,-110.260412,-14.831683,-139.575504,-234.633508,-222.418884,...,-0.600347,0.328553,-0.24334,0.112752,-0.093124,9.335445e-13,5.175268e-16,2.682945e-15,-1.03061e-16,5.311793e-15
3,-1594.726223,-249.59165,-484.967277,125.719217,400.727872,-208.614868,319.365726,160.136914,-34.494662,8.320724,...,-0.09665,0.160322,0.047497,0.022003,-0.032355,9.335445e-13,5.175268e-16,2.682945e-15,-1.03061e-16,5.311793e-15
4,2782.968557,-756.174775,376.618702,-23.42416,-41.454873,-628.374084,-213.997807,-125.694587,-205.75223,792.312813,...,-0.001142,-0.000801,-0.000793,-0.000936,0.001186,9.335445e-13,5.175268e-16,2.682945e-15,-1.03061e-16,5.311793e-15


In [151]:
print('Original: {0} | Reduced: {1}'.format(train_df.shape, transformed_train_df.shape))

Original: (1505, 7500) | Reduced: (1505, 1505)


#### PCA результаты

In [89]:
image_reducer.components_

array([[ 0.00134617,  0.0016262 ,  0.0018337 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.02552708,  0.02698564,  0.02908832, ..., -0.        ,
        -0.        , -0.        ],
       [-0.02242583, -0.02237634, -0.02329015, ..., -0.        ,
        -0.        , -0.        ],
       ...,
       [ 0.00068649, -0.01005893,  0.01104631, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00050356,  0.00073526, -0.00280907, ...,  0.        ,
         0.        ,  0.        ],
       [-0.00142071,  0.00808851, -0.00733932, ..., -0.        ,
        -0.        , -0.        ]])

In [152]:
import numpy as np
np.set_printoptions(threshold=np.nan)
variance = np.array(image_reducer.explained_variance_)
variance

array([4.42910916e+06, 1.51129958e+06, 5.15741639e+05, 4.88058486e+05,
       3.02438591e+05, 2.53417331e+05, 2.17576831e+05, 1.82101333e+05,
       1.70719594e+05, 1.55955055e+05, 1.33421383e+05, 1.19490485e+05,
       9.55410410e+04, 8.81997311e+04, 8.14564057e+04, 7.44386660e+04,
       6.55389944e+04, 6.25933994e+04, 5.85137099e+04, 5.80520479e+04,
       5.22547882e+04, 4.82028581e+04, 4.41371624e+04, 4.28686061e+04,
       4.10708663e+04, 3.60261751e+04, 3.52176308e+04, 3.28491950e+04,
       3.14933264e+04, 3.02789354e+04, 2.95486607e+04, 2.86849052e+04,
       2.77496413e+04, 2.58438828e+04, 2.49511451e+04, 2.38210762e+04,
       2.33667006e+04, 2.21127032e+04, 2.13629153e+04, 2.06656313e+04,
       2.03117786e+04, 1.95180293e+04, 1.82465897e+04, 1.76456772e+04,
       1.70783506e+04, 1.63547993e+04, 1.57705723e+04, 1.55227986e+04,
       1.51463603e+04, 1.45012151e+04, 1.41910250e+04, 1.39326654e+04,
       1.37702477e+04, 1.35729089e+04, 1.32457657e+04, 1.27526209e+04,
      

In [153]:
elite = [v for v in variance if v >= np.mean(variance) / 3]
print(elite)
print(len(elite))

[4429109.162933712, 1511299.5770482507, 515741.63925116096, 488058.48562670156, 302438.5908433855, 253417.33084050508, 217576.83054984896, 182101.33312079517, 170719.59412648945, 155955.05526668252, 133421.383315154, 119490.48547895903, 95541.04103002066, 88199.73112532115, 81456.40565508552, 74438.66600973147, 65538.99435770535, 62593.399420436304, 58513.70994701576, 58052.04793738648, 52254.78821904581, 48202.85809919451, 44137.16238123179, 42868.606091126974, 41070.86627660455, 36026.17512680676, 35217.630770815944, 32849.19495724299, 31493.326410736467, 30278.93543144773, 29548.660708256448, 28684.905248966657, 27749.641318574366, 25843.88282622513, 24951.145052464006, 23821.07623908292, 23366.700590604687, 22112.703183463476, 21362.91527950835, 20665.631251694118, 20311.778550712403, 19518.029327788656, 18246.589675450097, 17645.67718101539, 17078.350635556995, 16354.799314860866, 15770.572270522489, 15522.798587871495, 15146.36029723542, 14501.21510156616, 14191.024954226801, 139

In [154]:
np.mean(elite)

65450.35267081308

### Post-PCA cross-validation

In [155]:
search = grid_search_cv()
search.fit(transformed_train_df, not_needed_data['label'])
best_cv_err = 1 - search.best_score_





































In [156]:
print(best_cv_err)
print(search.best_params_)

0.2710963455149502
{'alpha': 0.1, 'loss': 'log', 'penalty': 'elasticnet'}


### Reducing further

In [157]:
image_reducer = PCA(n_components=len(elite))
hard_reduced_train_df = pd.DataFrame(image_reducer.fit_transform(train_df))

In [158]:
hard_reduced_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,151,152,153,154,155,156,157,158,159,160
0,-1000.319381,-104.35701,-775.078442,19.533003,380.114107,137.821136,-1102.142154,-456.246727,463.448389,-197.22076,...,15.565361,2.135367,16.432858,55.58144,36.338159,-49.698193,-82.589426,-8.67238,-31.974309,-42.712474
1,-1854.850627,854.493658,-53.246104,-828.472543,-1181.339175,355.195868,-314.67648,-272.358877,-440.783663,-237.924586,...,99.169615,-30.498724,-3.474285,-80.429347,18.249458,-2.585613,14.754721,34.763398,42.991353,-11.190567
2,-1468.972998,-144.412262,-624.076097,36.258713,353.180214,-110.260412,-14.831683,-139.575504,-234.633508,-222.418884,...,12.784537,-2.413818,33.978231,-3.44552,-17.007542,6.93134,12.550975,26.079426,2.60448,-15.271528
3,-1594.726223,-249.59165,-484.967277,125.719217,400.727872,-208.614868,319.365726,160.136914,-34.494662,8.320723,...,-33.89127,-2.36548,11.648025,0.128648,-56.402681,5.309051,-18.420711,-34.889198,26.390614,16.710062
4,2782.968557,-756.174775,376.618702,-23.42416,-41.454873,-628.374084,-213.997807,-125.694587,-205.75223,792.312814,...,-36.720187,51.857059,-84.647385,-45.776226,-83.332638,19.103726,-34.814087,-66.397862,-45.468091,-17.624687


In [159]:
search = grid_search_cv()
search.fit(hard_reduced_train_df, not_needed_data['label'])
best_cv_err = 1 - search.best_score_

























In [160]:
print(best_cv_err)
print(search.best_params_)

0.26112956810631227
{'alpha': 0.5500000000000002, 'loss': 'squared_hinge', 'penalty': 'elasticnet'}


### Выводы для данной задачи

1) Лучше использовать малый размер изображения

2) Не стоит использовать конверсию в gray-scale <=> Хотдоги должны быть красные (ошибка кросс-валидации выше при gray-scale: для изображений 224х224 = ~0.35)

3) PCA практически не влияет на результат, при этом выкидывая большую часть пикселей, что довольно неплохо для production модели

4) Можно уменьшить размерность вектора признаков еще сильнее, точность слегка улучшиться по сравнению с обычным PCA