In [3]:
import pandas as pd
from glob import glob
import os
from PIL import Image
import numpy as np
import random

In [4]:
from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input, decode_predictions
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

random.seed(34)

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
def read_dataset(path):
    X = []
    y = []
    
    image_paths_list = glob(os.path.join(path, 'train', '*.jpg'))
    image_paths_sample = random.sample(image_paths_list, 1000)

    for image_path in image_paths_sample:
        image_name = os.path.basename(image_path)
        image_name_parts = image_name.split('.')
        label = image_name_parts[0] if len(image_name_parts) == 3 else None

        if label:
            y.append(int(label == 'cat'))
            
        x = image.img_to_array(image.load_img(image_path, target_size=(224, 224)))
        x = preprocess_input(x)
            
        X.append(x)
            
    return np.array(X), y

In [7]:
!unzip -q '/content/drive/MyDrive/Colab Notebooks/train.zip' -d /content

replace /content/train/cat.0.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [8]:
url = "/content"

In [9]:
model = ResNet50(weights='imagenet')

In [10]:
train =[]
test = []
trainAns =[]
testAns = []
for i in range(25):
    print(i)
    data = read_dataset(url)
    X_train, X_test, y_train, y_test = train_test_split(data[0], data[1], test_size=0.30)
    
    preds = model.predict(X_train)
    train.extend(preds)
    
    preds = model.predict(X_test)
    test.extend(preds)
    
    trainAns.extend(y_train)
    testAns.extend(y_test)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


In [11]:
clf = XGBClassifier()

In [12]:
clf.fit(np.array(train),trainAns)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [13]:
clf.evals_result

<bound method XGBClassifier.evals_result of XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)>

In [14]:
pred = clf.predict(test)
accuracy_score(testAns, pred)

0.9905333333333334

In [15]:
# эксперементальным путем получили такие параметры, другие давали хуже результаты
param_dist = { 'num_iterations':1000, 'learning_rate': 0.1}

clf2 = XGBClassifier(**param_dist)

In [16]:
clf2.fit(np.array(train),trainAns)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, num_iterations=1000, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

In [17]:
pred2 = clf2.predict(test)
accuracy_score(testAns, pred2)

0.9905333333333334

In [18]:
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [19]:
# здесь задаются параметры, которые мы изменяем
# параметры не все так как итерация по всем знаимает больше 5 часов
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180
    }

In [20]:
def hyperparameter_tuning(space):
    print(space['max_depth'])
    model = XGBClassifier(n_estimators =space['n_estimators'], 
                          max_depth = int(space['max_depth']), 
                          gamma = space['gamma'],
                          **param_dist)
    evaluation = [( train, trainAns), ( test, testAns)]
    
    model.fit(np.array(train), trainAns,
            eval_set=evaluation, eval_metric="rmse",
            early_stopping_rounds=10,verbose=False)

    pred = model.predict(test)
    accuracy = accuracy_score(testAns, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK, 'model': model}

In [21]:
trials = Trials()
best = fmin(fn=hyperparameter_tuning,
            space=space,
            algo=tpe.suggest,
            max_evals=5,
            trials=trials)

print (best)

9.0
SCORE:
0.9926666666666667
3.0
SCORE:
0.9896
15.0
SCORE:
0.9913333333333333
16.0
SCORE:
0.9918666666666667
11.0
SCORE:
0.9913333333333333
100%|██████████| 5/5 [26:19<00:00, 315.99s/it, best loss: -0.9926666666666667]
{'colsample_bytree': 0.6134924489813431, 'gamma': 3.1020056915276673, 'max_depth': 9.0, 'min_child_weight': 9.0}


In [22]:
print(best)

{'colsample_bytree': 0.6134924489813431, 'gamma': 3.1020056915276673, 'max_depth': 9.0, 'min_child_weight': 9.0}


In [23]:
# выжные параметры
i = 0
import_param = []
for el in clf.feature_importances_:
  i += 1
  if(el > 0.01):
    print(el, i)
    import_param.append(i)

0.013307261 160
0.011589089 176
0.012168011 183
0.011672517 195
0.01808207 201
0.0130567355 228
0.020534467 235
0.27448246 282
0.021842286 284
0.014873535 285
0.11640169 286
0.055002734 288
0.01652918 589
0.018145414 897
