In [None]:
import pickle

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from tqdm import tqdm_notebook as tqdm

In [None]:
%tensorflow_version 1.x
from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.models import Sequential
from keras.layers import BatchNormalization, Dense, Dropout, Flatten, Conv2D, MaxPool2D, AveragePooling2D
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau

## Загрузка файлов для тренировки
### (см. страницу соревнования и google drive)

In [None]:
# Frame used below as the held-out evaluation set (Xtest / ytest).
# NOTE(review): the variable is called data_test but it is read from gtrain.csv — confirm
# that this file does not overlap with the training files loaded below.
data_test = pd.read_csv('/content/drive/My Drive/gtrain.csv')

In [None]:
# Two training CSVs stored on Google Drive; they are concatenated into one frame in the next cell.
data_train_1 = pd.read_csv('/content/drive/My Drive/gtrain_five_step.csv')
data_train_2 = pd.read_csv('/content/drive/My Drive/gtrain_five_step_plus.csv')

In [None]:
# Stack both training parts row-wise. The original row labels are kept (no ignore_index),
# so index values may repeat; the code below only filters with boolean masks.
data_train = pd.concat([data_train_1, data_train_2])

In [None]:
# Column names of the 484 (= 22 * 22) input cells and of the 484 target cells.
N_CELLS = 484
drop_x = ['x_{}'.format(cell) for cell in range(N_CELLS)]
drop_y = ['y_{}'.format(cell) for cell in range(N_CELLS)]

## Всего обучается 5 моделей — по одной на каждый шаг

In [None]:
step = 1 # Choose the step (value of the 'steps' column) that the cells below train and evaluate on

In [None]:
# Keep only the training rows of the selected step, then split the columns into
# features (everything except the y_*, 'id' and 'steps' columns) and
# targets (everything except the x_*, 'id' and 'steps' columns).
step_rows = data_train.loc[data_train['steps'] == step]

Xstep_1 = step_rows.drop(columns=drop_y + ['id', 'steps'])
ystep_1 = step_rows.drop(columns=drop_x + ['id', 'steps'])

In [None]:
# Same feature/target split for the evaluation frame, restricted to the selected step.
test_rows = data_test.loc[data_test['steps'] == step]

# Features are reshaped to (n_rows, 22, 22, 1), the input shape the CNN below expects.
Xtest = (
    test_rows
    .drop(columns=drop_y + ['id', 'steps'])
    .to_numpy()
    .reshape(len(test_rows), 22, 22, 1)
)

# Targets stay a DataFrame (one column per y_* cell).
ytest = test_rows.drop(columns=drop_x + ['id', 'steps'])

## Модель (CNN)

In [None]:
# Build the CNN: two conv/conv/pool/dropout stages followed by a dense head
# with one sigmoid output per target cell (484 outputs).
model5 = Sequential([
    # Stage 1: two 5x5 convolutions with 32 filters on the 22x22x1 input.
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
           activation='relu', input_shape=(22, 22, 1)),
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
           activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),

    # Stage 2: two 5x5 convolutions with 64 filters.
    Conv2D(filters=64, kernel_size=(5, 5), padding='Same',
           activation='relu'),
    Conv2D(filters=64, kernel_size=(5, 5), padding='Same',
           activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.5),

    # Dense head.
    Flatten(),
    Dense(900, activation="relu"),
    Dropout(0.5),
    Dense(484, activation="sigmoid"),
])

In [None]:
# Adam with an initial learning rate of 1e-3; the ReduceLROnPlateau callback defined below
# lowers it during training.
optimizer = Adam(lr = 1e-3)

In [None]:
# Every one of the 484 sigmoid outputs is an independent binary target,
# hence binary cross-entropy; accuracy is tracked as an additional metric.
model5.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Halve the learning rate when 'val_acc' has not improved for 3 epochs,
# but never go below 1e-5. The callback is wrapped in a list because
# fit(callbacks=...) receives this variable directly.
plateau_lr_halving = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=3,
    min_lr=0.00001,
    verbose=1,
)
learning_rate_reduction = [plateau_lr_halving]

In [None]:
# Training schedule for the CNN: number of passes over the data and mini-batch size.
epochs, batch_size = 30, 300

In [None]:
# Hold out 2% of the selected step's rows for validation.
# random_state is fixed so that the split (and therefore every metric computed on
# X_test / y_test below) is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    Xstep_1, ystep_1, test_size=0.02, random_state=42
)

In [None]:
# Reshape the flat 484-column rows into 22x22 single-channel images for the CNN.
Xstep_1_r = X_train.to_numpy().reshape(len(X_train), 22, 22, 1)
x_test = X_test.to_numpy().reshape(len(X_test), 22, 22, 1)

# Train the CNN; the 2% hold-out is used as validation data, which is what the
# ReduceLROnPlateau callback monitors.
history = model5.fit(
    Xstep_1_r,
    y_train.values,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test.values),
    verbose=5,
    callbacks=learning_rate_reduction,
)

Train on 98000 samples, validate on 2000 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30

Epoch 00018: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30

Epoch 00023: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 24/30
Epoch 25/30
Epoch 26/30

Epoch 00026: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 27/30
Epoch 28/30
Epoch 29/30

Epoch 00029: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 30/30


## Подсчет качества

In [None]:
# CNN probabilities on the validation hold-out, binarised in place at 0.5.
y_pred = model5.predict(x_test)
np.putmask(y_pred, y_pred < 0.5, 0)
np.putmask(y_pred, y_pred >= 0.5, 1)

In [None]:
# MAE of the binarised CNN predictions on the validation hold-out
# (value noted by the author in the original cell: 0.354995867768595).
sklearn.metrics.mean_absolute_error(y_true=y_test.values, y_pred=y_pred)

0.38155681818181814

In [None]:
# CNN probabilities on the training rows, binarised in place at 0.5.
y_pred = model5.predict(Xstep_1_r)
np.putmask(y_pred, y_pred < 0.5, 0)
np.putmask(y_pred, y_pred >= 0.5, 1)

In [None]:
# MAE of the binarised CNN predictions on the training rows
# (value noted by the author in the original cell: 0.35063829201101926).
sklearn.metrics.mean_absolute_error(y_true=y_train.values, y_pred=y_pred)

0.379359398718165

In [None]:
# CNN probabilities on the evaluation frame (Xtest), binarised in place at 0.5.
y_pred = model5.predict(Xtest)
np.putmask(y_pred, y_pred < 0.5, 0)
np.putmask(y_pred, y_pred >= 0.5, 1)

In [None]:
# MAE of the binarised CNN predictions on the evaluation frame
# (value noted by the author in the original cell: 0.3511643808564629).
sklearn.metrics.mean_absolute_error(y_true=ytest.values, y_pred=y_pred)

0.3796690695296942

In [None]:
# Registry of the trained CNNs, one per step; the submission cell reads models[i] for step i + 1.
# Created only when it does not exist yet, so re-running this cell while training the
# models for later steps does not throw away the ones already collected, and a fresh
# kernel no longer hits a NameError on `models`.
if 'models' not in globals():
    models = []

In [None]:
# Store the CNN that was just trained. The submission cell looks models up by position
# (models[i] for step i + 1), so models must be appended in step order and this cell
# must be run exactly once per trained step.
models.append(model5)

In [None]:
# Sanity check: number of CNNs collected so far (the submission cell indexes models[0..4]).
len(models)

5

## CatBoost

In [None]:
from catboost import CatBoostClassifier

In [None]:
pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/94/ec/12b9a42b2ea7dfe5b602f235692ab2b61ee1334ff34334a15902272869e8/catboost-0.22-cp36-none-manylinux1_x86_64.whl (64.4MB)
[K     |████████████████████████████████| 64.4MB 82kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.22


In [None]:
# One binary CatBoost classifier per target cell (484 models for the selected step).
# The models are fit on the training part of the split only: the cells below score them
# on X_test / y_test, and fitting on the full Xstep_1 (which contains those rows) would
# leak the validation data into training and make the reported MAE optimistic.
# To produce final models on all available rows, refit on Xstep_1 / ystep_1 after validation.
CBSES = []
for target_col in tqdm(drop_y):
    cb = CatBoostClassifier(task_type='GPU', verbose=0, iterations=400)
    cb.fit(X_train, y_train[target_col])
    CBSES.append(cb)

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))




In [None]:
# For every target cell, take the positive-class probability (column 1 of predict_proba)
# predicted by its CatBoost model on the validation hold-out.
y_pred_cb = [
    CBSES[cell].predict_proba(X_test)[:, 1]
    for cell in tqdm(range(484))
]

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))




In [None]:
# The list holds one vector per target cell; stack it and transpose so that
# rows are samples and columns are target cells.
y_pred_cb = np.array(y_pred_cb).T

NameError: ignored

In [None]:
# Binarise the CatBoost probabilities in place at 0.5.
np.putmask(y_pred_cb, y_pred_cb < 0.5, 0)
np.putmask(y_pred_cb, y_pred_cb >= 0.5, 1)

NameError: ignored

In [None]:
# MAE of the binarised CatBoost predictions on the validation hold-out.
# Arguments are given as (y_true, y_pred), matching sklearn's signature and the other
# metric cells of this notebook; MAE is symmetric, so the value itself is unchanged.
sklearn.metrics.mean_absolute_error(y_test, y_pred_cb)

0.3598119834710744

## LGBM

In [None]:
import lightgbm

In [None]:
# Build LightGBM from source with GPU support and install its Python package.
# NOTE(review): this cell has to run before the cell that imports lightgbm.
# The %cd magics change the kernel's working directory for all following cells, so
# relative paths used later no longer resolve against /content.
!git clone --recursive https://github.com/Microsoft/LightGBM
%cd /content/LightGBM
!mkdir build
# NOTE(review): cmake is invoked without a source path and without entering build/ — confirm
# that this is the intended (in-source) build for the installed cmake version.
!cmake -DUSE_GPU=1
!make -j$(nproc)
!sudo apt-get -y install python-pip
!sudo -H pip install setuptools pandas numpy scipy scikit-learn -U
%cd /content/LightGBM/python-package
!sudo python setup.py install --precompile

In [None]:
# LightGBM training parameters: train on the GPU, binary classification objective.
params = dict(device='gpu', objective='binary')

In [None]:
# One binary LightGBM booster per target cell, trained on all rows of the selected step.
lgbms = []

for target_col in tqdm(drop_y):
    train_set = lightgbm.Dataset(Xstep_1, label=ystep_1[target_col])
    booster = lightgbm.train(params, train_set, verbose_eval=100)
    lgbms.append(booster)

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))




In [None]:
# Xtest was reshaped to (n, 22, 22, 1) for the CNN, but the boosters were trained on flat
# 484-column rows and need 2-D input, so flatten it back to (n, 484) first.
Xtest_flat = np.asarray(Xtest).reshape(len(Xtest), -1)

# One probability vector per target cell, predicted on the evaluation frame.
y_pred_lgbm = []
for booster in tqdm(lgbms):
    y_pred_lgbm.append(booster.predict(Xtest_flat))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))




In [None]:
# Stack the per-cell vectors into (n_samples, n_cells) and binarise at 0.5.
y_pred_lgbm = np.array(y_pred_lgbm).transpose()
y_pred_lgbm[y_pred_lgbm < 0.5] = 0
y_pred_lgbm[y_pred_lgbm >= 0.5] = 1

# MAE on the evaluation frame; arguments are given as (y_true, y_pred) to match sklearn's
# signature and the other metric cells (MAE is symmetric, so the value is unchanged).
sklearn.metrics.mean_absolute_error(ytest, y_pred_lgbm)

0.3452748013688707

### Загружаем сохранённые модели LightGBM и проверяем качество

In [None]:
# Load the 484 pickled LightGBM boosters of the currently selected step from Google Drive.
# The step is taken from `step` (instead of a hard-coded 5) so that the loaded models
# always match Xtest / ytest, which are filtered by the same variable.
# NOTE: pickle.load executes arbitrary code from the file — only load files you created yourself.
lgbm_dir = '/content/drive/My Drive/models_step_' + str(step) + '_lgbm'

lgbms = []
for j in tqdm(range(484)):
    file_name = lgbm_dir + '/' + 'lgbm_step_' + str(step) + '_' + str(j + 1) + '.pkl'
    # Read the model; the context manager closes the file even if unpickling fails.
    with open(file_name, 'rb') as f:
        lgbms.append(pickle.load(f))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

In [None]:
# Predict with the loaded LightGBM boosters on the evaluation frame.
# Xtest was reshaped to (n, 22, 22, 1) for the CNN; the boosters need flat 2-D rows,
# so flatten it back to (n, 484) first.
Xtest_flat = np.asarray(Xtest).reshape(len(Xtest), -1)

y_pred_lgbm = []
for booster in tqdm(lgbms):
    y_pred_lgbm.append(booster.predict(Xtest_flat))

# Stack into (n_samples, n_cells) and binarise at 0.5.
y_pred_lgbm = np.array(y_pred_lgbm).transpose()
y_pred_lgbm[y_pred_lgbm < 0.5] = 0
y_pred_lgbm[y_pred_lgbm >= 0.5] = 1

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

In [None]:
# MAE of the binarised LightGBM predictions on the evaluation frame.
sklearn.metrics.mean_absolute_error(y_true=ytest, y_pred=y_pred_lgbm)

0.31044467835548895

### Загружаем сохранённые модели CatBoost и проверяем качество

In [None]:
# Load the 484 pickled CatBoost classifiers of the currently selected step from Google Drive.
# The step is taken from `step` (instead of a hard-coded 5) so that the loaded models
# always match Xtest / ytest, which are filtered by the same variable.
# NOTE: pickle.load executes arbitrary code from the file — only load files you created yourself.
cb_dir = '/content/drive/My Drive/models_step_' + str(step) + '_cb'

CBSES = []
for j in tqdm(range(484)):
    file_name = cb_dir + '/' + 'cb_step_' + str(step) + '_' + str(j + 1) + '.pkl'
    # Read the model; the context manager closes the file even if unpickling fails.
    with open(file_name, 'rb') as f:
        CBSES.append(pickle.load(f))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))




In [None]:
# Predict with the loaded CatBoost classifiers on the evaluation frame.
# Xtest was reshaped to (n, 22, 22, 1) for the CNN; the classifiers were trained on flat
# 484-column rows, so flatten it back to (n, 484) first.
Xtest_flat = np.asarray(Xtest).reshape(len(Xtest), -1)

y_pred_cb = []
for classifier in tqdm(CBSES):
    # predict() returns class labels here (not probabilities as in the submission cell).
    y_pred_cb.append(classifier.predict(Xtest_flat))

# Stack into (n_samples, n_cells); the thresholding keeps the values in {0, 1}.
y_pred_cb = np.array(y_pred_cb).transpose()
y_pred_cb[y_pred_cb < 0.5] = 0
y_pred_cb[y_pred_cb >= 0.5] = 1

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))




In [None]:
# MAE of the CatBoost predictions on the evaluation frame.
sklearn.metrics.mean_absolute_error(y_true=ytest, y_pred=y_pred_cb)

0.32121070246706124

In [None]:
from matplotlib.pylab import plt

## Подготавливаем submission: ансамбль — взвешенная сумма предсказаний моделей

In [None]:
# Frame to predict for the submission; the cells below read its 'id' and 'steps' columns
# and treat every remaining column as a model input.
data_ans = pd.read_csv('/content/drive/My Drive/gtest.csv')

In [None]:
import pickle

In [None]:
# Ensemble weights per step, in the order [CNN, LightGBM, CatBoost]
# (this is the order in which the submission cell multiplies them).
# Row k holds the weights for step k + 1.
coefs = [
    [1/4, 3/8, 3/8],
    [1/3, 1/3, 1/3],
    [1/2, 1/4, 1/4],
    [2/3, 1/6, 1/6],
    [2/3, 1/6, 1/6],
]

In [None]:
def _load_pickled_models(kind, step_no, n_models=484):
    """Load the per-cell models of one step from Google Drive.

    kind     -- 'lgbm' or 'cb'; selects the folder and the file-name prefix.
    step_no  -- step number (1-based) the models were trained for.
    n_models -- number of files to read (one model per target cell).

    Returns the unpickled models as a list, ordered by target cell.
    NOTE: pickle.load executes arbitrary code from the file — only load files you created yourself.
    """
    folder = '/content/drive/My Drive/models_step_' + str(step_no) + '_' + kind
    loaded = []
    for j in tqdm(range(n_models)):
        file_name = folder + '/' + kind + '_step_' + str(step_no) + '_' + str(j + 1) + '.pkl'
        # The context manager closes the file even if unpickling fails.
        with open(file_name, 'rb') as f:
            loaded.append(pickle.load(f))
    return loaded


data_to_send = []
for i in tqdm(range(5)):
    step_no = i + 1

    # Load the LightGBM and CatBoost models trained for this step.
    lgbms = _load_pickled_models('lgbm', step_no)
    CBSES = _load_pickled_models('cb', step_no)

    # Rows of this step; every column except 'id' and 'steps' is a model input.
    data_step = data_ans[data_ans['steps'] == step_no]
    data_predict = data_step.drop(columns=['id', 'steps'])

    # LightGBM: predict() output per target cell, stacked to (n_samples, n_cells).
    y_pred_lgbm = np.array(
        [booster.predict(data_predict) for booster in tqdm(lgbms)]
    ).transpose()

    # CatBoost: positive-class probability per target cell, stacked the same way.
    y_pred_cb = np.array(
        [classifier.predict_proba(data_predict)[:, 1] for classifier in tqdm(CBSES)]
    ).transpose()

    # CNN: needs the rows as 22x22 single-channel images.
    cnn_input = data_predict.to_numpy().reshape(data_predict.shape[0], 22, 22, 1)
    y_pred = models[i].predict(cnn_input)

    # Ensemble: weighted sum of the three predictions, binarised in place at 0.5.
    w_cnn, w_lgbm, w_cb = coefs[i]
    y_final = w_cnn * y_pred + w_lgbm * y_pred_lgbm + w_cb * y_pred_cb
    y_final[y_final < 0.5] = 0
    y_final[y_final >= 0.5] = 1

    # Build the answer frame in one go (484 column-by-column inserts fragment the frame),
    # keeping the row labels of data_step so the 'id' column aligns.
    y_columns = ['y_' + str(j) for j in range(y_final.shape[1])]
    data_answer = pd.DataFrame(y_final, columns=y_columns, index=data_step.index)
    data_answer.insert(0, 'id', data_step['id'])
    data_to_send.append(data_answer)

# Combine all steps and restore the original id order.
data_to_send = pd.concat(data_to_send)
data_to_send = data_to_send.sort_values(by=['id'])
print(data_to_send.shape)
data_to_send.head()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))

(50000, 485)


Unnamed: 0,id,y_0,y_1,y_2,y_3,y_4,y_5,y_6,y_7,y_8,y_9,y_10,y_11,y_12,y_13,y_14,y_15,y_16,y_17,y_18,y_19,y_20,y_21,y_22,y_23,y_24,y_25,y_26,y_27,y_28,y_29,y_30,y_31,y_32,y_33,y_34,y_35,y_36,y_37,y_38,...,y_444,y_445,y_446,y_447,y_448,y_449,y_450,y_451,y_452,y_453,y_454,y_455,y_456,y_457,y_458,y_459,y_460,y_461,y_462,y_463,y_464,y_465,y_466,y_467,y_468,y_469,y_470,y_471,y_472,y_473,y_474,y_475,y_476,y_477,y_478,y_479,y_480,y_481,y_482,y_483
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Write the submission without the DataFrame index.
# The path is absolute like every other Drive path in this notebook: the LightGBM build cell
# changes the working directory with %cd, so a path relative to /content would not resolve.
data_to_send.to_csv('/content/drive/My Drive/Colab Notebooks/keras_ans.csv', index=False)

In [None]:
import pickle

In [None]:
# Persist the 484 CatBoost classifiers currently held in CBSES, one pickle file per target cell.
# NOTE(review): the target path hard-codes step 3 — make sure CBSES holds the step-3 models
# when this cell is run.
for i in tqdm(range(484)):
    file_name = '/content/drive/My Drive/models_step_3_cb_new/cb_step_3_' + str(i + 1) + '.pkl'
    # Write the model; the context manager flushes and closes the file even if pickling fails.
    with open(file_name, 'wb') as f:
        pickle.dump(CBSES[i], f)

HBox(children=(IntProgress(value=0, max=484), HTML(value='')))




In [None]:
# Persist the trained CNN; the context manager closes the file even if pickling fails.
# NOTE(review): the file name hard-codes step 1 — make sure model5 is the step-1 model.
# NOTE(review): pickling a Keras model may not round-trip in every Keras version;
# consider model5.save(...) together with keras.models.load_model(...) instead.
with open('/content/drive/My Drive/cnn_step_1.pkl', 'wb') as f:
    pickle.dump(model5, f)

In [None]:
# Load the pickled step-1 CNN back; the context manager closes the file even if unpickling fails.
# NOTE: pickle.load executes arbitrary code from the file — only load files you created yourself.
with open('/content/drive/My Drive/cnn_step_1.pkl', 'rb') as f:
    model = pickle.load(f)