In [None]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/96/6c/6608210b29649267de52001b09e369777ee2a5cfe1c71fa75eba82a4f2dc/catboost-0.24-cp36-none-manylinux1_x86_64.whl (65.9MB)
[K     |████████████████████████████████| 65.9MB 57kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import copy

In [None]:
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the three input tables from the working directory, in a fixed order:
# training data, test data, and the sample submission file.
train_, test_, submission_ = (
    pd.read_csv(csv_path)
    for csv_path in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

In [None]:
from sklearn.model_selection import KFold

def get_train_test_names(train_, test_, submission_):
  """Yield (train, test, submission, output filename) tuples for 5 folds plus the full data.

  For each of the 5 unshuffled KFold splits of ``train_`` the held-out rows are
  expanded into a synthetic test set: every row is treated as 8 leading
  customer-info values followed by 0/1 product flags, and for each flag equal
  to 1 one copy of the row is emitted with that single flag reset to 0. The
  ``ID`` of the expanded rows is replaced by ``"<fold>_<row number>"``.

  After the folds, one final tuple with the untouched ``train_`` and ``test_``
  is yielded for the real prediction run.

  Args:
    train_: DataFrame whose first 8 columns are customer info and whose
      remaining columns are product flags (29 columns in total are expected,
      matching the hard-coded names below).
    test_: DataFrame passed through unchanged in the final tuple.
    submission_: object passed through unchanged in every tuple.

  Yields:
    Tuples ``(train, test, submission_, filename)`` where ``filename`` is
    ``'0_fold<r>.csv'`` for fold ``r`` and ``'0_main.csv'`` for the final one.
  """
  column_names = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
                  'occupation_code', 'occupation_category_code', 'P5DA', 'RIBP', '8NN1',
                  '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9', 'N2MW', 'AHXO',
                  'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX', 'ECY3']

  kf = KFold(n_splits=5, shuffle=False)
  for r, (train_index, test_index) in enumerate(kf.split(train_)):
    held_out = train_.iloc[test_index]

    rows = []
    for v in held_out.values:
      info = list(v[:8])
      binary = v[8:]
      # One expanded row per owned product, with that product's flag cleared.
      for i in [k for k, flag in enumerate(binary) if flag == 1]:
        masked = list(binary)
        masked[i] = 0
        rows.append(info + masked)

    # Passing the names to the constructor keeps the schema even when no row
    # was produced (assigning 29 names to an empty frame would raise).
    X_test = pd.DataFrame(rows, columns=column_names)
    X_test['ID'] = [str(r) + '_' + str(i) for i in range(X_test.shape[0])]

    yield train_.iloc[train_index], X_test, submission_, '0_fold' + str(r) + '.csv'
  yield train_, test_, submission_, '0_main.csv'

### Get folds: build features, train a model, and write predictions for each split

In [None]:
# Column layout shared by the frames built below: 8 customer-info columns
# followed by 21 product-flag columns.
info_columns = ['ID', 'join_date', 'sex', 'marital_status', 'birth_year', 'branch_code',
                'occupation_code', 'occupation_category_code']
product_columns = ['P5DA', 'RIBP', '8NN1', '7POT', '66FJ', 'GYSR', 'SOP4', 'RVSZ', 'PYUQ', 'LJR9',
                   'N2MW', 'AHXO', 'BSTQ', 'FM3X', 'K6QO', 'QBOL', 'JWFN', 'JZ9D', 'J9JW', 'GHYX',
                   'ECY3']
categorical_columns = ['sex', 'marital_status', 'branch_code', 'occupation_code',
                       'occupation_category_code']
# Order in which the columns are laid out before the date features are appended.
append_features = product_columns + ['ID', 'ID2', 'join_date'] + categorical_columns + ['birth_year']

# For every (train, test) pair: expand the training rows, engineer features,
# fit a CatBoost classifier that predicts the removed product, and write the
# per-product probabilities to the CSV file named by the generator.
for train, test, submission, name in get_train_test_names(train_, test_, submission_):
  # Training rows: for each flag equal to 1, emit a copy of the row with that
  # flag cleared; the cleared product's column name becomes the target.
  X_train = []
  row_number = 0
  for v in train.values:
    info = list(v[:8])
    binary = v[8:]
    for i in [k for k, flag in enumerate(binary) if flag == 1]:
      row_number += 1
      masked = list(binary)
      masked[i] = 0
      X_train.append(info + masked + [train.columns[8 + i], row_number])
  X_train = pd.DataFrame(X_train, columns=info_columns + product_columns + ['product_pred', 'ID2'])

  # Test rows are kept as they are (plus a running ID2); every flag already
  # equal to 1 is remembered so its label can be forced to 1.0 in the output.
  X_test = []
  true_values = []
  for row_number, v in enumerate(test.values, start=1):
    owned = [k for k, flag in enumerate(v[8:]) if flag == 1]
    X_test.append(list(v) + [row_number])
    for code in test.columns[8:][owned]:
      true_values.append(v[0] + ' X ' + code)
  X_test = pd.DataFrame(X_test, columns=info_columns + product_columns + ['ID2'])

  y_train = X_train[['product_pred']]

  # Re-assemble both frames column by column in the `append_features` order.
  features_train = np.concatenate(
      [X_train[col].values.reshape(-1, 1) for col in append_features], axis=1)
  features_test = np.concatenate(
      [X_test[col].values.reshape(-1, 1) for col in append_features], axis=1)
  X_train = pd.DataFrame(features_train, columns=append_features)
  X_test = pd.DataFrame(features_test, columns=append_features)

  # Split 'join_date' on '/' into three integer parts; `x == x` is False only
  # for NaN, so missing dates stay NaN here and are zero-filled further down.
  for frame in (X_train, X_test):
    for position, date_col in enumerate(['date1', 'date2', 'date3']):
      frame[date_col] = frame['join_date'].apply(
          lambda x, position=position: int(x.split('/')[position]) if (x == x) else np.nan)
    frame.drop('join_date', axis=1, inplace=True)
  for frame in (X_train, X_test):
    frame['date_diff'] = frame['date3'] - frame['birth_year']

  X_train = X_train.fillna(0)
  X_test = X_test.fillna(0)
  y_train = y_train.fillna(0)

  # Encode the categorical columns on train and test together so both share
  # the same integer codes. pd.concat replaces the removed DataFrame.append.
  n_train = X_train.shape[0]
  le = LabelEncoder()
  data = pd.concat([X_train, X_test])
  for v in categorical_columns:
    data.loc[:, v] = le.fit_transform(data.loc[:, v])
  # Positional split; unlike data[-n_test:], this is also right when n_test == 0.
  X_train = data.iloc[:n_train]
  X_test = data.iloc[n_train:]

  # `le` is re-fitted on the target so it can map class indices back to names.
  y_train = pd.DataFrame({'target': le.fit_transform(y_train.iloc[:, 0])})

  model = CatBoostClassifier()
  model.fit(X_train.drop(columns=['ID', 'ID2']), y_train, cat_features=categorical_columns)

  proba = model.predict_proba(X_test.drop(columns=['ID', 'ID2']))
  y_test = pd.DataFrame(proba)
  y_test.columns = le.inverse_transform(y_test.columns)

  # Long format: one "<ID> X <product>" row per test row and predicted class.
  product_codes = list(y_test.columns)
  answer_mass = [
      [row_id + ' X ' + code, probability]
      for row_id, probabilities in zip(X_test['ID'], proba)
      for code, probability in zip(product_codes, probabilities)
  ]
  df_answer = pd.DataFrame(answer_mass, columns=['ID X PCODE', 'Label'])

  # Flags that were already 1 in the test rows get a label of exactly 1.0.
  # One .loc assignment avoids chained indexing (SettingWithCopyWarning) and
  # the per-row list lookup.
  df_answer.loc[df_answer['ID X PCODE'].isin(true_values), 'Label'] = 1.0

  df_answer.to_csv(name, index=False)

Learning rate set to 0.096895
0:	learn: 1.8109339	total: 6.92s	remaining: 1h 55m 11s
1:	learn: 1.5713372	total: 14.3s	remaining: 1h 58m 57s
2:	learn: 1.4168405	total: 21.4s	remaining: 1h 58m 24s
3:	learn: 1.2899459	total: 28.6s	remaining: 1h 58m 39s
4:	learn: 1.1906342	total: 35.4s	remaining: 1h 57m 33s
5:	learn: 1.1091309	total: 42.1s	remaining: 1h 56m 22s
6:	learn: 1.0418884	total: 48.7s	remaining: 1h 55m 4s
7:	learn: 0.9847474	total: 55.6s	remaining: 1h 54m 51s
8:	learn: 0.9372262	total: 1m 2s	remaining: 1h 54m 53s
9:	learn: 0.8927672	total: 1m 8s	remaining: 1h 52m 29s
10:	learn: 0.8561545	total: 1m 14s	remaining: 1h 52m 6s
11:	learn: 0.8296837	total: 1m 18s	remaining: 1h 47m 50s
12:	learn: 0.8023483	total: 1m 25s	remaining: 1h 48m 2s
13:	learn: 0.7764526	total: 1m 31s	remaining: 1h 47m 8s
14:	learn: 0.7511097	total: 1m 36s	remaining: 1h 45m 54s
15:	learn: 0.7290884	total: 1m 42s	remaining: 1h 45m 31s
16:	learn: 0.7066879	total: 1m 48s	remaining: 1h 44m 38s
17:	learn: 0.6926798	tota

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Learning rate set to 0.096899
0:	learn: 1.8793109	total: 7.22s	remaining: 2h 11s
1:	learn: 1.5922195	total: 12.9s	remaining: 1h 47m 22s
2:	learn: 1.4271221	total: 19.3s	remaining: 1h 47m 4s
3:	learn: 1.2956967	total: 26.1s	remaining: 1h 48m 23s
4:	learn: 1.1977830	total: 33.3s	remaining: 1h 50m 18s
5:	learn: 1.1142794	total: 40.5s	remaining: 1h 51m 41s
6:	learn: 1.0426657	total: 46.8s	remaining: 1h 50m 42s
7:	learn: 0.9878196	total: 53.7s	remaining: 1h 51m
8:	learn: 0.9353034	total: 1m	remaining: 1h 50m 19s
9:	learn: 0.8905959	total: 1m 6s	remaining: 1h 48m 54s
10:	learn: 0.8515281	total: 1m 12s	remaining: 1h 48m 57s
11:	learn: 0.8187762	total: 1m 19s	remaining: 1h 49m 10s
12:	learn: 0.7907669	total: 1m 26s	remaining: 1h 49m 12s
13:	learn: 0.7635813	total: 1m 32s	remaining: 1h 48m 43s
14:	learn: 0.7416123	total: 1m 39s	remaining: 1h 49m 16s
15:	learn: 0.7193764	total: 1m 46s	remaining: 1h 49m 16s
16:	learn: 0.6988703	total: 1m 52s	remaining: 1h 48m 33s
17:	learn: 0.6805513	total: 1m 58

OK