In [None]:
from google.colab import drive 
# Mount Google Drive into the Colab filesystem at /content/gdrive/ so that the
# DATA_PATH / SAVE_PATH locations used later in the notebook resolve.
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [47]:
import numpy as np
import pandas as pd

import random
import os
import gc

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

from sklearn.decomposition import PCA

import time
from tqdm import tqdm

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` to narrower dtypes to reduce memory usage.

    The frame is modified IN PLACE (converted columns are assigned back onto
    ``df``) and the same object is returned, so the caller's original frame
    and the returned frame are one and the same.

    Conversion rules, per column:
      * ``object`` columns become ``category``.
      * Signed integer columns become the narrowest of int8/16/32/64 whose
        range (bounds inclusive) covers the column's min and max.
      * Unsigned integer columns become the narrowest of uint8/16/32/64
        (they are kept unsigned rather than being turned into floats).
      * Float columns become the first of float16 (only if ``use_float16``),
        float32 whose range covers min and max; otherwise float64. A column
        whose min/max is NaN or infinite therefore ends up as float64.
      * Every other dtype (bool, datetime, timedelta, complex, pandas
        extension dtypes such as category or nullable ints) is left untouched.

    Only the value *range* is checked, not precision: float16 keeps roughly
    three significant decimal digits, so values can be rounded. Pass
    ``use_float16=False`` to stop at float32.

    Args:
        df: pandas DataFrame to shrink (mutated in place).
        use_float16: allow float16 as a target dtype for float columns.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a meaningful numeric
        # range to test; bool, datetime, category, nullable dtypes are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # An empty column has NaN min/max, matches nothing and is left as is.
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the report never divides by zero.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decreased))

    return df

In [None]:
def set_seed(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable
    of the current process.
    """
    random.seed(seed)
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    np.random.seed(seed)

In [None]:
# Seed the global RNGs once, before any stochastic step runs.
SEED = 7777
set_seed(SEED)
# Input and output folders on the mounted Google Drive.
DATA_PATH = '/content/gdrive/MyDrive/KaggleTabularPlaygroundSeries/data'
SAVE_PATH = '/content/gdrive/MyDrive/KaggleTabularPlaygroundSeries/results'

print('Load the Train, Test data')
df_train_ = pd.read_csv(f'{DATA_PATH}/train.csv')
df_test_ = pd.read_csv(f'{DATA_PATH}/test.csv')
# Submission template; its 'target' column is overwritten with predictions later.
df_submit = pd.read_csv(f'{DATA_PATH}/sample_submission.csv')

print(f'\nThe Shape of train set: {df_train_.shape}')
print(f'The Shape of test set: {df_test_.shape}')

print('\nReduce Train, Test memory')
# NOTE: reduce_mem_usage assigns the converted columns back onto the frame it
# receives and returns that same frame, so df_train / df_test are the same
# objects as df_train_ / df_test_ -- no un-downcast copy is kept.
df_train = reduce_mem_usage(df_train_)
df_test = reduce_mem_usage(df_test_)

Load the Train, Test data

The Shape of train set: (200000, 288)
The Shape of test set: (100000, 287)

Reduce Train, Test memory
Memory usage of dataframe is 439.45 MB
Memory usage after optimization is: 110.05 MB
Decreased by 75.0%
Memory usage of dataframe is 218.96 MB
Memory usage after optimization is: 54.93 MB
Decreased by 74.9%


In [None]:
# Learn the label -> integer mapping from the training targets, then replace the
# 'target' column with the encoded integers. The fitted encoder is kept so the
# predictions can be mapped back to label names later.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df_train['target'])
df_train['target'] = label_encoder.transform(df_train['target'])

In [None]:
# Preview the first two training rows after downcasting and label encoding.
df_train.head(2)

Unnamed: 0,row_id,A0T0G0C10,A0T0G1C9,A0T0G2C8,A0T0G3C7,A0T0G4C6,A0T0G5C5,A0T0G6C4,A0T0G7C3,A0T0G8C2,A0T0G9C1,A0T0G10C0,A0T1G0C9,A0T1G1C8,A0T1G2C7,A0T1G3C6,A0T1G4C5,A0T1G5C4,A0T1G6C3,A0T1G7C2,A0T1G8C1,A0T1G9C0,A0T2G0C8,A0T2G1C7,A0T2G2C6,A0T2G3C5,A0T2G4C4,A0T2G5C3,A0T2G6C2,A0T2G7C1,A0T2G8C0,A0T3G0C7,A0T3G1C6,A0T3G2C5,A0T3G3C4,A0T3G4C3,A0T3G5C2,A0T3G6C1,A0T3G7C0,A0T4G0C6,...,A5T3G2C0,A5T4G0C1,A5T4G1C0,A5T5G0C0,A6T0G0C4,A6T0G1C3,A6T0G2C2,A6T0G3C1,A6T0G4C0,A6T1G0C3,A6T1G1C2,A6T1G2C1,A6T1G3C0,A6T2G0C2,A6T2G1C1,A6T2G2C0,A6T3G0C1,A6T3G1C0,A6T4G0C0,A7T0G0C3,A7T0G1C2,A7T0G2C1,A7T0G3C0,A7T1G0C2,A7T1G1C1,A7T1G2C0,A7T2G0C1,A7T2G1C0,A7T3G0C0,A8T0G0C2,A8T0G1C1,A8T0G2C0,A8T1G0C1,A8T1G1C0,A8T2G0C0,A9T0G0C1,A9T0G1C0,A9T1G0C0,A10T0G0C0,target
0,0,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,-1e-05,-9.536743e-07,-1e-05,-8.6e-05,-0.000343,-0.000801,-0.001202,-0.001202,-0.000801,-0.000343,-8.6e-05,-1e-05,-4.3e-05,-0.000343,-0.001202,-0.002403,-0.003004,-0.002403,-0.001202,-0.000343,-4.3e-05,-0.000114,-0.000801,-0.002403,-0.004005,-0.004005,-0.002403,-0.000801,-0.000114,-0.0002,...,-0.002403,0.008797,-0.001202,0.009758,-0.0002,-0.000801,-0.001202,-0.000801,-0.0002,-0.000801,-0.002403,-0.002403,-0.000801,-0.001202,-0.002403,-0.001202,-0.000801,-0.000801,-0.0002,-0.000114,-0.000343,-0.000343,-0.000114,-0.000343,-0.000687,-0.000343,-0.000343,-0.000343,-0.000114,-4.3e-05,-8.6e-05,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,9
1,1,-9.536743e-07,-1e-05,-4.3e-05,0.000885,-0.0002,0.00076,-0.0002,-0.000114,-4.3e-05,-1e-05,-9.536743e-07,-1e-05,-8.6e-05,-0.000343,0.000199,-0.000202,0.001799,-0.000801,-0.000343,-8.6e-05,-1e-05,-4.3e-05,-0.000343,0.001799,-0.000403,0.001995,0.003597,-0.001202,-0.000343,-4.3e-05,-0.000114,-0.000801,-0.000403,0.002995,0.002995,0.000597,0.001199,-0.000114,-0.0002,...,0.000597,0.002798,0.001799,0.00076,-0.0002,-0.000801,-0.000202,-0.000801,0.0008,0.000199,0.003597,0.001596,-0.000801,-0.000202,-0.001403,0.000798,0.001199,0.001199,0.0008,-0.000114,-0.000343,-0.000343,-0.000114,-0.000343,0.001313,0.000657,0.001657,0.001657,0.000885,-4.3e-05,-8.6e-05,-4.3e-05,0.000914,0.000914,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,6


In [43]:
# Every column except the row identifier and the label is a model input.
non_feature_cols = ('row_id', 'target')
features = [col for col in df_train.columns if col not in non_feature_cols]
print(features)

['A0T0G0C10', 'A0T0G1C9', 'A0T0G2C8', 'A0T0G3C7', 'A0T0G4C6', 'A0T0G5C5', 'A0T0G6C4', 'A0T0G7C3', 'A0T0G8C2', 'A0T0G9C1', 'A0T0G10C0', 'A0T1G0C9', 'A0T1G1C8', 'A0T1G2C7', 'A0T1G3C6', 'A0T1G4C5', 'A0T1G5C4', 'A0T1G6C3', 'A0T1G7C2', 'A0T1G8C1', 'A0T1G9C0', 'A0T2G0C8', 'A0T2G1C7', 'A0T2G2C6', 'A0T2G3C5', 'A0T2G4C4', 'A0T2G5C3', 'A0T2G6C2', 'A0T2G7C1', 'A0T2G8C0', 'A0T3G0C7', 'A0T3G1C6', 'A0T3G2C5', 'A0T3G3C4', 'A0T3G4C3', 'A0T3G5C2', 'A0T3G6C1', 'A0T3G7C0', 'A0T4G0C6', 'A0T4G1C5', 'A0T4G2C4', 'A0T4G3C3', 'A0T4G4C2', 'A0T4G5C1', 'A0T4G6C0', 'A0T5G0C5', 'A0T5G1C4', 'A0T5G2C3', 'A0T5G3C2', 'A0T5G4C1', 'A0T5G5C0', 'A0T6G0C4', 'A0T6G1C3', 'A0T6G2C2', 'A0T6G3C1', 'A0T6G4C0', 'A0T7G0C3', 'A0T7G1C2', 'A0T7G2C1', 'A0T7G3C0', 'A0T8G0C2', 'A0T8G1C1', 'A0T8G2C0', 'A0T9G0C1', 'A0T9G1C0', 'A0T10G0C0', 'A1T0G0C9', 'A1T0G1C8', 'A1T0G2C7', 'A1T0G3C6', 'A1T0G4C5', 'A1T0G5C4', 'A1T0G6C3', 'A1T0G7C2', 'A1T0G8C1', 'A1T0G9C0', 'A1T1G0C8', 'A1T1G1C7', 'A1T1G2C6', 'A1T1G3C5', 'A1T1G4C4', 'A1T1G5C3', 'A1T1G6C2', 

In [44]:
# Number of distinct target classes the label encoder was fitted on.
len(label_encoder.classes_)

10

In [45]:
# Columns to keep for training: all features followed by the label.
cols = [*features, 'target']
cols

['A0T0G0C10',
 'A0T0G1C9',
 'A0T0G2C8',
 'A0T0G3C7',
 'A0T0G4C6',
 'A0T0G5C5',
 'A0T0G6C4',
 'A0T0G7C3',
 'A0T0G8C2',
 'A0T0G9C1',
 'A0T0G10C0',
 'A0T1G0C9',
 'A0T1G1C8',
 'A0T1G2C7',
 'A0T1G3C6',
 'A0T1G4C5',
 'A0T1G5C4',
 'A0T1G6C3',
 'A0T1G7C2',
 'A0T1G8C1',
 'A0T1G9C0',
 'A0T2G0C8',
 'A0T2G1C7',
 'A0T2G2C6',
 'A0T2G3C5',
 'A0T2G4C4',
 'A0T2G5C3',
 'A0T2G6C2',
 'A0T2G7C1',
 'A0T2G8C0',
 'A0T3G0C7',
 'A0T3G1C6',
 'A0T3G2C5',
 'A0T3G3C4',
 'A0T3G4C3',
 'A0T3G5C2',
 'A0T3G6C1',
 'A0T3G7C0',
 'A0T4G0C6',
 'A0T4G1C5',
 'A0T4G2C4',
 'A0T4G3C3',
 'A0T4G4C2',
 'A0T4G5C1',
 'A0T4G6C0',
 'A0T5G0C5',
 'A0T5G1C4',
 'A0T5G2C3',
 'A0T5G3C2',
 'A0T5G4C1',
 'A0T5G5C0',
 'A0T6G0C4',
 'A0T6G1C3',
 'A0T6G2C2',
 'A0T6G3C1',
 'A0T6G4C0',
 'A0T7G0C3',
 'A0T7G1C2',
 'A0T7G2C1',
 'A0T7G3C0',
 'A0T8G0C2',
 'A0T8G1C1',
 'A0T8G2C0',
 'A0T9G0C1',
 'A0T9G1C0',
 'A0T10G0C0',
 'A1T0G0C9',
 'A1T0G1C8',
 'A1T0G2C7',
 'A1T0G3C6',
 'A1T0G4C5',
 'A1T0G5C4',
 'A1T0G6C3',
 'A1T0G7C2',
 'A1T0G8C1',
 'A1T0G9C0',
 'A1T1G0C

In [46]:
# Restrict the training frame to features + target (this drops row_id) and
# remove rows that are exact duplicates across all of those columns, keeping
# the first occurrence of each.
df_train = df_train[cols].drop_duplicates(keep='first')
print(df_train.shape)

(123993, 287)


In [49]:
# Preview the feature columns of the test set (first rows only).
df_test[features].head()

Unnamed: 0,A0T0G0C10,A0T0G1C9,A0T0G2C8,A0T0G3C7,A0T0G4C6,A0T0G5C5,A0T0G6C4,A0T0G7C3,A0T0G8C2,A0T0G9C1,A0T0G10C0,A0T1G0C9,A0T1G1C8,A0T1G2C7,A0T1G3C6,A0T1G4C5,A0T1G5C4,A0T1G6C3,A0T1G7C2,A0T1G8C1,A0T1G9C0,A0T2G0C8,A0T2G1C7,A0T2G2C6,A0T2G3C5,A0T2G4C4,A0T2G5C3,A0T2G6C2,A0T2G7C1,A0T2G8C0,A0T3G0C7,A0T3G1C6,A0T3G2C5,A0T3G3C4,A0T3G4C3,A0T3G5C2,A0T3G6C1,A0T3G7C0,A0T4G0C6,A0T4G1C5,...,A5T3G1C1,A5T3G2C0,A5T4G0C1,A5T4G1C0,A5T5G0C0,A6T0G0C4,A6T0G1C3,A6T0G2C2,A6T0G3C1,A6T0G4C0,A6T1G0C3,A6T1G1C2,A6T1G2C1,A6T1G3C0,A6T2G0C2,A6T2G1C1,A6T2G2C0,A6T3G0C1,A6T3G1C0,A6T4G0C0,A7T0G0C3,A7T0G1C2,A7T0G2C1,A7T0G3C0,A7T1G0C2,A7T1G1C1,A7T1G2C0,A7T2G0C1,A7T2G1C0,A7T3G0C0,A8T0G0C2,A8T0G1C1,A8T0G2C0,A8T1G0C1,A8T1G1C0,A8T2G0C0,A9T0G0C1,A9T0G1C0,A9T1G0C0,A10T0G0C0
0,-9.536743e-07,-2e-06,-8.940697e-07,2.4e-05,3.4e-05,-2e-06,2.1e-05,2.4e-05,-9e-06,-8e-06,-9.536743e-07,-3e-06,-1.7e-05,-7.4e-05,3.8e-05,6.8e-05,0.000164,0.00018,-3e-06,-2.5e-05,-9e-06,4e-06,-0.000101,-0.000171,6e-06,0.00033,0.00058,0.000228,-1.5e-05,-1.6e-05,-2.6e-05,-0.000146,-7.5e-05,0.000212,0.000565,0.000359,7.2e-05,-5e-06,-4.3e-05,-0.00011,...,0.000163,0.000105,0.000178,0.000151,0.000156,5.573034e-05,0.000258,0.000321,0.000204,6e-06,0.00018,0.00054,0.00049,0.000174,0.000175,0.000447,0.000262,0.000279,0.000254,0.000111,4.3e-05,0.00015,0.000192,7.6e-05,0.000191,0.000387,0.000214,0.000184,0.000268,0.000121,3.9e-05,8.5e-05,5.5e-05,0.000108,9e-05,5.9e-05,1e-05,6e-06,2.7e-05,5.960464e-08
1,-9.536743e-07,-1e-05,-4.291534e-05,-0.000114,0.0018,-0.00024,0.0018,-0.000114,0.000957,-1e-05,-9.536743e-07,-1e-05,-8.6e-05,-0.000343,0.001199,0.001799,0.000798,0.000199,0.000657,-8.6e-05,-1e-05,-4.3e-05,-0.000343,-0.001202,0.001596,0.002996,-0.002403,0.000798,-0.000343,-4.3e-05,-0.000114,-0.000801,0.003597,0.001995,0.002995,-0.000403,0.001199,-0.000114,-0.0002,-0.000202,...,-0.002806,0.000597,0.000798,0.001799,0.00076,-0.0002002716,0.000199,0.001799,0.000199,-0.0002,-0.000801,0.000597,-0.000403,0.000199,-0.001202,0.000597,0.001799,0.000199,0.001199,-0.0002,-0.000114,-0.000343,-0.000343,-0.000114,-0.000343,-0.000687,0.002657,-0.000343,-0.000343,-0.000114,-4.3e-05,0.000914,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07
2,5.960464e-08,3e-06,5.960464e-08,-1.4e-05,7e-06,-5e-06,-4e-06,3e-06,4e-06,-8e-06,-9.536743e-07,-3e-06,-3e-06,-4.5e-05,-3.8e-05,-0.00011,-0.000127,-7.6e-05,-1.2e-05,-4e-06,-5e-06,9e-06,-5.2e-05,-0.00011,-0.00028,-0.00022,-0.000165,-4.8e-05,-2.3e-05,-5e-06,-6e-06,-7.3e-05,-0.000154,-0.000332,-0.000288,-0.000172,-3.5e-05,6e-06,-7e-06,-5.3e-05,...,0.0007,0.000423,0.0003,0.000281,0.000104,3.170967e-05,9.7e-05,0.000247,0.000162,7.3e-05,0.000156,0.000623,0.000478,0.000213,0.000264,0.00056,0.00042,0.000304,0.000329,0.000112,3.9e-05,0.000142,0.000155,8e-05,0.000177,0.000428,0.00025,0.000251,0.000338,0.000122,4.1e-05,0.000102,8.4e-05,0.000111,0.000117,7e-05,2e-05,3e-05,2.1e-05,5.960464e-08
3,-9.536743e-07,-8e-06,8.106232e-06,0.000216,0.00042,0.000514,0.000452,0.000187,-5e-06,-8e-06,5.960464e-08,-6e-06,-2.4e-05,6e-05,0.000712,0.001579,0.001887,0.001299,0.000203,-4.1e-05,-9e-06,-1.6e-05,-0.000128,0.000128,0.001283,0.002352,0.002028,0.00067,-3.7e-05,-2.5e-05,-6.3e-05,-0.000171,0.000338,0.001279,0.001806,0.000838,3.9e-05,-5.3e-05,-6.8e-05,-0.000136,...,-0.000282,-2.6e-05,0.000226,0.00019,0.000284,7.152557e-07,0.000168,0.000405,0.000378,3.7e-05,4.4e-05,0.000462,0.000797,0.000306,9.1e-05,0.000766,0.000449,0.000417,0.000425,0.000222,4.9e-05,0.000185,0.000304,0.000134,0.000204,0.000635,0.000431,0.000317,0.00046,0.000215,6.9e-05,0.000158,9.8e-05,0.000175,0.000217,0.00015,1.8e-05,1.6e-05,5.1e-05,-9.536743e-07
4,-9.536743e-07,-1e-05,-4.291534e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,-1e-05,-9.536743e-07,-1e-05,-8.6e-05,-0.000343,-0.000801,-0.001202,-0.000202,-0.000801,-0.000343,-8.6e-05,-1e-05,-4.3e-05,-0.000343,-0.001202,-0.000403,-4e-06,-0.001403,-0.001202,-0.000343,-4.3e-05,-0.000114,-0.000801,-0.000403,-0.001005,-0.003006,-0.001403,-0.000801,-0.000114,-0.0002,-0.001202,...,0.01519,0.009598,0.007797,0.002798,0.00576,-0.0002002716,-0.000801,-0.000202,0.000199,-0.0002,0.002199,0.005596,0.002596,-0.000801,0.002798,0.003597,0.002798,0.003199,0.005199,0.0018,-0.000114,-0.000343,-0.000343,0.000885,0.001657,0.000313,0.001657,0.002657,0.000657,-0.000114,-4.3e-05,-8.6e-05,-4.3e-05,-8.6e-05,0.000914,-4.3e-05,-1e-05,-1e-05,0.000991,-9.536743e-07


In [52]:
# Removing duplicates left gaps in the index; rebuild a contiguous 0..n-1 index
# (discarding the old one) so that index labels coincide with row positions.
df_train = df_train.reset_index(drop=True)
df_train.head(2)

Unnamed: 0,A0T0G0C10,A0T0G1C9,A0T0G2C8,A0T0G3C7,A0T0G4C6,A0T0G5C5,A0T0G6C4,A0T0G7C3,A0T0G8C2,A0T0G9C1,A0T0G10C0,A0T1G0C9,A0T1G1C8,A0T1G2C7,A0T1G3C6,A0T1G4C5,A0T1G5C4,A0T1G6C3,A0T1G7C2,A0T1G8C1,A0T1G9C0,A0T2G0C8,A0T2G1C7,A0T2G2C6,A0T2G3C5,A0T2G4C4,A0T2G5C3,A0T2G6C2,A0T2G7C1,A0T2G8C0,A0T3G0C7,A0T3G1C6,A0T3G2C5,A0T3G3C4,A0T3G4C3,A0T3G5C2,A0T3G6C1,A0T3G7C0,A0T4G0C6,A0T4G1C5,...,A5T3G2C0,A5T4G0C1,A5T4G1C0,A5T5G0C0,A6T0G0C4,A6T0G1C3,A6T0G2C2,A6T0G3C1,A6T0G4C0,A6T1G0C3,A6T1G1C2,A6T1G2C1,A6T1G3C0,A6T2G0C2,A6T2G1C1,A6T2G2C0,A6T3G0C1,A6T3G1C0,A6T4G0C0,A7T0G0C3,A7T0G1C2,A7T0G2C1,A7T0G3C0,A7T1G0C2,A7T1G1C1,A7T1G2C0,A7T2G0C1,A7T2G1C0,A7T3G0C0,A8T0G0C2,A8T0G1C1,A8T0G2C0,A8T1G0C1,A8T1G1C0,A8T2G0C0,A9T0G0C1,A9T0G1C0,A9T1G0C0,A10T0G0C0,target
0,-9.536743e-07,-1e-05,-4.3e-05,-0.000114,-0.0002,-0.00024,-0.0002,-0.000114,-4.3e-05,-1e-05,-9.536743e-07,-1e-05,-8.6e-05,-0.000343,-0.000801,-0.001202,-0.001202,-0.000801,-0.000343,-8.6e-05,-1e-05,-4.3e-05,-0.000343,-0.001202,-0.002403,-0.003004,-0.002403,-0.001202,-0.000343,-4.3e-05,-0.000114,-0.000801,-0.002403,-0.004005,-0.004005,-0.002403,-0.000801,-0.000114,-0.0002,-0.001202,...,-0.002403,0.008797,-0.001202,0.009758,-0.0002,-0.000801,-0.001202,-0.000801,-0.0002,-0.000801,-0.002403,-0.002403,-0.000801,-0.001202,-0.002403,-0.001202,-0.000801,-0.000801,-0.0002,-0.000114,-0.000343,-0.000343,-0.000114,-0.000343,-0.000687,-0.000343,-0.000343,-0.000343,-0.000114,-4.3e-05,-8.6e-05,-4.3e-05,-8.6e-05,-8.6e-05,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,9
1,-9.536743e-07,-1e-05,-4.3e-05,0.000885,-0.0002,0.00076,-0.0002,-0.000114,-4.3e-05,-1e-05,-9.536743e-07,-1e-05,-8.6e-05,-0.000343,0.000199,-0.000202,0.001799,-0.000801,-0.000343,-8.6e-05,-1e-05,-4.3e-05,-0.000343,0.001799,-0.000403,0.001995,0.003597,-0.001202,-0.000343,-4.3e-05,-0.000114,-0.000801,-0.000403,0.002995,0.002995,0.000597,0.001199,-0.000114,-0.0002,-0.001202,...,0.000597,0.002798,0.001799,0.00076,-0.0002,-0.000801,-0.000202,-0.000801,0.0008,0.000199,0.003597,0.001596,-0.000801,-0.000202,-0.001403,0.000798,0.001199,0.001199,0.0008,-0.000114,-0.000343,-0.000343,-0.000114,-0.000343,0.001313,0.000657,0.001657,0.001657,0.000885,-4.3e-05,-8.6e-05,-4.3e-05,0.000914,0.000914,-4.3e-05,-1e-05,-1e-05,-1e-05,-9.536743e-07,6


In [55]:
N_SPLITS = 10
N_ESTIMATORS = 1000

# Seed the splitter explicitly: with shuffle=True and no random_state the fold
# assignment depends on whatever the global NumPy RNG state happens to be when
# this cell runs, which makes scores irreproducible across re-runs.
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Slice the feature / target / test views once instead of once per fold.
X_all = df_train[features]
y_all = df_train['target']
X_test = df_test[features]

# Per-fold validation accuracy, test-set class predictions and test-set
# class-probability arrays (one entry per fold in each list).
scores, preds, probs = [], [], []
for fold, (train_idx, valid_idx) in enumerate(tqdm(folds.split(X_all, y_all), total=N_SPLITS)):
    print(f'\n\nFold: {fold}')
    # split() yields POSITIONAL indices, so select with .iloc; this stays
    # correct even if df_train's index is not a clean 0..n-1 range.
    X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
    X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

    model = ExtraTreesClassifier(n_estimators=N_ESTIMATORS, n_jobs=-1, random_state=SEED)
    model.fit(X_train, y_train)

    p_valid = model.predict(X_valid)
    valid_score = accuracy_score(y_valid, p_valid)
    print(f'Valid score: {valid_score}')
    scores.append(valid_score)

    # Run the forest over the test set only once per fold. For a single-output
    # forest, predict() is the argmax of predict_proba() mapped through
    # classes_, so the hard predictions are derived from the probabilities.
    test_prob = model.predict_proba(X_test)
    probs.append(test_prob)
    preds.append(model.classes_[np.argmax(test_prob, axis=1)])


score = np.array(scores).mean()
print(f'Mean accuracy score: {score:.6f}')

  0%|          | 0/10 [00:00<?, ?it/s]



Fold: 0
Valid score: 0.9771774193548387


 10%|█         | 1/10 [07:15<1:05:15, 435.09s/it]



Fold: 1
Valid score: 0.9775806451612903


 20%|██        | 2/10 [14:39<58:46, 440.76s/it]  



Fold: 2
Valid score: 0.9789516129032259


 30%|███       | 3/10 [22:02<51:32, 441.75s/it]



Fold: 3
Valid score: 0.9782240503266393


 40%|████      | 4/10 [29:23<44:07, 441.33s/it]



Fold: 4
Valid score: 0.9784660053230099


 50%|█████     | 5/10 [36:45<36:48, 441.76s/it]



Fold: 5
Valid score: 0.9789499153157513


 60%|██████    | 6/10 [44:04<29:22, 440.66s/it]



Fold: 6
Valid score: 0.9791112186466651


 70%|███████   | 7/10 [51:30<22:07, 442.41s/it]



Fold: 7
Valid score: 0.9755625453665618


 80%|████████  | 8/10 [59:26<15:06, 453.13s/it]



Fold: 8
Valid score: 0.9775788370029841


 90%|█████████ | 9/10 [1:07:06<07:35, 455.29s/it]



Fold: 9
Valid score: 0.9754818937011049


100%|██████████| 10/10 [1:14:33<00:00, 447.38s/it]

Mean accuracy score: 0.977708





In [58]:
# Sanity check: there should be one test-set probability array per CV fold.
len(probs)

10

In [64]:
# Soft-voting ensemble: average the per-fold class-probability arrays
# element-wise, pick the most probable column for each test row and map that
# integer back to the original label string.
# NOTE(review): using the argmax column index directly as the encoded label
# assumes every fold's model.classes_ is exactly 0..n_classes-1 in order
# (i.e. every class appears in every training fold) -- confirm.
y_prob = sum(probs) / len(probs)
y_name = label_encoder.inverse_transform(np.argmax(y_prob, axis=1))

In [65]:
# Inspect the submission template (row_id plus a placeholder target) before filling it in.
df_submit.head()

Unnamed: 0,row_id,target
0,200000,Streptococcus_pneumoniae
1,200001,Streptococcus_pneumoniae
2,200002,Streptococcus_pneumoniae
3,200003,Streptococcus_pneumoniae
4,200004,Streptococcus_pneumoniae


In [66]:
# Inspect the decoded predictions (label strings, one per test row).
y_name

array(['Escherichia_fergusonii', 'Salmonella_enterica',
       'Enterococcus_hirae', ..., 'Bacteroides_fragilis',
       'Bacteroides_fragilis', 'Streptococcus_pyogenes'], dtype=object)

In [68]:
# Replace the placeholder targets with the ensemble predictions and write the
# submission CSV (without the DataFrame index) to the results folder.
# NOTE(review): assigning by position assumes the rows of sample_submission.csv
# are in the same order as the rows of test.csv -- confirm.
df_submit['target'] = y_name
df_submit.to_csv(f'{SAVE_PATH}/2013_kfold_extratree.csv', index=False)

# LB(Public) = 0.97650