# Model Notebook - DataScience Competition Baseline

### Created by Anis Ayari : https://github.com/anisayari on May 2019

Please consider to report any enhancements/bug/modification/use by created issue on : https://github.com/anisayari/DataScience-competitions-template

# Import Library

In [1]:
#DS & Math
import pandas as pd 
import numpy as np 

#Vizu libraries
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

#sklearn libraries
from sklearn.decomposition import TruncatedSVD,NMF
from sklearn.preprocessing import LabelEncoder, Imputer, OneHotEncoder
from sklearn.model_selection import KFold,cross_val_score,cross_val_predict, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, mean_squared_error, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
# Other ML libraries
import featuretools as ft
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from stop_words import get_stop_words
stop_words_fr = get_stop_words('fr')
from functools import partial
import scipy as sp
from ml_metrics import quadratic_weighted_kappa
from collections import Counter
from math import sqrt
from sklearn.metrics import confusion_matrix as sk_cmatrix

#Others
import cv2
import warnings
import csv 
import os 
import time 
import urllib
import utils

warnings.filterwarnings('ignore')

  (fname, cnt))
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Feature engineering

In [2]:
def auto_features(df):
    print('[INFO] Auto Features Processing')
    
    es = ft.EntitySet(id = 'emmaus')
    #es = es.entity_from_dataframe(entity_id = 'data',dataframe = train_test.reset_index(drop=True),make_index = True,index='id')
    es = es.entity_from_dataframe(entity_id='data', index='id', dataframe = df)

    for groupby in ['brand','category','store_name','product_name','material']:
        es = es.normalize_entity(base_entity_id='data', new_entity_id=groupby, index=groupby)
    
    features, feature_names = ft.dfs(entityset = es, target_entity = 'data', max_depth = 2, verbose=2, n_jobs=5)

    df = df.set_index('id').append([features], sort=False)
    return df,feature_names

def drop_higlhy_correlated_features(df):
    # Threshold for removing correlated variables
    threshold = 0.98

    # Absolute value correlation matrix
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    upper.head(50)

    # Select columns with correlations above threshold
    collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]

    print('There are %d features to remove.' % (len(collinear_features)))

    features_filtered = df.drop(columns = collinear_features)

    print('The number of features that passed the collinearity threshold: ', features_filtered.shape[1])
    features_positive = features_filtered.loc[:, features_filtered.all()]
    return features_positive,features_filtered


def features_engineering(df):
    """
    DROP NOT RELEVANT COLUMN 
    """
    print('[INFO] Dropping Columns...')
    columns_to_drop = ["image_url", "sub_category_3", "sub_category_4"]  #'To fill'
    df.drop(columns_to_drop, axis = 1, inplace = True)    
    
    text_columns = df.select_dtypes(include='object').columns.tolist()
    df[text_columns] = df[text_columns].fillna('missing')
    
    df,features_filtered = auto_features(df)
    df = df.reset_index()
    """
    ENCODE FEATURES
    """
    #df =utils.encoded_columns(df,df.select_dtypes(include=['category','object']).columns.tolist())
    df = utils.label_encoding(df, columns_to_encode=['color','age','product_size',"brand","shoe_size"] )
    """
    TEXT FEATURES
    """
    print('[INFO] Text Features processing')
    
    df = utils.get_len_columns(df, len_columns=['product_description'])
        
    column_to_vectorize = ["sub_category_1", "sub_category_2",'store_name','product_description',
                    'material', 'editor', 'product_name',"author"]  #'To fill'
    
    #for column_ in column_to_vectorize:
        #if column_ in df.columns :
            #df=apply_tfidf_vectorizer(df,column_)
            #df.drop(column_, inplace=True, axis=1)
    
    df= utils.tfidf_nmf_svd(df,text_columns=column_to_vectorize)
    
    binary_column = ['warranty','wifi','vintage']  #'To fill'
    for col_ in tqdm(binary_column):
        df = utils.binarie_fill(df,col_)
    
    columns_to_dummies = ['category']  # 'To fill'
    for col_ in tqdm(columns_to_dummies):
        df = pd.concat([df.drop(col_, axis=1), pd.get_dummies(df[col_],prefix=col_)], axis=1)
    
    """
    NUMERICAL FEATURES
    """
    count_column = ["brand", "author", "editor"]  #'To fill'
    for col_ in count_column:
        df = utils.count_item_column(df, col_, 'id')
        
    column_to_count = 'price'    #'To fill'
    column_to_groupby = 'store_name'    #'To fill'
    df = utils.create_mathematics_features(df, column_to_count, column_to_groupby)
    
    
    columns_to_log = ["price", "len_product_description"]  #'To fill'
    utils.transform_to_log(df,columns_to_log)

    #to_drop = ["price","id",'image_width','image_height','color','age','product_size',"brand","shoe_size","len_product_description", "condition", "year", "product_width","product_length", "product_height"]  #'To fill'
    #df.drop(to_drop,inplace=True, axis=1)
    df.drop('id',axis=1,inplace=True)

    df,features_filtered=drop_higlhy_correlated_features(df)
    df = utils.reduce_mem_usage(df)
    
    return df

# Loading Data 

In [3]:
train = pd.read_csv("X_train.csv", index_col=0, error_bad_lines=False,header=0)
len_train = len(train)
test = pd.read_csv("X_test.csv", index_col=0, error_bad_lines=False,header=0)
y = pd.read_csv("y_train.csv", index_col=0)
train['label'] = y
categorical_col = train.select_dtypes(include=['category','object']).columns.tolist()
train = train.reset_index()
test= test.reset_index()
train['id']  = train['id'].astype(str)+'_'+'train'
test['id']  = test['id'].astype(str)+'_'+'test'
train_test = pd.concat((train, test), axis=0)
train_test = features_engineering(train_test)
print(train_test.shape)
train = train_test.iloc[:len_train, :]
test = train_test.iloc[len_train:, :]
test_id = test.index
train['label'] = y
all_cat_col = train.select_dtypes(include=['category','object']).columns.tolist()

categorical_col = [col_ for col_ in categorical_col if col_ in all_cat_col]
train, test = utils.leave_one_hot_encoding(train,test,categorical_col,'label')

train, test = train.drop(all_cat_col,axis=1), test.drop(all_cat_col,axis=1)
print('[DONE]')

b'Skipping line 2168: expected 31 fields, saw 33\nSkipping line 4822: expected 31 fields, saw 37\nSkipping line 4859: expected 31 fields, saw 37\nSkipping line 7342: expected 31 fields, saw 37\n'


[INFO] Dropping Columns...
[INFO] Auto Features Processing
Built 514 features
EntitySet scattered to 5 workers in 4 seconds
Elapsed: 00:36 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 10/10 chunks


  0%|          | 0/8 [00:00<?, ?it/s]

[INFO] Text Features processing


 50%|█████     | 4/8 [00:07<00:09,  2.28s/it]distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/utils.py", line 713, in log_errors
    yield
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/client.py", line 1223, in _close
    quiet_exceptions=(CancelledError,),
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/gen.py", line 584, in with_timeout
    chain_future(future_converted, result)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 166, in chain_future
    future_add_done_callback(a, copy)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 262, in future_add_done_callback
    callback(future)
  File "/anaconda3/envs/datascience-env/lib/python3.6/

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


There are 275 features to remove.
The number of features that passed the collinearity threshold:  273
Memory usage of dataframe is 34.15 MB
Memory usage after optimization is: 9.60 MB
Decreased by 71.9%
(23680, 188)
[INFO] Start Leave One Hot Encoding....
[INFO] Start Leave One Hot Encoding DONE
[DONE]


In [6]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

train_X = train.copy()
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(train_X.select_dtypes([np.number]).fillna(-1), y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(train_X.select_dtypes([np.number]).fillna(-1))
X_selected_df = pd.DataFrame(X_new, columns=[train_X.select_dtypes([np.number]).fillna(-1).columns[i] for i in range(len(train_X.select_dtypes([np.number]).fillna(-1).columns)) if model.get_support()[i]])
print(X_selected_df.shape)
X_selected_df.columns

(8880, 13)


Index(['image_height', 'image_width', 'price', 'product_height',
       'product_length', 'product_width', 'shoe_size',
       'product_name.SKEW(data.product_width)', 'len_product_description',
       'editor_COUNT', 'log_price', 'log_len_product_description', 'label'],
      dtype='object')

In [5]:
train.select_dtypes([np.number]).fillna(-1).shape

(8880, 110)

In [28]:
train_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23680 entries, 0 to 23679
Columns: 517 entries, images_count to log_len_product_description
dtypes: category(88), float16(358), float32(54), float64(1), int16(5), int8(11)
memory usage: 26.1 MB


# Prediction

## Model

#### Random Forest

In [8]:
def split_target_and_df(train,label_column):
    return train.drop([label_column], axis=1),train[label_column]
    
def run_randomforest_classifier(train, test, label_column,scoring='accuracy'):
    
    train,target = split_target_and_df(train,label_column)
    
    params = {'bootstrap': True, 
              'class_weight': None, 
              'criterion': 'gini', 
              'max_depth': None,
              'max_features': 'auto', 
              'max_leaf_nodes': None, 
              'min_impurity_decrease': 0.0, 
              'min_impurity_split': None,
              'min_samples_leaf': 1,
              'min_samples_split': 2, 
              'min_weight_fraction_leaf': 0.0, 
              'n_estimators': 10,
              'n_jobs': -1, 
              'oob_score': False, 
              'random_state': None, 
              'verbose': 0, 
              'warm_start': False}
    
    model = RandomForestClassifier(**params)
    model.fit(train, target)
    pred_train = model.predict(train)
    pred_test = model.predict(test)
    
    cv_scores = cross_val_score(model, train, target, cv=5, scoring=scoring)
    print(cv_scores)
    print('RF CV mean : %.2f ' % (np.mean(cv_scores)))
    print('RF CV std : %.2f ' % (np.std(cv_scores)))
        
    print("True Distribution:")
    print(pd.value_counts(target, normalize=True).sort_index())
    print("Train Predicted Distribution:")
    print(pd.value_counts(pred_train, normalize=True).sort_index())
    print("Test Predicted Distribution:")
    print(pd.value_counts(pred_test, normalize=True).sort_index())
    
    features_importances = pd.Series(model.feature_importances_, index=train.columns)
    features_importances.nlargest(25).plot(kind='barh')
    
    return pred_test

pred_test = run_randomforest_classifier(train,test,"label")

In [None]:
#TODO:'LightGBM validation CV'

In [None]:
N_SPLITS = 2
pred_test = run_lgbm(train, test,'label',test_id)

#### SVM

In [29]:
def run_svm(train, test, label_column):
    target = train[label_column]
    train = train.drop([label_column], axis=1)
    
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.fit_transform(test)
    
    svm_params = {'C': 1.0, 
                  'cache_size': 200, 
                  'class_weight': None, 
                  'coef0': 0.0, 
                  'decision_function_shape': 'ovr', 
                  'degree': 3, 'gamma': 
                  'auto_deprecated', 
                  'kernel': 'rbf', 
                  'max_iter': -1, 
                  'probability': False, 
                  'random_state': None, 
                  'shrinking': True, 
                  'tol': 0.001, 
                  'verbose': False}
    
    svc=SVC() 
    svc.fit(train_scaled,target)
    y_pred_train=svc.predict(train_scaled)
    score = accuracy_score(target,y_pred_train)
    print('Accuracy Score: %.2f' % (score))
    

In [30]:
run_svm(train, test, "label")

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

#### Voting Classifier

In [None]:
def run_voting_classifier(train, test, label_column):
    
    target = train[label_column]
    train = train.drop([label_column], axis=1)
    
    ab_params = {'algorithm': 'SAMME.R', 
                 'base_estimator': None, 
                 'learning_rate': 0.1, 
                 'n_estimators': 20, 
                 'random_state': None}
    
    gbc_params = {'criterion': 'friedman_mse', 
                  'init': None, 'learning_rate': 0.1, 
                  'loss': 'deviance', 
                  'max_depth': 30, 
                  'max_features': None, 
                  'max_leaf_nodes': None, 
                  'min_impurity_decrease': 0.0, 
                  'min_impurity_split': None, 
                  'min_samples_leaf': 1, 
                  'min_samples_split': 2, 
                  'min_weight_fraction_leaf': 0.0, 
                  'n_estimators': 100, 
                  'n_iter_no_change': None, 
                  'presort': 'auto', 
                  'random_state': None, 
                  'subsample': 1.0, 
                  'tol': 0.0001, 
                  'validation_fraction': 0.1, 
                  'verbose': 0, 
                  'warm_start': False}
    
    bc_params = {'base_estimator': None, 
                 'bootstrap': True, 
                 'bootstrap_features': False, 
                 'max_features': 10, 
                 'max_samples': 1.0, 
                 'n_estimators': 20, 
                 'n_jobs': None, 
                 'oob_score': False, 
                 'random_state': None, 
                 'verbose': 0, 
                 'warm_start': False}
    
    clf1 = AdaBoostClassifier(**ab_params)
    clf2 = GradientBoostingClassifier(**gbc_params)
    clf3 = BaggingClassifier(**bc_params)
    vote_clf = VotingClassifier(estimators=[('ab', clf1), ('gbc', clf2), ('bc', clf3)], weights=[0.2,1.7,0.6], voting='soft')
    vote_clf = vote_clf.fit(train, target)
    
    pred_train = vote_clf.predict_proba(train)
    pred_cv = cross_val_predict(vote_clf, train, np.ravel(target),
                            method='predict_proba', cv=5, n_jobs=-1)
    pred_test = vote_clf.predict_proba(test)
    
    print("LogLoss on train sample ", log_loss(y_pred=pred_train, y_true=target))
    print("LogLoss on train sample (CV): ", log_loss(y_pred=pred_cv, y_true=target))
    
    return pred_test



In [None]:
pred_test = run_voting_classifier(train, test, "label")

In [None]:
from sklearn.utils import shuffle

def cross_validate(model, x, y, folds=10, repeats=5):
    '''
    Function to do the cross validation - using stacked Out of Bag method instead of averaging across folds.
    model = algorithm to validate. Must be scikit learn or scikit-learn like API (Example xgboost XGBRegressor)
    x = training data, numpy array
    y = training labels, numpy array
    folds = K, the number of folds to divide the data into
    repeats = Number of times to repeat validation process for more confidence
    '''
    ypred = np.zeros((len(y),repeats))
    score = np.zeros(repeats)
    x = np.array(x)
    for r in range(repeats):
        i=0
        print('Cross Validating - Run', str(r + 1), 'out of', str(repeats))
        x,y = shuffle(x,y,random_state=r) #shuffle data before each repeat
        kf = KFold(n_splits=folds,random_state=i+1000) #random split, different each time
        for train_ind,test_ind in kf.split(x):
            print(train_ind)
            print('Fold', i+1, 'out of',folds)
            xtrain,ytrain = x[train_ind,:],y[train_ind]
            xtest,ytest = x[test_ind,:],y[test_ind]
            model.fit(xtrain, ytrain)
            ypred[test_ind,r]=model.predict(xtest)
            i+=1
        score[r] = log_loss(ypred[:,r],y)
    print('\nOverall logloss:',str(score))
    print('Mean:',str(np.mean(score)))
    print('Deviation:',str(np.std(score)))
    pass

params = {'objective' : 'multi:softprob', 
              'max_depth':3,
              'min_child_weight':2,
              'learning_rate':0.12,
              'n_estimators':80,
              'silent':True,
              'gamma':0,
              'max_delta_step':0,
              'subsample':1,
              'colsample_bytree':1,
              'colsample_bylevel':1,
              'reg_alpha':0,
              'reg_lambda':0,
              'scale_pos_weight':1,
              'seed':1,
              'missing':None}

xgb_model =XGBClassifier(**params)

#cross_validate(xgb_model, train[train.select_dtypes([np.number]).columns.tolist()], y, folds=5, repeats=4) #validate xgboost


In [25]:
train.reset_index().head()

Unnamed: 0,index,image_height,image_width,price,product_height,product_length,product_width,shoe_size,warranty,weight,...,product_name.NUM_UNIQUE(data.wifi),material.STD(data.product_height),material.STD(data.product_length),len_product_description,author_COUNT,editor_COUNT,log_price,log_len_product_description,label,mean_label
0,0,2552.0,3458.0,4.5,,,,,1,200.0,...,,,,191.0,19914,19860,1.71,5.26,2,
1,1,2254.0,2486.0,15.0,,,,,1,1000.0,...,,,,42.0,19914,19860,2.77,3.76,1,
2,2,1536.0,1536.0,16.0,,,,,1,360.0,...,,,,182.0,19914,19860,2.83,5.21,1,
3,3,1100.0,1100.0,24.0,,,,,1,520.0,...,,,,228.0,19914,19860,3.22,5.43,1,
4,4,450.0,450.0,139.0,,,,,1,300.0,...,,,,592.0,19914,19860,4.94,6.39,0,


In [38]:
import xgboost as xgb 
# define xgboost parameters to use in models

param = {} 
param['objective'] = 'multi:softprob'
param['eval_metric'] =  'mlogloss'
param['booster'] = 'gbtree'
param['eta'] = 0.025
param['subsample'] = 0.7
param['colsample_bytree'] = 0.7
param['num_parallel_tree'] = 3
param['min_child_weight'] = 25
param['gamma'] = 5
param['max_depth'] =  3
param['n_jobs'] =  -1
param['num_class'] =  3

# define function to generate xgboost objects 

def train_test_split(tr, te, num_folds, feats):
    
    Xtrain = []
    ytrain = []
    dtrain = []
    Xval = []
    yval = []
    dval = []

    for i in range(num_folds):
        Xtrain.append(tr.loc[(tr.idx != i), feats].values)
        ytrain.append(tr.loc[(tr.idx != i), 'label'].values)
        dtrain.append(xgb.DMatrix(Xtrain[i],ytrain[i]))
        
        Xval.append(tr.loc[(tr.idx == i), feats].values)
        yval.append(tr.loc[(tr.idx == i), 'label'].values)
        dval.append(xgb.DMatrix(Xval[i],yval[i]))

    Xtest = te.values    
    dtest = xgb.DMatrix(Xtest)
    
    return dtrain, dval, dtest


# train models for the 1 month
train['idx'] = pd.Categorical(train.reset_index().index).codes
train['idx'] = train['idx'] % 5

X = train.reset_index().copy()
Z = test.reset_index().copy()
         
dtrain, dval, dtest = train_test_split(tr = X, te = Z, num_folds = 5, 
                                       feats= [col_ for col_ in train.select_dtypes([np.number]).columns.tolist() if col_ !='label'])

model_m1 = []
for i in range(5):
    model_m1.append(
        xgb.train(
                  param,
                  dtrain[i],
                  50000,
                  [(dtrain[i],'train'), (dval[i],'eval')],
                  early_stopping_rounds = 200,
                  verbose_eval = 0)
    )
    
# run predictions for the 1 month    
    
oof_m1 = []
oof_test_m1 = []
for i in range(5):
    oof_m1.append(model_m1[i].predict(dval[i]))
    oof_test_m1.append(model_m1[i].predict(dtest))
    
test_m1 = np.mean(oof_test_m1, axis=0)    
    
m1 = {}
for i in range(5):
    m1 = {**m1, **dict(zip(X, oof_m1[i]))}
    
m1 = {**m1, **dict(zip(Z, test_m1))}
    
oof_m1 = pd.DataFrame.from_dict(m1, orient='index').reset_index()    

X2 = pd.merge(X.copy(), oof_m1)
Z2 = pd.merge(Z.copy(), oof_m1)

[0]	train-mlogloss:1.09651	eval-mlogloss:1.09693
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 200 rounds.
[1]	train-mlogloss:1.0946	eval-mlogloss:1.09536
[2]	train-mlogloss:1.09281	eval-mlogloss:1.09388
[3]	train-mlogloss:1.09087	eval-mlogloss:1.09232
[4]	train-mlogloss:1.08925	eval-mlogloss:1.09103
[5]	train-mlogloss:1.08744	eval-mlogloss:1.08959
[6]	train-mlogloss:1.08573	eval-mlogloss:1.08828
[7]	train-mlogloss:1.08406	eval-mlogloss:1.08693
[8]	train-mlogloss:1.08266	eval-mlogloss:1.08579
[9]	train-mlogloss:1.08111	eval-mlogloss:1.08461
[10]	train-mlogloss:1.07955	eval-mlogloss:1.0834
[11]	train-mlogloss:1.07816	eval-mlogloss:1.08234
[12]	train-mlogloss:1.07671	eval-mlogloss:1.08117
[13]	train-mlogloss:1.07529	eval-mlogloss:1.08008
[14]	train-mlogloss:1.07394	eval-mlogloss:1.07901
[15]	train-mlogloss:1.07263	eval-mlogloss:1.07797
[16]	train-mlogloss:1.07149	eval-mlogloss:1.07708
[17]	train

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[5497]	train-mlogloss:0.979848	eval-mlogloss:1.01829
[5498]	train-mlogloss:0.979848	eval-mlogloss:1.01829
[5499]	train-mlogloss:0.979848	eval-mlogloss:1.01829
[5500]	train-mlogloss:0.979848	eval-mlogloss:1.01828
[5501]	train-mlogloss:0.979848	eval-mlogloss:1.01829
[5502]	train-mlogloss:0.979848	eval-mlogloss:1.01828
[5503]	train-mlogloss:0.979848	eval-mlogloss:1.01828
[5504]	train-mlogloss:0.979848	eval-mlogloss:1.01828
[5505]	train-mlogloss:0.97984	eval-mlogloss:1.01828
[5506]	train-mlogloss:0.97984	eval-mlogloss:1.01828
[5507]	train-mlogloss:0.97984	eval-mlogloss:1.01829
[5508]	train-mlogloss:0.97984	eval-mlogloss:1.01828
[5509]	train-mlogloss:0.97984	eval-mlogloss:1.01828
[5510]	train-mlogloss:0.97984	eval-mlogloss:1.01829
[5511]	train-mlogloss:0.97984	eval-mlogloss:1.01828
[5512]	train-mlogloss:0.979826	eval-mlogloss:1.01828
[5513]	train-mlogloss:0.979826	eval-mlogloss:1.01827
[5514]	train-mlogloss:0.979817	eval-mlogloss:1.01828
[5515]	train-mlogloss:0.979817	eval-mlogloss:1.01828


ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

# Gradient Boosting

In [26]:
def run_xgb_classifier(train, test, label_column):
    
    target = train[label_column]
    train = train.drop([label_column], axis=1)
    
    eval_set  = [(train,y_train), (valid,y_valid)]
    
    params = {'objective' : 'multi:softprob', 
              'max_depth':3,
              'min_child_weight':2,
              'learning_rate':0.12,
              'n_estimators':80,
              'silent':True,
              'gamma':0,
              'max_delta_step':0,
              'subsample':1,
              'colsample_bytree':1,
              'colsample_bylevel':1,
              'reg_alpha':0,
              'reg_lambda':0,
              'scale_pos_weight':1,
              'seed':1,
              'missing':None}

    clf_xgb = XGBClassifier(**params,eval)

    ppl = Pipeline([("clf", clf_xgb)])

    ppl.fit(train, np.ravel(y))

    pred_train = ppl.predict_proba(train)
    pred_cv = cross_val_predict(ppl, train, np.ravel(y),
                                method='predict_proba', cv=2, n_jobs=-1,verbose=1)

    print("LogLoss on train sample:",log_loss(y_pred=pred_train, y_true=y))
    print("LogLoss on train sample (CV):",log_loss(y_pred=pred_cv, y_true=y))
    
    pred_test = ppl.predict_proba(test)
    return pred_test 

In [27]:
run_xgb_classifier(train[train.select_dtypes([np.number]).columns.tolist()],test[[col_ for col_ in test.select_dtypes([np.number]).columns.tolist() if col_ !='label']],'label')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


LogLoss on train sample: 0.8540737917310105
LogLoss on train sample (CV): 1.0176073800644476


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.5s finished


array([[0.15746106, 0.35389915, 0.48863983],
       [0.26000535, 0.2564277 , 0.48356694],
       [0.37194073, 0.585828  , 0.04223122],
       ...,
       [0.5720547 , 0.2814499 , 0.14649539],
       [0.59563583, 0.31319585, 0.09116832],
       [0.5550139 , 0.22323357, 0.22175251]], dtype=float32)

In [12]:
from sklearn.ensemble import ExtraTreesClassifier

label_column = 'label'
train_,target = split_target_and_df(train._get_numeric_data().fillna(train._get_numeric_data().mean()),label_column)

model = ExtraTreesClassifier(bootstrap=True , 
                                         criterion="gini", 
                                         min_samples_leaf=10, 
                                         min_samples_split=100, 
                                         n_estimators=300,
                                         random_state = 50,
                                         n_jobs = -1)


cv_scores = cross_val_score(model, train_ , target, cv=5, scoring='neg_log_loss')
print(cv_scores)
print('RF CV mean : %.2f ' % (np.mean(cv_scores)))
print('RF CV std : %.2f ' % (np.std(cv_scores)))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

# Submission

In [None]:
df_submission = pd.DataFrame(pred_test, index=test.index)

In [None]:
df_submission.head()

In [None]:
df_submission.to_csv("submission.csv", index_label="id", header=['0', '1', '2'])