# Model Notebook - DataScience Competition Baseline

### Created by Anis Ayari : https://github.com/anisayari on May 2019

Please consider to report any enhancements/bug/modification/use to : aayari@deloitte.fr

# Import Library

In [1]:
#DS & Math
import pandas as pd 
import numpy as np 

#Vizu libraries
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

#sklearn libraries
from sklearn.decomposition import TruncatedSVD,NMF
from sklearn.preprocessing import LabelEncoder, Imputer, OneHotEncoder
from sklearn.model_selection import KFold,cross_val_score,cross_val_predict, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, mean_squared_error, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler


# Other ML libraries
import featuretools as ft
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from stop_words import get_stop_words
stop_words_fr = get_stop_words('fr')
from functools import partial
import scipy as sp
from ml_metrics import quadratic_weighted_kappa
from collections import Counter
from math import sqrt
from sklearn.metrics import confusion_matrix as sk_cmatrix

#Others
import warnings
import csv 
import os 
import time 
warnings.filterwarnings('ignore')

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

  (fname, cnt))
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Feature engineering

### Feature Engineering common functions

In [107]:
"""
FEATURE ENGINEERING COMMON FUNCTIONS
"""
#@TODO : 'Need to check with auto FE libraries
"""
MATHEMATICS FEATURES
"""
def create_mathematics_features(df, column_to_count, column_to_groupby):
    df_tmp = df.groupby(column_to_groupby)[column_to_count].agg(['count','mean', 'std', 'max', 'min'])
    df_tmp.columns =['count_' + column_to_count, 'mean_' + column_to_count, 'std_' + column_to_count,'max_' + column_to_count, 'min_' +column_to_count,]
    df = df.merge(df_tmp, on=column_to_groupby, how='left')
    return df 

"""
NUMERICAL FEATURES
"""
def get_len_columns(df, len_columns):
    for col_ in len_columns:
        df["len_" + col_] = df[col_].str.len()
    return df

def transform_to_log(df,columns_to_log):
    for col_ in columns_to_log:
        df['log_' + col_] = (1+df[col_]).apply(np.log)
    return df

def count_product_per_store(df, column_to_groupby, column_to_count):
    tmp = df.groupby(column_to_groupby).count()[column_to_count].reset_index()
    tmp.columns = [column_to_groupby] + ["number_" + column_to_count + '_' + column_to_groupby]
    df = df.merge(tmp, on=column_to_groupby, how='left')
    return df

def count_item_column(df, column_to_count, column_groupby):
    rescuer_count = df.groupby([column_to_count])[column_groupby].count().reset_index()
    rescuer_count.rename(columns={rescuer_count.columns[0]: column_to_count}, inplace=True)
    rescuer_count.columns = [column_to_count, column_to_count+'_COUNT']
    df = df.merge(rescuer_count, how='left', on=column_to_count)
    return df

def label_encoding(df,columns_to_encode):
    labelencoder = LabelEncoder()
    categ_cols = columns_to_encode
    for columns_ in categ_cols:
        df[columns_+'_ENCODED'] = labelencoder.fit_transform(df[columns_].values.astype(str))
    return df

def binarie_fill(df,column):
    df[column] = df[column].fillna(0)
    if True in df[column].tolist():
        df[column]= np.where(df[column]==True,1,0)
    else:
        df[column]= np.where(df[column]==0,0,1)
    return df

"""
TEXT
"""

def apply_tfidf_vectorizer(df, column):
    df[column] = df[column].fillna("missing")
    df[column] = df[column].astype(str)
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,3), stop_words = stop_words_fr, lowercase=True, 
                                     max_features=50, binary=True, norm=None,use_idf=False)
    tfidf = vectorizer.fit_transform(df[column])
    tfidf_cols = vectorizer.get_feature_names()
    tmp = pd.DataFrame(data=tfidf.toarray(), columns=['tfidf_' + column + '_' + i for i in tfidf_cols])
    df = pd.concat([df, tmp], axis=1)
    return df

"""
IMAGE
"""
#@TODO : 'To fill'

"""
SONG
"""
#@TODO : 'To fill'


def tfidf_nmf_svd(df,text_columns):
    for col_ in tqdm(text_columns):
        print(col_)
        text = df[col_].values.tolist()
        print('[INFO] Start count vectorize')
        cvec = CountVectorizer(min_df=2, ngram_range=(1, 3), max_features=1000,
                               strip_accents='unicode',
                               lowercase=True, analyzer='word', token_pattern=r'\w+',
                               stop_words=stop_words_fr)

        cvec.fit(text)
        X = cvec.transform(text)
        df['cvec_sum'] = X.sum(axis=1)
        df['cvec_mean'] = X.mean(axis=1)
        df['cvec_len'] = (X != 0).sum(axis=1)

        print('[INFO] Start TFDIDF')
        tfv = TfidfVectorizer(min_df=2, max_features=1000,
                              strip_accents='unicode', analyzer='word',
                              ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
                              stop_words=stop_words_fr)

        # Fit TFIDF
        X = tfv.fit_transform(text)
        df['tfidf_sum'] = X.sum(axis=1)
        df['tfidf_mean'] = X.mean(axis=1)
        df['tfidf_len'] = (X != 0).sum(axis=1)
        n_components = 20

        print('[INFO] Start NMF')

        nmf_ = NMF(n_components=n_components)
        X_nmf = nmf_.fit_transform(X)
        X_nmf = pd.DataFrame(X_nmf, columns=['{}_nmf_{}'.format(col_, i) for i in range(n_components)])
        X_nmf['id'] = df.id.values.tolist()
        df = pd.concat([df.set_index('id'), X_nmf.set_index('id')], sort=False, axis=1).reset_index()
        df.rename(columns={df.columns[0]: 'id'}, inplace=True)

        print('[INFO] Start SVD')
        svd = TruncatedSVD(n_components=n_components)
        svd.fit(X)
        print('fit done')
        X_svd = svd.transform(X)
        X_svd = pd.DataFrame(X_svd, columns=['{}_svd_{}'.format(col_, i) for i in range(n_components)])
        X_svd['id'] = df.id.values.tolist()
        df = pd.concat([df.set_index('id'), X_svd.set_index('id')], sort=False, axis=1).reset_index()
        df.rename(columns={df.columns[0]: 'id'}, inplace=True)
        df.drop(col_, axis=1, inplace=True)
    return df

def auto_features(df):
    print('[INFO] Auto Features Processing')
    
    es = ft.EntitySet(id = 'emmaus')
    #es = es.entity_from_dataframe(entity_id = 'data',dataframe = train_test.reset_index(drop=True),make_index = True,index='id')
    es = es.entity_from_dataframe(entity_id='data', index='id', dataframe = df)

    for groupby in ['brand','category','store_name','product_name','material']:
        es = es.normalize_entity(base_entity_id='data', new_entity_id=groupby, index=groupby)
    
    features, feature_names = ft.dfs(entityset = es, target_entity = 'data', max_depth = 2, verbose=2, n_jobs=5)

    # Threshold for removing correlated variables
    threshold = 0.95

    # Absolute value correlation matrix
    corr_matrix = features.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    upper.head(50)

    # Select columns with correlations above threshold
    collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]

    print('There are %d features to remove.' % (len(collinear_features)))

    features_filtered = features.drop(columns = collinear_features)

    print('The number of features that passed the collinearity threshold: ', features_filtered.shape[1])
    features_positive = features_filtered.loc[:, features_filtered.all()]

    pd.concat([df.set_index('id'), features_positive].set_index('id'), axis=1, sort=False)
    print('[INFO] Auto Features DONE')
    return df


def features_engineering(df):
    """
    DROP NOT RELEVANT COLUMN 
    """
    print('[INFO] Dropping Columns...')
    columns_to_drop = ["image_url", "sub_category_3", "sub_category_4"]  #'To fill'
    df.drop(columns_to_drop, axis = 1, inplace = True)    
    """
    TEXT FEATURES
    """
    print('[INFO] Text Features processing')
    column_to_count = 'price'    #'To fill'
    column_to_groupby = 'store_name'    #'To fill'
    df = create_mathematics_features(df, column_to_count, column_to_groupby)
    
    len_columns = ['product_description']  #'To fill'
    df = get_len_columns(df, len_columns)

    text_columns = ['product_description', 'product_name', 'material']  #'To fill'
    df[text_columns] = df[text_columns].fillna('missing')
    
    column_to_encode = ['color','age','product_size',"brand","shoe_size"]  #'To fill'
    df = label_encoding(df, column_to_encode)
        
    count_column = ["brand", "author", "editor"]  #'To fill'
    for col_ in count_column:
        df = count_item_column(df, col_, 'id')
    
    column_to_vectorize = ["sub_category_1", "sub_category_2",'store_name','product_description',
                    'material', 'editor', 'product_name',"author"]  #'To fill'
    for column_ in column_to_vectorize:
        if column_ in df.columns :
            df=apply_tfidf_vectorizer(df,column_)
            df.drop(column_, inplace=True, axis=1)
    
    binary_column = ['warranty','wifi','vintage']  #'To fill'
    for col_ in binary_column:
        df = binarie_fill(df,col_)
    
    columns_to_dummies = ['category']  # 'To fill'
    for col_ in columns_to_dummies:
        df = pd.concat([df.drop(col_, axis=1), pd.get_dummies(df[col_],prefix=col_)], axis=1)
    
    """
    NUMERICAL FEATURES
    """
    
    columns_to_log = ["price", "len_product_description"]  #'To fill'
    transform_to_log(df,columns_to_log)

    to_drop = ["price","id",'image_width','image_height','color','age',
             'product_size',"brand","shoe_size","len_product_description", 
               "condition", "year", "product_width","product_length", "product_height"]  #'To fill'
    df.drop(to_drop,inplace=True, axis=1)

    df = reduce_mem_usage(df)
    
    return df

# Loading Data 

In [120]:
train = pd.read_csv("X_train.csv", index_col=0, error_bad_lines=False)
len_train = len(train)
test = pd.read_csv("X_test.csv", index_col=0, error_bad_lines=False)

train = train.reset_index()
test= test.reset_index()
#t
rain['id'] = train['id'].astype(str)+'_'+train
y = pd.read_csv("y_train.csv", index_col=0)
train_test = pd.concat((train, test), axis=0)
train_test=train_test.reset_index()
#train_test = features_engineering(train_test)
#train_test = train_test.dropna(axis=1)

b'Skipping line 2168: expected 31 fields, saw 33\nSkipping line 4822: expected 31 fields, saw 37\nSkipping line 4859: expected 31 fields, saw 37\nSkipping line 7342: expected 31 fields, saw 37\n'


TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [121]:
train['id']

0          0
1          1
2          2
3          3
4          4
5          5
6          6
7          7
8          8
9          9
10        10
11        11
12        12
13        13
14        14
15        15
16        16
17        17
18        18
19        19
20        20
21        21
22        22
23        23
24        24
25        25
26        26
27        27
28        28
29        29
        ... 
8850    8850
8851    8851
8852    8852
8853    8853
8854    8854
8855    8855
8856    8856
8857    8857
8858    8858
8859    8859
8860    8860
8861    8861
8862    8862
8863    8863
8864    8864
8865    8865
8866    8866
8867    8867
8868    8868
8869    8869
8870    8870
8871    8871
8872    8872
8873    8873
8874    8874
8875    8875
8876    8876
8877    8877
8878    8878
8879    8879
Name: id, Length: 8880, dtype: int64

distributed.core - INFO - Event loop was unresponsive in Nanny for 28.99s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Nanny for 28.99s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Nanny for 28.99s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Nanny for 28.99s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
distributed.core - INFO - Event loop was unresponsive in Nanny for 28.99s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. T

In [111]:
es.entity_from_dataframe(entity_id='data', index='id', dataframe = train_test)

AssertionError: Index is not unique on dataframe (Entity data)

In [110]:
auto_features(train_test)

[INFO] Auto Features Processing


AssertionError: Index is not unique on dataframe (Entity data)

In [61]:
train.columns.tolist()

['images_count',
 'image_width',
 'image_height',
 'image_url',
 'product_description',
 'product_size',
 'material',
 'age',
 'warranty',
 'year',
 'color',
 'product_width',
 'wifi',
 'condition',
 'product_length',
 'shoe_size',
 'vintage',
 'brand',
 'author',
 'editor',
 'product_height',
 'weight',
 'price',
 'category',
 'sub_category_1',
 'sub_category_2',
 'sub_category_3',
 'sub_category_4',
 'product_name',
 'store_name']

In [59]:
"""
Percetage of NaN values in Train
pd.DataFrame({'number_of_nan_train':train.isna().sum().tolist(),
              'percentage_of_nan_train': (train.isna().mean()* 100).round(1).tolist(),
             'number_of_nan_test':test.isna().sum().tolist(),
              'percentage_of_nan_test': (test.isna().mean()* 100).round(1).tolist()},
             index=train.columns).sort_values(by=['number_of_nan_train'], ascending=False)
"""

"\nPercetage of NaN values in Train\npd.DataFrame({'number_of_nan_train':train.isna().sum().tolist(),\n              'percentage_of_nan_train': (train.isna().mean()* 100).round(1).tolist(),\n             'number_of_nan_test':test.isna().sum().tolist(),\n              'percentage_of_nan_test': (test.isna().mean()* 100).round(1).tolist()},\n             index=train.columns).sort_values(by=['number_of_nan_train'], ascending=False)\n"

[INFO] Auto Features Processing
Built 1558 features
EntitySet scattered to 4 workers in 3 seconds
Elapsed: 00:05 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 10/10 chunks


distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/utils.py", line 713, in log_errors
    yield
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/client.py", line 1223, in _close
    quiet_exceptions=(CancelledError,),
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/gen.py", line 584, in with_timeout
    chain_future(future_converted, result)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 166, in chain_future
    future_add_done_callback(a, copy)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 262, in future_add_done_callback
    callback(future)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 160, in copy
    elif a.exception() is not None:
concurrent.futures._base.CancelledError


[INFO] Dropping Columns...


Exception ignored in: <bound method Client.__del__ of <Client: not connected>>
Traceback (most recent call last):
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/client.py", line 1075, in __del__
    self.close()
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/client.py", line 1290, in close
    sync(self.loop, self._close, fast=True)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/utils.py", line 331, in sync


KeyError: "['image_url'] not found in axis"

    six.reraise(*error[0])
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/six.py", line 693, in reraise
    raise value
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/utils.py", line 316, in f
    result[0] = yield future
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/gen.py", line 729, in run
    value = future.result()
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/client.py", line 1223, in _close
    quiet_exceptions=(CancelledError,),
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/gen.py", line 584, in with_timeout
    chain_future(future_converted, result)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 166, in chain_future
    future_add_done_callback(a, copy

Entityset: emmaus
  Entities:
    data [Rows: 11840, Columns: 31]
    brand [Rows: 3330, Columns: 1]
    category [Rows: 12, Columns: 1]
    color [Rows: 19, Columns: 1]
  Relationships:
    data.brand -> brand.brand
    data.category -> category.category
    data.color -> color.color


Entityset: emmaus
  Entities:
    data [Rows: 11840, Columns: 31]
    brand [Rows: 3330, Columns: 1]
    category [Rows: 12, Columns: 1]
    color [Rows: 19, Columns: 1]
  Relationships:
    data.brand -> brand.brand
    data.category -> category.category
    data.color -> color.color
Built 317 features
EntitySet scattered to 5 workers in 4 seconds
Elapsed: 00:00 | Remaining: ? | Progress:   0%|          | Calculated: 0/10 chunks

distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/utils.py", line 713, in log_errors
    yield
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/client.py", line 1223, in _close
    quiet_exceptions=(CancelledError,),
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/gen.py", line 584, in with_timeout
    chain_future(future_converted, result)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 166, in chain_future
    future_add_done_callback(a, copy)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 262, in future_add_done_callback
    callback(future)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 160, in copy
    elif a.exception() is not None:
concurrent.futures._base.CancelledError
Exception i

Elapsed: 00:12 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 10/10 chunks
There are 101 features to remove.
The number of features that passed the collinearity threshold:  216


NameError: name 'LinearSVC' is not defined

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/utils.py", line 713, in log_errors
    yield
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/client.py", line 1223, in _close
    quiet_exceptions=(CancelledError,),
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/gen.py", line 584, in with_timeout
    chain_future(future_converted, result)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 166, in chain_future
    future_add_done_callback(a, copy)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 262, in future_add_done_callback
    callback(future)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 16

In [52]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel


lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(train_X.select_dtypes([np.number]).fillna(-1), train_y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(train_X.select_dtypes([np.number]).fillna(-1))
X_selected_df = pd.DataFrame(X_new, columns=[train_X.select_dtypes([np.number]).fillna(-1).columns[i] for i in range(len(train_X.select_dtypes([np.number]).fillna(-1).columns)) if model.get_support()[i]])
X_selected_df.shape

(8880, 55)

In [53]:
X_selected_df.columns

Index(['image_width', 'image_height', 'product_width', 'product_length',
       'shoe_size', 'product_height', 'price',
       'NUM_WORDS(product_description)', 'brand.MAX(data.image_width)',
       'brand.MAX(data.image_height)', 'brand.MAX(data.year)',
       'brand.MAX(data.shoe_size)', 'brand.MAX(data.price)',
       'brand.MIN(data.shoe_size)', 'brand.MIN(data.price)',
       'brand.MEAN(data.image_width)', 'brand.MEAN(data.image_height)',
       'brand.MEAN(data.year)', 'brand.COUNT(data)',
       'brand.NUM_UNIQUE(data.store_name)', 'category.SUM(data.images_count)',
       'category.STD(data.image_width)', 'category.STD(data.image_height)',
       'category.MAX(data.images_count)', 'category.MAX(data.image_width)',
       'category.MAX(data.image_height)', 'category.MAX(data.product_width)',
       'category.MAX(data.shoe_size)', 'category.MAX(data.price)',
       'category.NUM_UNIQUE(data.sub_category_1)',
       'color.SUM(data.images_count)', 'color.STD(data.image_width)',
 

In [39]:
features_filtered.head()

Unnamed: 0_level_0,images_count,image_width,image_height,product_size,material,age,warranty,year,color,product_width,...,color.MODE(data.brand),color.MODE(data.author),color.MODE(data.editor),color.MODE(data.category),color.MODE(data.sub_category_1),color.MODE(data.sub_category_2),color.MODE(data.sub_category_3),color.MODE(data.sub_category_4),color.MODE(data.product_name),color.MODE(data.store_name)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3,3458.0,2552.0,44.0,100 % polyester,,,,Multicolore,,...,Desigual,GOSCINNY et UDERZO,Dargaud,mode,accessoires femme,label selection,les coups de coeur des vendeurs,Cérémonies,Collection Papillons,Emmaüs 88 Neufchateau
1,2,2486.0,2254.0,,Plastique,,,,Jaune,,...,Ricard,FRANQUIN,G. M. Perrin,mobilier - deco,bibelots et objets déco,label selection,Déco Pop et Colorée,Vélorution !,Bougeoir en laiton,Emmaüs 88 Neufchateau
2,3,1536.0,1536.0,40.0,"Polyester, coton, laine",,,,Gris,,...,ESPRIT,Dom Norbert Nieuwland et Maurice Tschoffen,Duculot,mode,mode,label selection,les coups de coeur des vendeurs,Mode : Carnaval !,Apple iPhone 5S - 16 Go - Gris sidéral - Très ...,Label Emmaüs Chambéry
3,2,1100.0,1100.0,,,,,,,,...,,,,,,,,,,
4,2,450.0,450.0,,,,6 mois,,Blanc,,...,tmp,BOUTET DE MONVEL,"""avx despens de l'avteur""",mobilier - deco,mode,label selection,poupées,Cabinet de curiosités,Napperon rectangulaire brodé main en coton,Label Emmaüs Chambéry


In [14]:
train_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11840 entries, 0 to 11839
Columns: 431 entries, images_count to log_len_product_description
dtypes: float16(420), float32(1), int16(2), int8(8)
memory usage: 9.8 MB


In [54]:
train = train_test.iloc[:len_train, :]
test = train_test.iloc[len_train:, :]

In [55]:
test_id = test.index
train['label'] = y

# Prediction

## Model

#### Random Forest

In [56]:
def split_target_and_df(train,label_column):
    return train.drop([label_column], axis=1),train[label_column]
    
def run_randomforest_classifier(train, test, label_column,scoring='accuracy'):
    
    train,target = split_target_and_df(train,label_column)
    
    params = {'bootstrap': True, 
              'class_weight': None, 
              'criterion': 'gini', 
              'max_depth': None,
              'max_features': 'auto', 
              'max_leaf_nodes': None, 
              'min_impurity_decrease': 0.0, 
              'min_impurity_split': None,
              'min_samples_leaf': 1,
              'min_samples_split': 2, 
              'min_weight_fraction_leaf': 0.0, 
              'n_estimators': 10,
              'n_jobs': -1, 
              'oob_score': False, 
              'random_state': None, 
              'verbose': 0, 
              'warm_start': False}
    
    model = RandomForestClassifier(**params)
    model.fit(train, target)
    pred_train = model.predict(train)
    pred_test = model.predict(test)
    
    cv_scores = cross_val_score(model, train, target, cv=5, scoring=scoring)
    print(cv_scores)
    print('RF CV mean : %.2f ' % (np.mean(cv_scores)))
    print('RF CV std : %.2f ' % (np.std(cv_scores)))
        
    print("True Distribution:")
    print(pd.value_counts(target, normalize=True).sort_index())
    print("Train Predicted Distribution:")
    print(pd.value_counts(pred_train, normalize=True).sort_index())
    print("Test Predicted Distribution:")
    print(pd.value_counts(pred_test, normalize=True).sort_index())
    
    features_importances = pd.Series(model.feature_importances_, index=train.columns)
    features_importances.nlargest(25).plot(kind='barh')
    
    return pred_test


In [57]:
pred_test = run_randomforest_classifier(train,test,"label")

ValueError: could not convert string to float: 'https://d1kvfoyrif6wzg.cloudfront.net/assets/images/None/main/100_6771_3b0f897.JPG'

In [None]:
@TODO : "LightGBM validation CV"

In [None]:
N_SPLITS = 2
pred_test = run_lgbm(train, test,'label',test_id)

#### SVM

In [None]:
def run_svm(train, test, label_column):
    target = train[label_column]
    train = train.drop([label_column], axis=1)
    
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.fit_transform(test)
    
    svm_params = {'C': 1.0, 
                  'cache_size': 200, 
                  'class_weight': None, 
                  'coef0': 0.0, 
                  'decision_function_shape': 'ovr', 
                  'degree': 3, 'gamma': 
                  'auto_deprecated', 
                  'kernel': 'rbf', 
                  'max_iter': -1, 
                  'probability': False, 
                  'random_state': None, 
                  'shrinking': True, 
                  'tol': 0.001, 
                  'verbose': False}
    
    svc=SVC() 
    svc.fit(train_scaled,target)
    y_pred_train=svc.predict(train_scaled)
    score = accuracy_score(target,y_pred_train)
    print('Accuracy Score: %.2f' % (score))
    

In [None]:
run_svm(train, test, "label")

#### Voting Classifier

In [None]:
def run_voting_classifier(train, test, label_column):
    
    target = train[label_column]
    train = train.drop([label_column], axis=1)
    
    ab_params = {'algorithm': 'SAMME.R', 
                 'base_estimator': None, 
                 'learning_rate': 0.1, 
                 'n_estimators': 20, 
                 'random_state': None}
    
    gbc_params = {'criterion': 'friedman_mse', 
                  'init': None, 'learning_rate': 0.1, 
                  'loss': 'deviance', 
                  'max_depth': 30, 
                  'max_features': None, 
                  'max_leaf_nodes': None, 
                  'min_impurity_decrease': 0.0, 
                  'min_impurity_split': None, 
                  'min_samples_leaf': 1, 
                  'min_samples_split': 2, 
                  'min_weight_fraction_leaf': 0.0, 
                  'n_estimators': 100, 
                  'n_iter_no_change': None, 
                  'presort': 'auto', 
                  'random_state': None, 
                  'subsample': 1.0, 
                  'tol': 0.0001, 
                  'validation_fraction': 0.1, 
                  'verbose': 0, 
                  'warm_start': False}
    
    bc_params = {'base_estimator': None, 
                 'bootstrap': True, 
                 'bootstrap_features': False, 
                 'max_features': 10, 
                 'max_samples': 1.0, 
                 'n_estimators': 20, 
                 'n_jobs': None, 
                 'oob_score': False, 
                 'random_state': None, 
                 'verbose': 0, 
                 'warm_start': False}
    
    clf1 = AdaBoostClassifier(**ab_params)
    clf2 = GradientBoostingClassifier(**gbc_params)
    clf3 = BaggingClassifier(**bc_params)
    vote_clf = VotingClassifier(estimators=[('ab', clf1), ('gbc', clf2), ('bc', clf3)], weights=[0.2,1.7,0.6], voting='soft')
    vote_clf = vote_clf.fit(train, target)
    
    pred_train = vote_clf.predict_proba(train)
    pred_cv = cross_val_predict(vote_clf, train, np.ravel(target),
                            method='predict_proba', cv=5, n_jobs=-1)
    pred_test = vote_clf.predict_proba(test)
    
    print("LogLoss on train sample ", log_loss(y_pred=pred_train, y_true=target))
    print("LogLoss on train sample (CV): ", log_loss(y_pred=pred_cv, y_true=target))
    
    return pred_test

In [None]:
pred_test = run_voting_classifier(train, test, "label")

In [None]:
#Gradient Boosting

In [67]:
def run_xgb_classifier(train, test, label_column):
    
    target = train[label_column]
    train = train.drop([label_column], axis=1)

    params = {'objective' : 'multi:softprob', 
              'num_class'  : 3,
              'eval_metric' : 'mlogloss',
              'nthread' : -1, 
              'booster' : "gbtree",
              'gamma' : 0.01, 
              'max_depth' : 7,
              'eta' : 0.1,
              'min_child_weight'  : 0.7
             }

    clf_xgb = XGBClassifier(**params)

    ppl = Pipeline([("clf", clf_xgb)])

    ppl.fit(train, np.ravel(y))

    pred_train = ppl.predict_proba(train)
    pred_cv = cross_val_predict(ppl, train, np.ravel(y),
                                method='predict_proba', cv=5, n_jobs=-1)

    print("LogLoss on train sample:",log_loss(y_pred=pred_train, y_true=y))
    print("LogLoss on train sample (CV):",log_loss(y_pred=pred_cv, y_true=y))
    
    return pred_test 

In [79]:
from sklearn.ensemble import ExtraTreesClassifier

label_column = 'label'
train_,target = split_target_and_df(train.dropna(axis=1),label_column)

model = ExtraTreesClassifier(bootstrap=True , 
                                         criterion="gini", 
                                         min_samples_leaf=10, 
                                         min_samples_split=100, 
                                         n_estimators=300,
                                         random_state = 50,
                                         n_jobs = -1)


cv_scores = cross_val_score(model, train_, target, cv=5, scoring='neg_log_loss')
print(cv_scores)
print('RF CV mean : %.2f ' % (np.mean(cv_scores)))
print('RF CV std : %.2f ' % (np.std(cv_scores)))

[-1.0002766  -1.02109464 -1.00909881 -1.00265007 -1.00062673]
RF CV mean : -1.01 
RF CV std : 0.01 


# Submission

In [None]:
df_submission = pd.DataFrame(pred_test, index=test.index)

In [None]:
df_submission.head()

In [None]:
df_submission.to_csv("submission.csv", index_label="id", header=['0', '1', '2'])