# Model Notebook - DataScience Competition Baseline

### Created by Anis Ayari : https://github.com/anisayari on May 2019

Please consider to report any enhancements/bug/modification/use to : aayari@deloitte.fr

# Import Library

In [1]:
#DS & Math
import pandas as pd 
import numpy as np 

#Vizu libraries
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

#sklearn libraries
from sklearn.decomposition import TruncatedSVD,NMF
from sklearn.preprocessing import LabelEncoder, Imputer, OneHotEncoder
from sklearn.model_selection import KFold,cross_val_score,cross_val_predict, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, mean_squared_error, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler


# Other ML libraries
import featuretools as ft
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from stop_words import get_stop_words
stop_words_fr = get_stop_words('fr')
from functools import partial
import scipy as sp
from ml_metrics import quadratic_weighted_kappa
from collections import Counter
from math import sqrt
from sklearn.metrics import confusion_matrix as sk_cmatrix

#Others
import cv2
import warnings
import csv 
import os 
import time 
import urllib

warnings.filterwarnings('ignore')

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

  (fname, cnt))
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Feature engineering

### Feature Engineering common functions

In [2]:
"""
FEATURE ENGINEERING COMMON FUNCTIONS
"""
#@TODO : 'Need to check with auto FE libraries
"""
MATHEMATICS FEATURES
"""
def create_mathematics_features(df, column_to_count, column_to_groupby):
    df_tmp = df.groupby(column_to_groupby)[column_to_count].agg(['count','mean', 'std', 'max', 'min'])
    df_tmp.columns =['count_' + column_to_count, 'mean_' + column_to_count, 'std_' + column_to_count,'max_' + column_to_count, 'min_' +column_to_count,]
    df = df.merge(df_tmp, on=column_to_groupby, how='left')
    return df 

"""
NUMERICAL FEATURES
"""
def get_len_columns(df, len_columns):
    for col_ in len_columns:
        df["len_" + col_] = df[col_].str.len()
    return df

def transform_to_log(df,columns_to_log):
    for col_ in columns_to_log:
        df['log_' + col_] = (1+df[col_]).apply(np.log)
    return df

def count_product_per_store(df, column_to_groupby, column_to_count):
    tmp = df.groupby(column_to_groupby).count()[column_to_count].reset_index()
    tmp.columns = [column_to_groupby] + ["number_" + column_to_count + '_' + column_to_groupby]
    df = df.merge(tmp, on=column_to_groupby, how='left')
    return df

def count_item_column(df, column_to_count, column_groupby):
    rescuer_count = df.groupby([column_to_count])[column_groupby].count().reset_index()
    rescuer_count.rename(columns={rescuer_count.columns[0]: column_to_count}, inplace=True)
    rescuer_count.columns = [column_to_count, column_to_count+'_COUNT']
    df = df.merge(rescuer_count, how='left', on=column_to_count)
    return df

def label_encoding(df,columns_to_encode):
    labelencoder = LabelEncoder()
    categ_cols = columns_to_encode
    for columns_ in categ_cols:
        df[columns_+'_ENCODED'] = labelencoder.fit_transform(df[columns_].values.astype(str))
    return df

def binarie_fill(df,column):
    df[column] = df[column].fillna(0)
    if True in df[column].tolist():
        df[column]= np.where(df[column]==True,1,0)
    else:
        df[column]= np.where(df[column]==0,0,1)
    return df

"""
TEXT
"""

def apply_tfidf_vectorizer(df, column):
    df[column] = df[column].fillna("missing")
    df[column] = df[column].astype(str)
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,3), stop_words = stop_words_fr, lowercase=True, 
                                     max_features=50, binary=True, norm=None,use_idf=False)
    tfidf = vectorizer.fit_transform(df[column])
    tfidf_cols = vectorizer.get_feature_names()
    tmp = pd.DataFrame(data=tfidf.toarray(), columns=['tfidf_' + column + '_' + i for i in tfidf_cols])
    df = pd.concat([df, tmp], axis=1,sort=False)
    return df

"""
IMAGE
"""
#@TODO : 'To fill'

"""
SONG
"""
#@TODO : 'To fill'


def tfidf_nmf_svd(df,text_columns):
    for col_ in tqdm(text_columns):
        text = df[col_].values.tolist()
        cvec = CountVectorizer(min_df=2, ngram_range=(1, 3), max_features=1000,
                               strip_accents='unicode',
                               lowercase=True, analyzer='word', token_pattern=r'\w+',
                               stop_words=stop_words_fr)
        text = [str(element) for element in text]
        cvec.fit(text)
        X = cvec.transform(text)
        df['cvec_sum'] = X.sum(axis=1)
        df['cvec_mean'] = X.mean(axis=1)
        df['cvec_len'] = (X != 0).sum(axis=1)
        tfv = TfidfVectorizer(min_df=2, max_features=200,
                              strip_accents='unicode', analyzer='word',
                              ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
                              stop_words=stop_words_fr)

        # Fit TFIDF
        X = tfv.fit_transform(text)
        df['tfidf_sum'] = X.sum(axis=1)
        df['tfidf_mean'] = X.mean(axis=1)
        df['tfidf_len'] = (X != 0).sum(axis=1)
        
        """
        n_components = 20

        print('[INFO] Start NMF')

        nmf_ = NMF(n_components=n_components)
        X_nmf = nmf_.fit_transform(X)
        X_nmf = pd.DataFrame(X_nmf, columns=['{}_nmf_{}'.format(col_, i) for i in range(n_components)])
        X_nmf['id'] = df.id.values.tolist()
        df = pd.concat([df.set_index('id'), X_nmf.set_index('id')], sort=False, axis=1).reset_index()
        df.rename(columns={df.columns[0]: 'id'}, inplace=True)

        print('[INFO] Start SVD')
        svd = TruncatedSVD(n_components=n_components)
        svd.fit(X)
        print('fit done')
        X_svd = svd.transform(X)
        X_svd = pd.DataFrame(X_svd, columns=['{}_svd_{}'.format(col_, i) for i in range(n_components)])
        X_svd['id'] = df.id.values.tolist()
        df = pd.concat([df.set_index('id'), X_svd.set_index('id')], sort=False, axis=1).reset_index()
        df.rename(columns={df.columns[0]: 'id'}, inplace=True)
        df.drop(col_, axis=1, inplace=True)
        """
        
    return df

def auto_features(df):
    print('[INFO] Auto Features Processing')
    
    es = ft.EntitySet(id = 'emmaus')
    #es = es.entity_from_dataframe(entity_id = 'data',dataframe = train_test.reset_index(drop=True),make_index = True,index='id')
    es = es.entity_from_dataframe(entity_id='data', index='id', dataframe = df)

    for groupby in ['brand','category','store_name','product_name','material']:
        es = es.normalize_entity(base_entity_id='data', new_entity_id=groupby, index=groupby)
    
    features, feature_names = ft.dfs(entityset = es, target_entity = 'data', max_depth = 2, verbose=2, n_jobs=5)

    df = df.set_index('id').append([features], sort=False)
    return df,feature_names

def drop_higlhy_correlated_features(df):
    # Threshold for removing correlated variables
    threshold = 0.95

    # Absolute value correlation matrix
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    upper.head(50)

    # Select columns with correlations above threshold
    collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]

    print('There are %d features to remove.' % (len(collinear_features)))

    features_filtered = df.drop(columns = collinear_features)

    print('The number of features that passed the collinearity threshold: ', features_filtered.shape[1])
    features_positive = features_filtered.loc[:, features_filtered.all()]
    return features_positive,features_filtered


def features_engineering(df):
    """
    DROP NOT RELEVANT COLUMN 
    """
    print('[INFO] Dropping Columns...')
    columns_to_drop = ["image_url", "sub_category_3", "sub_category_4"]  #'To fill'
    df.drop(columns_to_drop, axis = 1, inplace = True)    
    
    text_columns = df.select_dtypes(include='object').columns.tolist()
    df[text_columns] = df[text_columns].fillna('missing')
    
    df,features_filtered = auto_features(df)
    df = df.reset_index()

    """
    TEXT FEATURES
    """
    print('[INFO] Text Features processing')
    
    df = get_len_columns(df, len_columns=['product_description'])
    
    df = label_encoding(df, columns_to_encode=['color','age','product_size',"brand","shoe_size"] )
        
    #count_column = ["brand", "author", "editor"]  #'To fill'
    #for col_ in count_column:
        #df = count_item_column(df, col_, 'id')
    
    column_to_vectorize = ["sub_category_1", "sub_category_2",'store_name','product_description',
                    'material', 'editor', 'product_name',"author"]  #'To fill'
    
    #for column_ in column_to_vectorize:
        #if column_ in df.columns :
            #df=apply_tfidf_vectorizer(df,column_)
            #df.drop(column_, inplace=True, axis=1)
    df=tfidf_nmf_svd(df,text_columns=column_to_vectorize)
    
    binary_column = ['warranty','wifi','vintage']  #'To fill'
    for col_ in tqdm(binary_column):
        df = binarie_fill(df,col_)
    
    columns_to_dummies = ['category']  # 'To fill'
    for col_ in tqdm(columns_to_dummies):
        df = pd.concat([df.drop(col_, axis=1), pd.get_dummies(df[col_],prefix=col_)], axis=1)
    
    """
    NUMERICAL FEATURES
    """
    column_to_count = 'price'    #'To fill'
    column_to_groupby = 'store_name'    #'To fill'
    #df = create_mathematics_features(df, column_to_count, column_to_groupby)
    
    
    columns_to_log = ["price", "len_product_description"]  #'To fill'
    transform_to_log(df,columns_to_log)

    #to_drop = ["price","id",'image_width','image_height','color','age','product_size',"brand","shoe_size","len_product_description", "condition", "year", "product_width","product_length", "product_height"]  #'To fill'
    #df.drop(to_drop,inplace=True, axis=1)
    #df,features_filtered=drop_higlhy_correlated_features(df)
    train_test.drop('id',axis=1)
    
    df = reduce_mem_usage(df)
    
    return df

In [3]:
def extract_features(df, id_column, column_path):
    print('here______________________________')
    from tensorflow.keras.applications.densenet import preprocess_input, DenseNet121
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
    import tensorflow.keras.backend as K
    import tensorflow as tf
    from tensorflow.python.client import device_lib

    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.Session()

    def get_available_gpus():
        local_device_protos = device_lib.list_local_devices()
        return [x.name for x in local_device_protos if x.device_type == 'GPU']

    print(get_available_gpus())

    def load_image(img_size, path='', url=''):
        def resize_to_square(im, img_size):
            old_size = im.shape[:2]  # old_size is in (height, width) format
            ratio = float(img_size) / max(old_size)
            new_size = tuple([int(x * ratio) for x in old_size])
            # new_size should be in (width, height) format
            im = cv2.resize(im, (new_size[1], new_size[0]))
            delta_w = img_size - new_size[1]
            delta_h = img_size - new_size[0]
            top, bottom = delta_h // 2, delta_h - (delta_h // 2)
            left, right = delta_w // 2, delta_w - (delta_w // 2)
            color = [0, 0, 0]
            new_im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
            return new_im
        if url =='':
            image = cv2.imread(path)
            
        elif path=='': 
                # download the image, convert it to a NumPy array, and then read
            # it into OpenCV format
            resp = urllib.request.urlopen(url)
            image = np.asarray(bytearray(resp.read()), dtype="uint8")
            image = cv2.imdecode(image, cv2.IMREAD_COLOR)

        new_image = resize_to_square(image, img_size)
        new_image = preprocess_input(new_image)
        return new_image

    def init_densenet():
        print('[INFO] Init Densenet...')
        inp = Input((256, 256, 3))
        print('[INFO] import Densenet...')
        backbone = DenseNet121(input_tensor=inp, include_top=False,
                               weights='../input/densenet-121-weights/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5')
        print('[INFO] import Densenet DONE')
        x = backbone.output
        x = GlobalAveragePooling2D()(x)
        x = Lambda(lambda x: K.expand_dims(x, axis=-1))(x)
        x = AveragePooling1D(4)(x)
        out = Lambda(lambda x: x[:, :, 0])(x)
        m = Model(inp, out)
        print('[INFO] Init Densenet DONE.')
        return m

    m = init_densenet()

    print('[INFO] Start Image Features_Extraction...')
    img_size = 256
    batch_size = 16
    ids = df[id_column].values
    n_batches = len(ids) // batch_size + 1
    features = {}
    for b in tqdm(range(n_batches)):
        start = b * batch_size
        end = (b + 1) * batch_size
        batch_ids = ids[start:end]
        batch_images = np.zeros((len(batch_ids), img_size, img_size, 3))
        for i, id_ in enumerate(batch_ids):
            
            #image_name = '{}-{}.jpg'.format(id_, 1)
            #image_path = jp(input_dir, subfolder, image_name)
            image_path= df.loc[df['id']==id_][column_path].values[0]
            try:
                batch_images[i] = load_image(256,url=image_path)
            except:
                continue
        batch_preds = m.predict(batch_images)
        for i, id_ in enumerate(batch_ids):
            features[id_] = batch_preds[i]

    df_features = pd.DataFrame.from_dict(features, orient='index')
    df_features.rename(columns=lambda k: 'img_{}'.format(k), inplace=True)
    df_features.reset_index(inplace=True)
    df_features.rename(columns={df_features.columns[0]: id_column}, inplace=True)
    n_components = 200
    svd = TruncatedSVD(n_components=n_components)
    X = df_features[['img_{}'.format(k) for k in range(256)]].values
    svd.fit(X)
    print('fit done')
    X_svd = svd.transform(X)
    X_svd = pd.DataFrame(X_svd, columns=['img_svd_{}'.format(i) for i in range(n_components)])
    X_svd[id_column] = df.id.values.tolist()

    df = pd.concat([df.set_index(id_column), X_svd.set_index(id_column)], sort=False, axis=1).reset_index()
    df.rename(columns={df.columns[0]: id_column}, inplace=True)
    print('[INFO] Image Features_Extraction DONE.')
    return df

In [None]:
test_df = extract_features(train, 'id', 'image_url')

here______________________________
[]
[INFO] Init Densenet...
[INFO] import Densenet...
[INFO] import Densenet DONE
[INFO] Init Densenet DONE.
[INFO] Start Image Features_Extraction...


HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

# Loading Data 

In [4]:
train = pd.read_csv("X_train.csv", index_col=0, error_bad_lines=False)
len_train = len(train)
test = pd.read_csv("X_test.csv", index_col=0, error_bad_lines=False)

train = train.reset_index()
test= test.reset_index()
train['id']  = train['id'].astype(str)+'_'+'train'
test['id']  = test['id'].astype(str)+'_'+'test'

#t
y = pd.read_csv("y_train.csv", index_col=0)
train_test = pd.concat((train, test), axis=0)
train_test = features_engineering(train_test.sample(10))
#train_test = train_test.dropna(axis=1)

b'Skipping line 2169: expected 31 fields, saw 33\nSkipping line 4823: expected 31 fields, saw 37\nSkipping line 4860: expected 31 fields, saw 37\nSkipping line 7343: expected 31 fields, saw 37\n'


[INFO] Dropping Columns...
[INFO] Auto Features Processing
Built 403 features
EntitySet scattered to 1 workers in 1 seconds
Elapsed: 00:01 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 1/1 chunks
[INFO] Text Features processing


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

[INFO] Start count vectorize
[INFO] Start TFDIDF
[INFO] Start count vectorize
[INFO] Start TFDIDF
[INFO] Start count vectorize
[INFO] Start TFDIDF
[INFO] Start count vectorize
[INFO] Start TFDIDF
[INFO] Start count vectorize
[INFO] Start TFDIDF
[INFO] Start count vectorize
[INFO] Start TFDIDF
[INFO] Start count vectorize
[INFO] Start TFDIDF
[INFO] Start count vectorize
[INFO] Start TFDIDF



HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Memory usage of dataframe is 0.06 MB
Memory usage after optimization is: 0.04 MB
Decreased by 31.9%


distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/utils.py", line 713, in log_errors
    yield
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/client.py", line 1223, in _close
    quiet_exceptions=(CancelledError,),
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/gen.py", line 584, in with_timeout
    chain_future(future_converted, result)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 166, in chain_future
    future_add_done_callback(a, copy)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 262, in future_add_done_callback
    callback(future)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 16

Unnamed: 0,id,images_count,image_width,image_height,product_description,product_size,material,age,warranty,year,...,tfidf_sum,tfidf_mean,tfidf_len,category_enfance,category_librairie,category_loisirs,category_mobilier - deco,category_mode,log_price,log_len_product_description
0,480_test,4,3152.0,2124.0,Retrouvez dans ce livre datant de 1890 tous le...,missing,missing,missing,1,1890.0,...,2.449219,0.188477,6,0.0,0.0,1.0,0.0,0.0,3.257812,5.390625
1,8634_train,1,616.0,616.0,<p><strong>Livre d'occasion écrit par Elisa Vi...,missing,missing,missing,1,2002.0,...,1.732422,0.133179,3,0.0,1.0,0.0,0.0,0.0,1.931641,6.625
2,283_train,5,1100.0,1100.0,Voiture Miniature Porsche Boxster gris métall...,missing,missing,missing,1,,...,1.0,0.076904,1,1.0,0.0,0.0,0.0,0.0,3.134766,5.410156
3,1450_train,5,2000.0,3008.0,"Ancienne à restaurer , en métal , girouette de...",missing,Métal,missing,1,,...,1.0,0.076904,1,0.0,0.0,0.0,1.0,0.0,3.892578,4.316406
4,2817_test,1,616.0,616.0,<p><strong>Livre d'occasion écrit par Alexandr...,missing,missing,missing,1,2014.0,...,1.732422,0.133179,3,0.0,1.0,0.0,0.0,0.0,1.791992,6.691406
5,251_test,3,1536.0,1536.0,Écharpe femme en laine marron et blanche. Dime...,missing,Laine,missing,1,,...,1.0,0.076904,1,0.0,0.0,0.0,0.0,1.0,2.484375,4.277344
6,397_train,3,1100.0,1100.0,Foulard Gold by Alfredo Versace fabriqué en I...,missing,missing,missing,1,,...,1.0,0.076904,1,0.0,0.0,0.0,0.0,1.0,3.583984,5.050781
7,2785_test,5,485.0,1024.0,La tète et le corps soont en maillechort (alli...,missing,missing,missing,1,,...,1.0,0.076904,1,0.0,0.0,1.0,0.0,0.0,4.394531,4.746094
8,8720_train,3,4608.0,2392.0,6 Verres à vin Lausitzer Weibwasser Design - M...,missing,missing,missing,1,,...,1.0,0.076904,1,0.0,0.0,0.0,1.0,0.0,3.044922,5.367188
9,8788_train,6,2448.0,2448.0,Jean bicolore pour homme de la marque KAPORAL....,40,100% coton,missing,1,,...,1.0,0.076904,1,0.0,0.0,0.0,0.0,1.0,2.197266,5.964844


In [4]:
train_test.reset_index(drop=True)
#fp.drop('id', inplace=True, axis=1)
train = train_test.iloc[:len_train, :]
test = train_test.iloc[len_train:, :]

NameError: name 'fp' is not defined

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/utils.py", line 713, in log_errors
    yield
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/client.py", line 1223, in _close
    quiet_exceptions=(CancelledError,),
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/gen.py", line 584, in with_timeout
    chain_future(future_converted, result)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 166, in chain_future
    future_add_done_callback(a, copy)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 262, in future_add_done_callback
    callback(future)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 16

In [112]:
fp = auto_features(train_test)

[INFO] Auto Features Processing
Built 507 features
EntitySet scattered to 5 workers in 5 seconds
Elapsed: 00:35 | Remaining: 00:00 | Progress: 100%|██████████████████████████████████████████| Calculated: 10/10 chunks
There are 212 features to remove.
The number of features that passed the collinearity threshold:  295


In [146]:
train_test.columns.tolist()

['id',
 'images_count',
 'image_width',
 'image_height',
 'product_size',
 'age',
 'color',
 'product_width',
 'condition',
 'product_length',
 'shoe_size',
 'brand',
 'product_height',
 'NUM_WORDS(image_url)',
 'NUM_WORDS(product_description)',
 'brand.STD(data.product_width)',
 'brand.STD(data.product_length)',
 'brand.STD(data.product_height)',
 'brand.MAX(data.image_width)',
 'brand.MAX(data.image_height)',
 'brand.MAX(data.year)',
 'brand.MAX(data.shoe_size)',
 'brand.MAX(data.price)',
 'brand.SKEW(data.product_width)',
 'brand.SKEW(data.product_length)',
 'brand.SKEW(data.product_height)',
 'brand.MIN(data.shoe_size)',
 'brand.MIN(data.weight)',
 'brand.MIN(data.price)',
 'brand.MEAN(data.image_width)',
 'brand.MEAN(data.image_height)',
 'brand.MEAN(data.year)',
 'brand.MEAN(data.shoe_size)',
 'brand.COUNT(data)',
 'brand.NUM_UNIQUE(data.store_name)',
 'brand.MODE(data.product_size)',
 'brand.MODE(data.material)',
 'brand.MODE(data.age)',
 'brand.MODE(data.warranty)',
 'brand.MOD

In [147]:
train = train_test.iloc[:len_train, :]
test = train_test.iloc[len_train:, :]
test_id = test.index
train['label'] = y

In [48]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

train_X = train.copy()
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(train_X.select_dtypes([np.number]).fillna(-1), train_y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(train_X.select_dtypes([np.number]).fillna(-1))
X_selected_df = pd.DataFrame(X_new, columns=[train_X.select_dtypes([np.number]).fillna(-1).columns[i] for i in range(len(train_X.select_dtypes([np.number]).fillna(-1).columns)) if model.get_support()[i]])
print(X_selected_df.shape)
X_selected_df.columns

(8880, 75)


Index(['images_count', 'image_width', 'image_height', 'year', 'product_width',
       'product_length', 'shoe_size', 'product_height', 'price', 'image_width',
       'image_height', 'product_width', 'product_length', 'shoe_size',
       'product_height', 'price', 'NUM_WORDS(product_description)',
       'brand.MAX(data.image_width)', 'brand.MAX(data.image_height)',
       'brand.MAX(data.year)', 'brand.MAX(data.shoe_size)',
       'brand.MAX(data.price)', 'brand.MIN(data.shoe_size)',
       'brand.MIN(data.price)', 'brand.MEAN(data.image_width)',
       'brand.MEAN(data.image_height)', 'brand.MEAN(data.year)',
       'brand.COUNT(data)', 'brand.NUM_UNIQUE(data.store_name)',
       'category.SUM(data.images_count)', 'category.SUM(data.price)',
       'category.STD(data.image_width)', 'category.STD(data.image_height)',
       'category.MAX(data.images_count)', 'category.MAX(data.image_width)',
       'category.MAX(data.image_height)', 'category.MAX(data.product_width)',
       'category.

In [142]:
train_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11850 entries, 0 to 11849
Columns: 181 entries, id to product_description_svd_11
dtypes: category(80), float16(82), float32(8), float64(11)
memory usage: 5.5 MB


# Prediction

## Model

#### Random Forest

In [50]:
def split_target_and_df(train,label_column):
    return train.drop([label_column], axis=1),train[label_column]
    
def run_randomforest_classifier(train, test, label_column,scoring='accuracy'):
    
    train,target = split_target_and_df(train,label_column)
    
    params = {'bootstrap': True, 
              'class_weight': None, 
              'criterion': 'gini', 
              'max_depth': None,
              'max_features': 'auto', 
              'max_leaf_nodes': None, 
              'min_impurity_decrease': 0.0, 
              'min_impurity_split': None,
              'min_samples_leaf': 1,
              'min_samples_split': 2, 
              'min_weight_fraction_leaf': 0.0, 
              'n_estimators': 10,
              'n_jobs': -1, 
              'oob_score': False, 
              'random_state': None, 
              'verbose': 0, 
              'warm_start': False}
    
    model = RandomForestClassifier(**params)
    model.fit(train, target)
    pred_train = model.predict(train)
    pred_test = model.predict(test)
    
    cv_scores = cross_val_score(model, train, target, cv=5, scoring=scoring)
    print(cv_scores)
    print('RF CV mean : %.2f ' % (np.mean(cv_scores)))
    print('RF CV std : %.2f ' % (np.std(cv_scores)))
        
    print("True Distribution:")
    print(pd.value_counts(target, normalize=True).sort_index())
    print("Train Predicted Distribution:")
    print(pd.value_counts(pred_train, normalize=True).sort_index())
    print("Test Predicted Distribution:")
    print(pd.value_counts(pred_test, normalize=True).sort_index())
    
    features_importances = pd.Series(model.feature_importances_, index=train.columns)
    features_importances.nlargest(25).plot(kind='barh')
    
    return pred_test


In [53]:
pred_test = run_randomforest_classifier(train,test,"label")

ValueError: could not convert string to float: 'https://d1kvfoyrif6wzg.cloudfront.net/assets/images/None/main/100_6771_3b0f897.JPG'

In [None]:
@TODO : "LightGBM validation CV"

In [None]:
N_SPLITS = 2
pred_test = run_lgbm(train, test,'label',test_id)

#### SVM

In [None]:
def run_svm(train, test, label_column):
    target = train[label_column]
    train = train.drop([label_column], axis=1)
    
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.fit_transform(test)
    
    svm_params = {'C': 1.0, 
                  'cache_size': 200, 
                  'class_weight': None, 
                  'coef0': 0.0, 
                  'decision_function_shape': 'ovr', 
                  'degree': 3, 'gamma': 
                  'auto_deprecated', 
                  'kernel': 'rbf', 
                  'max_iter': -1, 
                  'probability': False, 
                  'random_state': None, 
                  'shrinking': True, 
                  'tol': 0.001, 
                  'verbose': False}
    
    svc=SVC() 
    svc.fit(train_scaled,target)
    y_pred_train=svc.predict(train_scaled)
    score = accuracy_score(target,y_pred_train)
    print('Accuracy Score: %.2f' % (score))
    

In [None]:
run_svm(train, test, "label")

#### Voting Classifier

In [None]:
def run_voting_classifier(train, test, label_column):
    
    target = train[label_column]
    train = train.drop([label_column], axis=1)
    
    ab_params = {'algorithm': 'SAMME.R', 
                 'base_estimator': None, 
                 'learning_rate': 0.1, 
                 'n_estimators': 20, 
                 'random_state': None}
    
    gbc_params = {'criterion': 'friedman_mse', 
                  'init': None, 'learning_rate': 0.1, 
                  'loss': 'deviance', 
                  'max_depth': 30, 
                  'max_features': None, 
                  'max_leaf_nodes': None, 
                  'min_impurity_decrease': 0.0, 
                  'min_impurity_split': None, 
                  'min_samples_leaf': 1, 
                  'min_samples_split': 2, 
                  'min_weight_fraction_leaf': 0.0, 
                  'n_estimators': 100, 
                  'n_iter_no_change': None, 
                  'presort': 'auto', 
                  'random_state': None, 
                  'subsample': 1.0, 
                  'tol': 0.0001, 
                  'validation_fraction': 0.1, 
                  'verbose': 0, 
                  'warm_start': False}
    
    bc_params = {'base_estimator': None, 
                 'bootstrap': True, 
                 'bootstrap_features': False, 
                 'max_features': 10, 
                 'max_samples': 1.0, 
                 'n_estimators': 20, 
                 'n_jobs': None, 
                 'oob_score': False, 
                 'random_state': None, 
                 'verbose': 0, 
                 'warm_start': False}
    
    clf1 = AdaBoostClassifier(**ab_params)
    clf2 = GradientBoostingClassifier(**gbc_params)
    clf3 = BaggingClassifier(**bc_params)
    vote_clf = VotingClassifier(estimators=[('ab', clf1), ('gbc', clf2), ('bc', clf3)], weights=[0.2,1.7,0.6], voting='soft')
    vote_clf = vote_clf.fit(train, target)
    
    pred_train = vote_clf.predict_proba(train)
    pred_cv = cross_val_predict(vote_clf, train, np.ravel(target),
                            method='predict_proba', cv=5, n_jobs=-1)
    pred_test = vote_clf.predict_proba(test)
    
    print("LogLoss on train sample ", log_loss(y_pred=pred_train, y_true=target))
    print("LogLoss on train sample (CV): ", log_loss(y_pred=pred_cv, y_true=target))
    
    return pred_test

In [None]:
pred_test = run_voting_classifier(train, test, "label")

In [None]:
#Gradient Boosting

In [158]:
def run_xgb_classifier(train, test, label_column):
    
    target = train[label_column]
    train = train.drop([label_column], axis=1)

    params = {'objective' : 'multi:softprob', 
              'num_class'  : 3,
              'eval_metric' : 'mlogloss',
              'nthread' : -1, 
              'booster' : "gbtree",
              'gamma' : 0.1, 
              'max_depth' : 5,
              'eta' : 0.1,
              'min_child_weight'  : 0.7
             }

    clf_xgb = XGBClassifier(**params)

    ppl = Pipeline([("clf", clf_xgb)])

    ppl.fit(train, np.ravel(y))

    pred_train = ppl.predict_proba(train)
    pred_cv = cross_val_predict(ppl, train, np.ravel(y),
                                method='predict_proba', cv=5, n_jobs=-1,verbose=1)

    print("LogLoss on train sample:",log_loss(y_pred=pred_train, y_true=y))
    print("LogLoss on train sample (CV):",log_loss(y_pred=pred_cv, y_true=y))
    
    pred_test = ppl.predict_proba(test)
    return pred_test 

In [160]:
run_xgb_classifier(train._get_numeric_data().fillna(train._get_numeric_data().mean()),test._get_numeric_data().fillna(test._get_numeric_data().mean()),'label')

LogLoss on train sample: 0.24616178027238814
LogLoss on train sample (CV): 1.4492864786074622


array([[0.06511849, 0.8484266 , 0.08645495],
       [0.20762211, 0.47952724, 0.31285062],
       [0.06228445, 0.33334592, 0.60436964],
       ...,
       [0.07604016, 0.13513353, 0.78882635],
       [0.05678065, 0.37313035, 0.57008904],
       [0.21156481, 0.36582002, 0.42261523]], dtype=float32)

In [163]:
from sklearn.ensemble import ExtraTreesClassifier

label_column = 'label'
train_,target = split_target_and_df(train._get_numeric_data().fillna(train._get_numeric_data().mean()),label_column)

model = ExtraTreesClassifier(bootstrap=True , 
                                         criterion="gini", 
                                         min_samples_leaf=10, 
                                         min_samples_split=100, 
                                         n_estimators=300,
                                         random_state = 50,
                                         n_jobs = -1)


cv_scores = cross_val_score(model, train_ , target, cv=5, scoring='neg_log_loss')
print(cv_scores)
print('RF CV mean : %.2f ' % (np.mean(cv_scores)))
print('RF CV std : %.2f ' % (np.std(cv_scores)))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

# Submission

In [None]:
df_submission = pd.DataFrame(pred_test, index=test.index)

In [None]:
df_submission.head()

In [None]:
df_submission.to_csv("submission.csv", index_label="id", header=['0', '1', '2'])