# Model Notebook - DataScience Competition Baseline

### Created by Anis Ayari : https://github.com/anisayari on May 2019

Please consider to report any enhancements/bug/modification/use to : aayari@deloitte.fr

# Import Library

In [1]:
#DS & Math
import pandas as pd 
import numpy as np 

#Vizu libraries
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

#sklearn libraries
from sklearn.decomposition import TruncatedSVD,NMF
from sklearn.preprocessing import LabelEncoder, Imputer, OneHotEncoder
from sklearn.model_selection import KFold,cross_val_score,cross_val_predict, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss, mean_squared_error, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
# Other ML libraries
import featuretools as ft
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from stop_words import get_stop_words
stop_words_fr = get_stop_words('fr')
from functools import partial
import scipy as sp
from ml_metrics import quadratic_weighted_kappa
from collections import Counter
from math import sqrt
from sklearn.metrics import confusion_matrix as sk_cmatrix

#Others
import cv2
import warnings
import csv 
import os 
import time 
import urllib
import utils

warnings.filterwarnings('ignore')

  (fname, cnt))
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Feature engineering

### Feature Engineering common functions

In [2]:
def auto_features(df):
    print('[INFO] Auto Features Processing')
    
    es = ft.EntitySet(id = 'emmaus')
    #es = es.entity_from_dataframe(entity_id = 'data',dataframe = train_test.reset_index(drop=True),make_index = True,index='id')
    es = es.entity_from_dataframe(entity_id='data', index='id', dataframe = df)

    for groupby in ['brand','category','store_name','product_name','material']:
        es = es.normalize_entity(base_entity_id='data', new_entity_id=groupby, index=groupby)
    
    features, feature_names = ft.dfs(entityset = es, target_entity = 'data', max_depth = 2, verbose=2, n_jobs=5)

    df = df.set_index('id').append([features], sort=False)
    return df,feature_names

def drop_higlhy_correlated_features(df):
    # Threshold for removing correlated variables
    threshold = 0.95

    # Absolute value correlation matrix
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    upper.head(50)

    # Select columns with correlations above threshold
    collinear_features = [column for column in upper.columns if any(upper[column] > threshold)]

    print('There are %d features to remove.' % (len(collinear_features)))

    features_filtered = df.drop(columns = collinear_features)

    print('The number of features that passed the collinearity threshold: ', features_filtered.shape[1])
    features_positive = features_filtered.loc[:, features_filtered.all()]
    return features_positive,features_filtered


def features_engineering(df):
    """
    DROP NOT RELEVANT COLUMN 
    """
    print('[INFO] Dropping Columns...')
    columns_to_drop = ["image_url", "sub_category_3", "sub_category_4"]  #'To fill'
    df.drop(columns_to_drop, axis = 1, inplace = True)    
    
    text_columns = df.select_dtypes(include='object').columns.tolist()
    df[text_columns] = df[text_columns].fillna('missing')
    
    df,features_filtered = auto_features(df)
    df = df.reset_index()
    """
    ENCODE FEATURES
    """
    #df =utils.encoded_columns(df,df.select_dtypes(include=['category','object']).columns.tolist())
    df = utils.label_encoding(df, columns_to_encode=['color','age','product_size',"brand","shoe_size"] )
    """
    TEXT FEATURES
    """
    print('[INFO] Text Features processing')
    
    df = utils.get_len_columns(df, len_columns=['product_description'])
    
        

    
    column_to_vectorize = ["sub_category_1", "sub_category_2",'store_name','product_description',
                    'material', 'editor', 'product_name',"author"]  #'To fill'
    
    #for column_ in column_to_vectorize:
        #if column_ in df.columns :
            #df=apply_tfidf_vectorizer(df,column_)
            #df.drop(column_, inplace=True, axis=1)
    
    df= utils.tfidf_nmf_svd(df,text_columns=column_to_vectorize)
    
    binary_column = ['warranty','wifi','vintage']  #'To fill'
    for col_ in tqdm(binary_column):
        df = utils.binarie_fill(df,col_)
    
    columns_to_dummies = ['category']  # 'To fill'
    for col_ in tqdm(columns_to_dummies):
        df = pd.concat([df.drop(col_, axis=1), pd.get_dummies(df[col_],prefix=col_)], axis=1)
    
    """
    NUMERICAL FEATURES
    """
    count_column = ["brand", "author", "editor"]  #'To fill'
    for col_ in count_column:
        df = utils.count_item_column(df, col_, 'id')
        
    column_to_count = 'price'    #'To fill'
    column_to_groupby = 'store_name'    #'To fill'
    df = utils.create_mathematics_features(df, column_to_count, column_to_groupby)
    
    
    columns_to_log = ["price", "len_product_description"]  #'To fill'
    utils.transform_to_log(df,columns_to_log)

    #to_drop = ["price","id",'image_width','image_height','color','age','product_size',"brand","shoe_size","len_product_description", "condition", "year", "product_width","product_length", "product_height"]  #'To fill'
    #df.drop(to_drop,inplace=True, axis=1)
    #df,features_filtered=drop_higlhy_correlated_features(df)
    df.drop('id',axis=1,inplace=True)
    df = utils.reduce_mem_usage(df)
    
    return df

# Loading Data 

In [3]:
train = pd.read_csv("X_train.csv", index_col=0, error_bad_lines=False,header=0)
len_train = len(train)
test = pd.read_csv("X_test.csv", index_col=0, error_bad_lines=False,header=0)
y = pd.read_csv("y_train.csv", index_col=0)
train = train.reset_index()
test= test.reset_index()
train['id']  = train['id'].astype(str)+'_'+'train'
test['id']  = test['id'].astype(str)+'_'+'test'
train_test = pd.concat((train, test), axis=0)
train_test = features_engineering(train_test)
print(train_test.shape)
train = train_test.iloc[:len_train, :]
test = train_test.iloc[len_train:, :]
test_id = test.index
train['label'] = y

b'Skipping line 2168: expected 31 fields, saw 33\nSkipping line 4822: expected 31 fields, saw 37\nSkipping line 4859: expected 31 fields, saw 37\nSkipping line 7342: expected 31 fields, saw 37\n'


[INFO] Dropping Columns...
[INFO] Auto Features Processing
Built 483 features
EntitySet scattered to 5 workers in 4 seconds
Elapsed: 00:38 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 10/10 chunks


TypeError: ("'<' not supported between instances of 'str' and 'bool'", 'occurred at index wifi')

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.utils - ERROR - 
Traceback (most recent call last):
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/utils.py", line 713, in log_errors
    yield
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/distributed/client.py", line 1223, in _close
    quiet_exceptions=(CancelledError,),
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/gen.py", line 584, in with_timeout
    chain_future(future_converted, result)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 166, in chain_future
    future_add_done_callback(a, copy)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 262, in future_add_done_callback
    callback(future)
  File "/anaconda3/envs/datascience-env/lib/python3.6/site-packages/tornado/concurrent.py", line 16

In [6]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

train_X = train.copy()
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(train_X.select_dtypes([np.number]).fillna(-1), y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(train_X.select_dtypes([np.number]).fillna(-1))
X_selected_df = pd.DataFrame(X_new, columns=[train_X.select_dtypes([np.number]).fillna(-1).columns[i] for i in range(len(train_X.select_dtypes([np.number]).fillna(-1).columns)) if model.get_support()[i]])
print(X_selected_df.shape)
X_selected_df.columns

(8880, 32)


Index(['images_count', 'image_width', 'image_height', 'year', 'product_width',
       'product_length', 'shoe_size', 'vintage', 'product_height', 'price',
       'len_product_description', 'color_ENCODED', 'age_ENCODED',
       'product_size_ENCODED', 'brand_ENCODED', 'shoe_size_ENCODED',
       'cvec_len', 'tfidf_len', 'category_label selection', 'category_loisirs',
       'category_mobilier - deco', 'category_mode', 'brand_COUNT',
       'author_COUNT', 'editor_COUNT', 'count_price', 'mean_price',
       'std_price', 'max_price', 'min_price', 'log_price',
       'log_len_product_description'],
      dtype='object')

In [7]:
train_te

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23680 entries, 0 to 23679
Columns: 517 entries, images_count to log_len_product_description
dtypes: category(88), float16(358), float32(54), float64(1), int16(5), int8(11)
memory usage: 26.1 MB


In [28]:
train_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23680 entries, 0 to 23679
Columns: 517 entries, images_count to log_len_product_description
dtypes: category(88), float16(358), float32(54), float64(1), int16(5), int8(11)
memory usage: 26.1 MB


# Prediction

## Model

#### Random Forest

In [50]:
def split_target_and_df(train,label_column):
    return train.drop([label_column], axis=1),train[label_column]
    
def run_randomforest_classifier(train, test, label_column,scoring='accuracy'):
    
    train,target = split_target_and_df(train,label_column)
    
    params = {'bootstrap': True, 
              'class_weight': None, 
              'criterion': 'gini', 
              'max_depth': None,
              'max_features': 'auto', 
              'max_leaf_nodes': None, 
              'min_impurity_decrease': 0.0, 
              'min_impurity_split': None,
              'min_samples_leaf': 1,
              'min_samples_split': 2, 
              'min_weight_fraction_leaf': 0.0, 
              'n_estimators': 10,
              'n_jobs': -1, 
              'oob_score': False, 
              'random_state': None, 
              'verbose': 0, 
              'warm_start': False}
    
    model = RandomForestClassifier(**params)
    model.fit(train, target)
    pred_train = model.predict(train)
    pred_test = model.predict(test)
    
    cv_scores = cross_val_score(model, train, target, cv=5, scoring=scoring)
    print(cv_scores)
    print('RF CV mean : %.2f ' % (np.mean(cv_scores)))
    print('RF CV std : %.2f ' % (np.std(cv_scores)))
        
    print("True Distribution:")
    print(pd.value_counts(target, normalize=True).sort_index())
    print("Train Predicted Distribution:")
    print(pd.value_counts(pred_train, normalize=True).sort_index())
    print("Test Predicted Distribution:")
    print(pd.value_counts(pred_test, normalize=True).sort_index())
    
    features_importances = pd.Series(model.feature_importances_, index=train.columns)
    features_importances.nlargest(25).plot(kind='barh')
    
    return pred_test


In [53]:
pred_test = run_randomforest_classifier(train,test,"label")

ValueError: could not convert string to float: 'https://d1kvfoyrif6wzg.cloudfront.net/assets/images/None/main/100_6771_3b0f897.JPG'

In [None]:
#TODO:'LightGBM validation CV'

In [None]:
N_SPLITS = 2
pred_test = run_lgbm(train, test,'label',test_id)

#### SVM

In [8]:
def run_svm(train, test, label_column):
    target = train[label_column]
    train = train.drop([label_column], axis=1)
    
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.fit_transform(test)
    
    svm_params = {'C': 1.0, 
                  'cache_size': 200, 
                  'class_weight': None, 
                  'coef0': 0.0, 
                  'decision_function_shape': 'ovr', 
                  'degree': 3, 'gamma': 
                  'auto_deprecated', 
                  'kernel': 'rbf', 
                  'max_iter': -1, 
                  'probability': False, 
                  'random_state': None, 
                  'shrinking': True, 
                  'tol': 0.001, 
                  'verbose': False}
    
    svc=SVC() 
    svc.fit(train_scaled,target)
    y_pred_train=svc.predict(train_scaled)
    score = accuracy_score(target,y_pred_train)
    print('Accuracy Score: %.2f' % (score))
    

In [9]:
run_svm(train, test, "label")

KeyError: 'label'

#### Voting Classifier

In [None]:
def run_voting_classifier(train, test, label_column):
    
    target = train[label_column]
    train = train.drop([label_column], axis=1)
    
    ab_params = {'algorithm': 'SAMME.R', 
                 'base_estimator': None, 
                 'learning_rate': 0.1, 
                 'n_estimators': 20, 
                 'random_state': None}
    
    gbc_params = {'criterion': 'friedman_mse', 
                  'init': None, 'learning_rate': 0.1, 
                  'loss': 'deviance', 
                  'max_depth': 30, 
                  'max_features': None, 
                  'max_leaf_nodes': None, 
                  'min_impurity_decrease': 0.0, 
                  'min_impurity_split': None, 
                  'min_samples_leaf': 1, 
                  'min_samples_split': 2, 
                  'min_weight_fraction_leaf': 0.0, 
                  'n_estimators': 100, 
                  'n_iter_no_change': None, 
                  'presort': 'auto', 
                  'random_state': None, 
                  'subsample': 1.0, 
                  'tol': 0.0001, 
                  'validation_fraction': 0.1, 
                  'verbose': 0, 
                  'warm_start': False}
    
    bc_params = {'base_estimator': None, 
                 'bootstrap': True, 
                 'bootstrap_features': False, 
                 'max_features': 10, 
                 'max_samples': 1.0, 
                 'n_estimators': 20, 
                 'n_jobs': None, 
                 'oob_score': False, 
                 'random_state': None, 
                 'verbose': 0, 
                 'warm_start': False}
    
    clf1 = AdaBoostClassifier(**ab_params)
    clf2 = GradientBoostingClassifier(**gbc_params)
    clf3 = BaggingClassifier(**bc_params)
    vote_clf = VotingClassifier(estimators=[('ab', clf1), ('gbc', clf2), ('bc', clf3)], weights=[0.2,1.7,0.6], voting='soft')
    vote_clf = vote_clf.fit(train, target)
    
    pred_train = vote_clf.predict_proba(train)
    pred_cv = cross_val_predict(vote_clf, train, np.ravel(target),
                            method='predict_proba', cv=5, n_jobs=-1)
    pred_test = vote_clf.predict_proba(test)
    
    print("LogLoss on train sample ", log_loss(y_pred=pred_train, y_true=target))
    print("LogLoss on train sample (CV): ", log_loss(y_pred=pred_cv, y_true=target))
    
    return pred_test

In [None]:
pred_test = run_voting_classifier(train, test, "label")

In [None]:
#Gradient Boosting

In [11]:
def run_xgb_classifier(train, test, label_column):
    
    target = train[label_column]
    train = train.drop([label_column], axis=1)

    params = {'objective' : 'multi:softprob', 
              'num_class'  : 3,
              'eval_metric' : 'mlogloss',
              'nthread' : -1, 
              'booster' : "gbtree",
              'gamma' : 0.1, 
              'max_depth' : 5,
              'eta' : 0.1,
              'min_child_weight'  : 0.7
             }

    clf_xgb = XGBClassifier(**params)

    ppl = Pipeline([("clf", clf_xgb)])

    ppl.fit(train, np.ravel(y))

    pred_train = ppl.predict_proba(train)
    pred_cv = cross_val_predict(ppl, train, np.ravel(y),
                                method='predict_proba', cv=5, n_jobs=-1,verbose=1)

    print("LogLoss on train sample:",log_loss(y_pred=pred_train, y_true=y))
    print("LogLoss on train sample (CV):",log_loss(y_pred=pred_cv, y_true=y))
    
    pred_test = ppl.predict_proba(test)
    return pred_test 

In [12]:
run_xgb_classifier(train._get_numeric_data().fillna(train._get_numeric_data().mean()),test._get_numeric_data().fillna(test._get_numeric_data().mean()),'label')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


LogLoss on train sample: 0.860296679305349
LogLoss on train sample (CV): 0.9724305440512326


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.5s finished


array([[0.26313093, 0.39374426, 0.3431248 ],
       [0.33498582, 0.28532153, 0.3796926 ],
       [0.2880523 , 0.60220563, 0.10974209],
       ...,
       [0.3619428 , 0.2921225 , 0.34593472],
       [0.5003393 , 0.3738521 , 0.12580857],
       [0.32796404, 0.2268061 , 0.44522986]], dtype=float32)

In [163]:
from sklearn.ensemble import ExtraTreesClassifier

label_column = 'label'
train_,target = split_target_and_df(train._get_numeric_data().fillna(train._get_numeric_data().mean()),label_column)

model = ExtraTreesClassifier(bootstrap=True , 
                                         criterion="gini", 
                                         min_samples_leaf=10, 
                                         min_samples_split=100, 
                                         n_estimators=300,
                                         random_state = 50,
                                         n_jobs = -1)


cv_scores = cross_val_score(model, train_ , target, cv=5, scoring='neg_log_loss')
print(cv_scores)
print('RF CV mean : %.2f ' % (np.mean(cv_scores)))
print('RF CV std : %.2f ' % (np.std(cv_scores)))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

# Submission

In [None]:
df_submission = pd.DataFrame(pred_test, index=test.index)

In [None]:
df_submission.head()

In [None]:
df_submission.to_csv("submission.csv", index_label="id", header=['0', '1', '2'])