# Import libraries

In [None]:
import pandas as pd 
import numpy as np
from tqdm import tqdm_notebook as tqdm

import matplotlib.pyplot as plt 
import seaborn as sns

from scipy.sparse import csr_matrix

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, Imputer
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score

from sklearn.pipeline import Pipeline

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance

import lightgbm as lgb

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from stop_words import get_stop_words
stop_words_fr = get_stop_words('fr')
stop_words_en = get_stop_words('en')
from nltk.corpus import stopwords

from sklearn.decomposition import NMF
from sklearn.decomposition import TruncatedSVD


import warnings
warnings.filterwarnings('ignore')


In [None]:
#Reduce memory usage 

def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are cast to the narrowest signed /
      unsigned integer type whose range contains the column's min and max
      (bounds are inclusive, so 127 still fits ``int8``).
    * Float columns are cast to ``float16`` / ``float32`` when their range
      allows it, otherwise ``float64``. Note that ``float16`` keeps only about
      three significant decimal digits, so values are rounded.
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched, which also makes the function safe to call twice.

    Memory usage before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast. Bool and
        # unsigned columns must not fall through to the float branch, and
        # category/datetime columns have no meaningful numeric min/max.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
            continue

        candidates = signed_types if col_type.kind == 'i' else unsigned_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            # An empty column yields NaN for min/max; both comparisons are
            # then False and the column keeps its dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# Import Data 

In [None]:
# Load the train/test feature tables and the training target; in each file the
# first CSV column is used as the DataFrame index.
# NOTE(review): `error_bad_lines=False` asks pandas to skip malformed rows instead
# of raising. The keyword was deprecated in pandas 1.3 in favour of
# `on_bad_lines="skip"` — confirm against the pandas version in use.
train = pd.read_csv("X_train.csv", index_col=0, error_bad_lines=False)
test = pd.read_csv("X_test.csv", index_col=0, error_bad_lines=False)
y = pd.read_csv("y_train.csv", index_col=0)

# Feature engineering

In [None]:
def apply_tfidf_vectorizer(df, column):
    """Append bag-of-n-grams indicator columns for a text column.

    Missing values in ``df[column]`` are replaced by ``"missing"`` and the
    column is cast to ``str`` (this modifies ``df`` in place). A
    ``TfidfVectorizer`` restricted to the 50 most frequent word 1- to 3-grams
    (French stop words removed) is fitted on the column. With
    ``binary=True``, ``norm=None`` and ``use_idf=False`` the produced values
    are term-presence indicators rather than weighted TF-IDF scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing ``column``.
    column : str
        Name of the text column to vectorize.

    Returns
    -------
    pandas.DataFrame
        A new frame made of ``df`` plus one ``tfidf_<column>_<term>`` column
        per retained term. The source column is not dropped here.
    """
    df[column] = df[column].fillna("missing").astype(str)
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), stop_words=stop_words_fr,
                                 lowercase=True, max_features=50, binary=True, norm=None,
                                 use_idf=False)
    tfidf = vectorizer.fit_transform(df[column])

    # `get_feature_names` was removed from recent scikit-learn releases in
    # favour of `get_feature_names_out`; support both.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tfidf_cols = vectorizer.get_feature_names_out()
    else:
        tfidf_cols = vectorizer.get_feature_names()

    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and add NaN rows) whenever df is
    # not indexed 0..n-1, e.g. after the index has been converted to strings.
    tmp = pd.DataFrame(data=tfidf.toarray(), index=df.index,
                       columns=['tfidf_' + column + '_' + i for i in tfidf_cols])
    return pd.concat([df, tmp], axis=1)

def count_item_column(df, column_to_count, column_groupby):
    """Add a per-value frequency column for ``column_to_count``.

    For every distinct value of ``column_to_count`` the number of non-null
    entries of ``column_groupby`` is counted, and the result is left-merged
    back onto ``df`` as ``<column_to_count>_COUNT``.

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    count_name = column_to_count + '_COUNT'
    per_value = df.groupby([column_to_count])[column_groupby].count().reset_index()
    per_value.columns = [column_to_count, count_name]
    return df.merge(per_value, on=column_to_count, how='left')

def binarie_fill(df, column):
    """Turn ``df[column]`` into a 0/1 integer flag, in place.

    Missing values are treated as 0. Any value equal to 0 (including
    ``False``) maps to 0 and every other value maps to 1.

    The previous implementation branched on ``True in df[column].tolist()``.
    Because ``1 == True`` in Python, that test also fired for numeric columns
    containing 1, and other non-zero values were then mapped to 0. The result
    for a given value therefore depended on the rest of the column. A single
    rule is now applied; boolean columns give the same output as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column : str
        Name of the column to binarize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    filled = df[column].fillna(0)
    df[column] = np.where(filled == 0, 0, 1)
    return df

def label_encoding(df, columns_to_encode):
    """Add an integer-encoded copy of each listed column.

    Each column is cast to ``str`` and passed through ``LabelEncoder``;
    the codes are stored in a new ``<column>_ENCODED`` column. The encoder is
    re-fitted for every column, and ``df`` is modified in place and returned.
    """
    encoder = LabelEncoder()
    for name in columns_to_encode:
        as_text = df[name].values.astype(str)
        df[name + '_ENCODED'] = encoder.fit_transform(as_text)
    return df

def transform_to_log(df, columns_to_log):
    """Replace each listed column by ``log(1 + value)``.

    For every name in ``columns_to_log`` a ``log_<name>`` column is added and
    the original column is removed. ``df`` is modified in place and returned.
    """
    for name in columns_to_log:
        shifted = 1 + df[name]
        df['log_' + name] = shifted.apply(np.log)
        # The raw column is dropped once its log version exists.
        df.drop(columns=name, inplace=True)
    return df

def get_len_columns(df, len_columns):
    """Add a ``len_<name>`` column holding the string length of each listed column.

    Lengths come from the pandas ``.str.len()`` accessor, so missing values
    stay missing. ``df`` is modified in place and returned.
    """
    for name in len_columns:
        lengths = df[name].str.len()
        df["len_" + name] = lengths
    return df

def tfidf_nmf_svd(df,text_columns):
    """Derive count, TF-IDF, NMF and truncated-SVD features from text columns.

    For every column in ``text_columns`` the function:

    1. fits a ``CountVectorizer`` and stores per-row sum / mean / number of
       non-zero entries in ``cvec_sum``, ``cvec_mean``, ``cvec_len``;
    2. fits a ``TfidfVectorizer`` and stores the same three statistics in
       ``tfidf_sum``, ``tfidf_mean``, ``tfidf_len``;
    3. projects the TF-IDF matrix onto 20 NMF components
       (``<col>_nmf_<i>``) and 20 truncated-SVD components (``<col>_svd_<i>``);
    4. drops the source text column.

    ``df`` must contain an ``id`` column: it is used as the join key when the
    component frames are concatenated.

    NOTE(review): the six ``cvec_*`` / ``tfidf_*`` column names do not include
    the text column name, so with more than one entry in ``text_columns`` each
    iteration overwrites the statistics of the previous one.

    Returns the enriched DataFrame (a new object once at least one column has
    been processed).
    """
    for col_ in tqdm(text_columns):
        print(col_)
        text = df[col_].values.tolist()
        print('[INFO] Start count vectorize')
        cvec = CountVectorizer(min_df=2, ngram_range=(1, 3), max_features=1000,
                               strip_accents='unicode',
                               lowercase=True, analyzer='word', token_pattern=r'\w+',
                               stop_words=stop_words_fr)

        cvec.fit(text)
        X = cvec.transform(text)
        # Row-wise statistics of the count matrix (names are shared across text columns).
        df['cvec_sum'] = X.sum(axis=1)
        df['cvec_mean'] = X.mean(axis=1)
        df['cvec_len'] = (X != 0).sum(axis=1)

        print('[INFO] Start TFDIDF')
        tfv = TfidfVectorizer(min_df=2, max_features=1000,
                              strip_accents='unicode', analyzer='word',
                              ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
                              stop_words=stop_words_fr)

        # Fit TFIDF
        X = tfv.fit_transform(text)
        # Same row-wise statistics, now on the TF-IDF matrix.
        df['tfidf_sum'] = X.sum(axis=1)
        df['tfidf_mean'] = X.mean(axis=1)
        df['tfidf_len'] = (X != 0).sum(axis=1)
        # Number of latent components used for both NMF and SVD below.
        n_components = 20

        print('[INFO] Start NMF')

        nmf_ = NMF(n_components=n_components)
        X_nmf = nmf_.fit_transform(X)
        X_nmf = pd.DataFrame(X_nmf, columns=['{}_nmf_{}'.format(col_, i) for i in range(n_components)])
        # Attach the ids positionally, then join the components to df on 'id'.
        X_nmf['id'] = df.id.values.tolist()
        df = pd.concat([df.set_index('id'), X_nmf.set_index('id')], sort=False, axis=1).reset_index()
        # Make sure the column restored by reset_index is named 'id' again.
        df.rename(columns={df.columns[0]: 'id'}, inplace=True)

        print('[INFO] Start SVD')
        svd = TruncatedSVD(n_components=n_components)
        svd.fit(X)
        print('fit done')
        X_svd = svd.transform(X)
        X_svd = pd.DataFrame(X_svd, columns=['{}_svd_{}'.format(col_, i) for i in range(n_components)])
        # Same id-based join as for the NMF components.
        X_svd['id'] = df.id.values.tolist()
        df = pd.concat([df.set_index('id'), X_svd.set_index('id')], sort=False, axis=1).reset_index()
        df.rename(columns={df.columns[0]: 'id'}, inplace=True)
        # The raw text is no longer needed once its features exist.
        df.drop(col_, axis=1, inplace=True)
    return df

In [None]:
def features_engineering(df, columns_to_drop, column_to_encode, column_to_vectorize,
                         binary_column, text_columns, count_columns, columns_to_log, len_columns):
    """Run the whole feature-engineering pipeline on one DataFrame.

    Steps, in order:

    1. move the index into a column (renamed ``id`` when it comes out as
       ``index``) and convert the new row labels to strings;
    2. drop ``columns_to_drop``;
    3. add string-length columns for ``len_columns``;
    4. fill missing values of ``text_columns`` with ``'missing'``;
    5. label-encode ``column_to_encode``;
    6. add frequency columns for ``count_columns`` (counted over ``id``);
    7. vectorize each column of ``column_to_vectorize`` that is present, then
       drop the source column;
    8. binarize ``binary_column``;
    9. replace ``columns_to_log`` by their ``log(1 + x)`` version;
    10. downcast dtypes with ``reduce_mem_usage``.

    Returns the transformed DataFrame. The ``id`` column is kept unless it is
    listed in ``columns_to_drop``.
    """
    # Expose the former index as a regular column; row labels become strings.
    df = df.reset_index()
    df = df.rename(index=str, columns={"index": "id"})

    # --- Drop irrelevant columns ---
    df.drop(columns_to_drop, axis=1, inplace=True)

    # --- Text features ---
    df = get_len_columns(df, len_columns)
    df[text_columns] = df[text_columns].fillna('missing')
    df = label_encoding(df, column_to_encode)

    for counted in count_columns:
        df = count_item_column(df, counted, 'id')

    for text_col in column_to_vectorize:
        # Columns that are absent (e.g. already dropped) are skipped silently.
        if text_col not in df.columns:
            continue
        df = apply_tfidf_vectorizer(df, text_col)
        df.drop(text_col, inplace=True, axis=1)

    for flag_col in binary_column:
        df = binarie_fill(df, flag_col)

    # --- Numerical features ---
    df = transform_to_log(df, columns_to_log)

    return reduce_mem_usage(df)

In [None]:
# Configuration for features_engineering(); every list holds column names and
# is empty by default, which turns the corresponding step into a no-op.
columns_to_drop =  []      # columns removed right after the index is reset
column_to_encode = []      # columns that get a label-encoded "<col>_ENCODED" copy
column_to_vectorize = []   # text columns expanded into "tfidf_<col>_<term>" columns, then dropped
binary_column = []         # columns converted to 0/1 flags
text_columns = []          # columns whose missing values are filled with 'missing'
count_columns = []         # columns that get a "<col>_COUNT" frequency column
columns_to_log = []        # numeric columns replaced by "log_<col>" = log(1 + x)
len_columns = []           # text columns that get a "len_<col>" length column

In [None]:
# Apply the same feature-engineering configuration to train and test.
# NOTE(review): the two frames are processed independently, so the label
# encoders and vectorizers are fitted separately on each one; encoded values
# and generated tfidf_* columns are therefore not guaranteed to match between
# train and test.
train = features_engineering(train, columns_to_drop, column_to_encode, column_to_vectorize, binary_column, 
                             text_columns, count_columns, columns_to_log, len_columns)

test = features_engineering(test, columns_to_drop, column_to_encode, column_to_vectorize, binary_column, 
                            text_columns, count_columns, columns_to_log, len_columns)

# Algorithms

#### Random Forest

In [None]:
# Baseline: median imputation followed by a RandomForestClassifier with default settings.
clf_rf = RandomForestClassifier()

ppl = Pipeline([("imputer", Imputer(strategy='median')),
                ("clf", clf_rf)])

# Fit on the full training set; np.ravel flattens the one-column target frame.
ppl.fit(train, np.ravel(y))

# In-sample probabilities (scored on the data the model was fitted on) ...
pred_train = ppl.predict_proba(train)
# ... versus probabilities obtained through 5-fold cross-validation.
pred_cv = cross_val_predict(ppl, train, np.ravel(y),
                            method='predict_proba', cv=5, n_jobs=-1)

print("LogLoss on train sample:",log_loss(y_pred=pred_train, y_true=y))
print("LogLoss on train sample (CV):",log_loss(y_pred=pred_cv, y_true=y))

In [None]:
# Plot the 20 largest feature importances of the forest fitted inside the pipeline above.
features_importances = pd.Series(clf_rf.feature_importances_, index=train.columns)
features_importances.nlargest(20).plot(kind='barh')

In [None]:
# Class probabilities for the test set from the random-forest pipeline.
pred_test = ppl.predict_proba(test)

#### XGBoost

In [None]:
# XGBoost configuration: 3-class soft-probability objective evaluated with multi-class log-loss.
params = {'objective' : 'multi:softprob', 
          'num_class'  : 3,
          'eval_metric' : 'mlogloss',
          'nthread' : -1, 
          'booster' : "gbtree",
          'gamma' : 0.01, 
          'max_depth' : 7,
          'eta' : 0.1,
          'min_child_weight'  : 0.7
         }

clf_xgb = XGBClassifier(**params)

# Single-step pipeline (no imputer here, unlike the random-forest and LightGBM pipelines).
ppl = Pipeline([("clf", clf_xgb)])

ppl.fit(train, np.ravel(y))

# In-sample probabilities versus 5-fold cross-validated probabilities.
pred_train = ppl.predict_proba(train)
pred_cv = cross_val_predict(ppl, train, np.ravel(y),
                            method='predict_proba', cv=5, n_jobs=-1)

print("LogLoss on train sample:",log_loss(y_pred=pred_train, y_true=y))
print("LogLoss on train sample (CV):",log_loss(y_pred=pred_cv, y_true=y))

In [None]:
# NOTE(review): d_train and d_test are built here but not used by any code visible
# in this notebook.
d_train = xgb.DMatrix(csr_matrix(train), label=y, feature_names=train.columns.values)
d_test = xgb.DMatrix(csr_matrix(test))
# Fits clf_xgb again on the full training set (it is the same estimator object
# already fitted through the pipeline above).
clf = clf_xgb.fit(train, np.ravel(y))

# Plot the 50 most important features.
fig, ax = plt.subplots(figsize=(20, 20))
plot_importance(clf, max_num_features=50, ax=ax)

In [None]:
# Class probabilities for the test set from the XGBoost pipeline (overwrites the previous pred_test).
pred_test = ppl.predict_proba(test)

#### Light GBM 

In [None]:
# LightGBM configuration for a 3-class problem evaluated with multi-class log-loss.
# NOTE(review): 'lambda_l1' / 'reg_alpha' and 'lambda_l2' / 'reg_lambda' look like
# aliases of the same LightGBM regularisation parameters but are given different
# values here (0 vs 1.2) — confirm which one is intended and remove the other.
params = {
    'metric' : 'multi_logloss',
    'objective':'multiclass',
    'boosting': 'gbdt', 
    'num_class' : 3,
    'subsample': 1, 
    'colsample_bytree': 0.9, 
    'min_split_gain': 0.4, 
    'min_child_weight': 1, 
    'min_child_samples': 5,
    'max_bin': 300, 
    'num_iterations': 90,
    'learning_rate': 0.15,
    'subsample_for_bin': 200, 
    'lambda_l1': 0, 
    'lambda_l2': 0, 
    'num_leaves': 80,
    'max_depth': 25, 
    'reg_alpha' : 1.2,
    'reg_lambda' : 1.2,
}

clf_lgb = lgb.LGBMClassifier(**params)

# Median imputation followed by the LightGBM classifier.
ppl = Pipeline([("imputer", Imputer(strategy='median')),
                ("clf", clf_lgb)])

ppl.fit(train, np.ravel(y))

# In-sample probabilities versus 5-fold cross-validated probabilities.
pred_train = ppl.predict_proba(train)
pred_cv = cross_val_predict(ppl, train, np.ravel(y),
                            method='predict_proba', cv=5, n_jobs=-1)

print("LogLoss on train sample:",log_loss(y_pred=pred_train, y_true=y))
print("LogLoss on train sample (CV):",log_loss(y_pred=pred_cv, y_true=y))

In [None]:
# Class probabilities for the test set from the LightGBM pipeline (overwrites the previous pred_test).
pred_test = ppl.predict_proba(test)

# Submission

In [None]:
# Wrap the most recently computed pred_test (whichever model cell ran last) in a DataFrame.
# NOTE(review): test.index holds the row labels produced by features_engineering,
# not the original CSV index (that one was moved to the 'id' column) — confirm
# these are the ids expected in the submission file.
df_submission = pd.DataFrame(pred_test, index=test.index)

In [None]:
# Write the submission file: an "id" index column followed by one probability column per class.
df_submission.to_csv("submission.csv", index_label="id", header=['0', '1', '2'])