# MULTI-CLASS BASELINE

In [1]:
import time
import json
import pickle
import string
import random
import itertools
import numpy as np
import pandas as pd
from copy import deepcopy
from random import randint
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.base import TransformerMixin
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from bbclf.bb_classifier import BinaryBiasClassifier
from collections import defaultdict, Counter

import matplotlib.pyplot as plt
%matplotlib inline
#pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_columns', 500)
%load_ext autoreload
%autoreload 2

# 1. Helper functions

In [2]:
class DenseTransformer(TransformerMixin):
    '''
        Pipeline step that turns a sparse feature matrix into a dense array,
        for estimators that cannot consume sparse input.

        To use in Pipeline():
        model = Pipeline( steps=[('vect', vectorizer), ('to_dense', DenseTransformer()), ('lda', lda)] )
        model.fit( X_train, y_train )
    '''

    def fit(self, X, y=None, **fit_params):
        '''Nothing is learned; returns self so the step can be chained.'''
        return self

    def transform(self, X, y=None, **fit_params):
        '''
            Return X.toarray() when X offers that method; any other input
            is handed back unchanged instead of raising AttributeError.
        '''
        if hasattr(X, 'toarray'):
            return X.toarray()
        return X

In [3]:
def plot_confusion_matrix( cm, classes, title='Confusion matrix', figsize=(5,5),
                           cmap=plt.cm.PuBu ):   # originally plt.cm.Blues; also good: BuPu,RdPu,PuRd,OrRd,Oranges
    """
    Plot the confusion matrix as an annotated heat map.

    cm      : 2-D array of cell values (rows are labelled 'True labels',
              columns 'Predicted labels')
    classes : tick labels, one per row/column
    title   : figure title
    figsize : figure size passed to plt.figure
    cmap    : colormap passed to plt.imshow

    Shows the figure with plt.show(); returns nothing.
    """
    cm = np.asarray(cm)

    # x tick labels go on top of the matrix; the override is scoped to this
    # figure so that plt.rcParams is left untouched for later plots
    tick_rc = {
        'xtick.bottom': False, 'xtick.labelbottom': False,
        'xtick.top': True,     'xtick.labeltop': True,
    }

    with plt.rc_context(tick_rc):
        plt.figure(figsize=figsize)
        im = plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar(im, fraction=0.046, pad=0.05)
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=90)
        plt.yticks(tick_marks, classes)

        # integer counts keep the 'd' format; anything else (e.g. a normalized
        # matrix) would make format(..., 'd') raise, so use two decimals there
        fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            # white text on dark cells, black text on light cells
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.ylabel('True labels')
        plt.xlabel('Predicted labels')
        plt.tight_layout()
        plt.show()

In [4]:
def upsample( df_, to_oversample_, random_state_=None ):
    '''
        Return df_ plus to_oversample_ extra rows drawn at random from df_,
        shuffled and re-indexed from 0.

        Rows are drawn with replacement only when more rows are requested
        than df_ contains. random_state_ seeds both the draw and the shuffle.
    '''
    needs_replacement = bool( to_oversample_ > len(df_) )
    extra_rows = df_.sample( n=to_oversample_, replace=needs_replacement, random_state=random_state_ )

    combined = pd.concat([ df_, extra_rows ])
    shuffled = combined.sample(frac=1, random_state=random_state_)
    return shuffled.reset_index(drop=True)

In [5]:
def upsample_all( df_, random_state, target_col='target' ):
    '''
        Upsample each class in df_ to the number of data points
        in the majority class.

        df_          : DataFrame holding the class label in column target_col
        random_state : seed used for every sampling / shuffling step
        target_col   : name of the label column (default 'target')

        Returns a new, shuffled DataFrame with a fresh 0..n-1 index in which
        every class has as many rows as the largest class. An empty df_ is
        returned as an empty, re-indexed copy.
    '''
    if df_.empty:
        return df_.reset_index(drop=True)

    # one sub-dataframe per class, in order of first appearance
    # (the order matters for reproducibility of the final shuffle)
    labels  = df_[target_col].unique()
    dframes = { label: df_[ df_[target_col] == label ].copy() for label in labels }
    max_len = max( len(frame) for frame in dframes.values() )

    balanced = []
    for frame in dframes.values():
        deficit = max_len - len(frame)                       # how many rows this class is missing
        if deficit > 0 and len(frame) > 0:
            # draw without replacement while the class has enough rows to cover the deficit
            extra = frame.sample( n=deficit, replace=len(frame) < deficit, random_state=random_state )
            frame = pd.concat( [frame, extra] )
        # a class already at max_len (deficit == 0) is only shuffled
        balanced.append( frame.sample( frac=1, random_state=random_state ) )

    # combine and reshuffle
    df_merged = pd.concat( balanced )
    df_merged = df_merged.sample( frac=1, random_state=random_state ).reset_index(drop=True)

    return df_merged

## 2. Load, Deduplicate, Upsample Data

In [16]:
# LOAD MAIN DATASET
df = pd.read_pickle('./data/file_name.pkl')
print(df.shape)
df = df[ df['label'].isin(ml_categories) ]
print(df.shape)

(10932, 15)
(10786, 15)


In [6]:
random_state  = 47
label_to_key = {
     'class0': 0,
     'class1': 1,
     'class2': 2,
     'class3': 3,
     'class4': 4,
     'class5': 5,
}
ml_categories = list( label_to_key.keys() )
print(len(label_to_key))
print(len(ml_categories))

key_to_label = {v: k for k,v in label_to_key.items()}

15
15


In [21]:
# GET TARGET FOR ENTIRE CONCATENATED DF (NOT BEFORE!)
df['target'] = df['label'].map( label_to_key )
df['target'] = df['target'].astype(int)
print(df['target'].value_counts())

0     4609
6     1994
14    1110
12     996
10     620
9      412
7      314
4      281
11     278
2      252
8      208
5      160
3      150
13     131
1      109
Name: target, dtype: int64


In [None]:
# DEDUPLICATE AT THE SENTENCE-LABEL LEVEL
temp = df[ df.duplicated(subset=['sentence', 'label'], keep=False) ].sort_values(by='sentence')
print(temp.shape)
temp[['sentence', 'label',]]

In [33]:
df = df.drop_duplicates(subset=['sentence', 'label'], keep='first').reset_index(drop=True)
df.shape

(11499, 18)

In [8]:
df_train = df[ df['subset'] == 'train' ].copy()
df_test  = df[ df['subset'] == 'test' ].copy()
print('Before upsampling:\n', df_train['target'].value_counts(), sep='')

Before upsampling:
0     3592
6     1603
14     914
12     804
10     506
9      346
7      261
4      230
11     230
2      217
8      177
5      131
3      128
13     109
1       96
Name: target, dtype: int64


In [9]:
#### UPSAMPLE
df_train = upsample_all( df_train, random_state )
print('After upsampling:\n', df_train['target'].value_counts(), sep='')

After upsampling:
14    3592
13    3592
12    3592
11    3592
10    3592
9     3592
8     3592
7     3592
6     3592
5     3592
4     3592
3     3592
2     3592
1     3592
0     3592
Name: target, dtype: int64


## 3. Get X, y, and stopwords

In [22]:
# LOAD STOPWORDS: ONE ENTRY PER LINE, SURROUNDING WHITESPACE REMOVED
file = 'data/stopwords_no_lemmas.txt'
with open(file) as f:
    sw = [line.strip() for line in f]
print(len(sw))
print(sw)

38
['a', 'also', 'an', 'and', 'as', 'at', 'be', 'been', 'being', 'am', 'are', 'is', 'was', 'were', 'but', 'by', 'for', 'from', 'have', 'having', 'has', 'had', 'in', 'it', 'much', 'of', 'on', 'one', 'thank', 'thanks', 'that', 'those', 'the', 'this', 'these', 'to', 'very', 'will']


In [41]:
feature_col = 'sentence'
X_train = df_train[feature_col].values
y_train = df_train['target'].values
X_test  = df_test[feature_col].values
y_test  = df_test['target'].values

X_train, y_train = sklearn.utils.shuffle(X_train, y_train, random_state=random_state)
X_test, y_test   = sklearn.utils.shuffle(X_test, y_test, random_state=random_state)
print('Shape of datasets:', X_train.shape, X_test.shape, y_train.shape, y_test.shape )

unique, counts = np.unique(y_train, return_counts=True)
print('Labels in train set:  ', dict(zip(unique, counts)))
unique, counts = np.unique(y_test, return_counts=True)
print('Labels in test set:   ', dict(zip(unique, counts)))

Shape of datasets: (53880,) (2155,) (53880,) (2155,)
Labels in train set:   {0: 3592, 1: 3592, 2: 3592, 3: 3592, 4: 3592, 5: 3592, 6: 3592, 7: 3592, 8: 3592, 9: 3592, 10: 3592, 11: 3592, 12: 3592, 13: 3592, 14: 3592}
Labels in test set:    {0: 918, 1: 13, 2: 33, 3: 22, 4: 49, 5: 28, 6: 386, 7: 52, 8: 30, 9: 63, 10: 110, 11: 45, 12: 189, 13: 21, 14: 196}


In [39]:
# GET STOPWORDS
file = 'data/stopwords_no_lemmas.txt'
with open(file) as f:
    sw = f.readlines()
sw = [i.strip() for i in sw]
print(len(sw))

38


In [40]:
sw_orig  = ['a', 'an', 'the', 'thank',]
sw_prec  = [ 'achieve', 'one', 'do', 'success', 'team', 'continue', 'hard', 'beyond', 'focus', 'deliver',
             'project', 'much', 'and', 'thank', 'other', 'like', 'happy', 'make', 'say', 'want', 'of',
             'role', 'good', 'with', 'we', 'month', 'an', "'s", 'some', 'very', 'what', 'can', 'sure',
             'she', 'which', 'a', 'everything', 'again', 'you', 'move', 'your', 'change', 'at', 'ensure',
             'effort', 'best', 'by', 'take', 'the', 'get', 'work', 'lot', 'his', 'proud', 'for', ]     # all
sw_prec2 = [ 'a', 'an', 'the', 'thank', "'s", 'at', 'team', 'month', 'hard', 'work', 'your', 'can',
             'for', 'make', 'best', 'continue', 'of', 'very', 'what', 'we', 'one', 'she', 'want', ]    # short
all_15   = [ 'a', 'an', 'the', 'thank', 'you', 'with', 'to', 'time', 'this', 'that', 'take', 'person',
              'our', 'on', 'not', 'it', 'in', 'have', 'great', 'good', 'ellipsis', 'do', 'be', 'as', 'and', 'always',]

# 4. XGBoost classifier

In [42]:
random_state_xgb = 47
sw2 = [ 'a','an','the','of','thank','you' ]

clf_params_xgb = {
    'n_estimators': 150,
    'max_depth': None,
    'learning_rate': 0.3,                                # eta
    'objective': 'multi:softmax',                        # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'mlogloss',                           # multiclass - merror, mlogloss
    'base_score': 0.25,
    'booster': 'gbtree',                                 # gbtree, dart
    'tree_method': 'approx',                             # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                           # default“gain”,“weight”,“cover”,“total_gain”,“total_cover”
    'gamma': 0.25,                                       # larger - more conservative, [0, inf], default 0
    'reg_alpha': 0.5,                                    # L1 reg., larger - more conservative, default 0
    'reg_lambda': 1,                                     # L2 rreg., larger - more conservative, default 1
    'sampling_method': 'uniform',                        # uniform, gradient_based
    'max_delta_step': 1,                                 # 1-10
    'min_child_weight': 1,
    'subsample': 0.75,                                   # 0-1  (lower values prevent overfitting)    
    'colsample_bylevel': 0.8,                            # 0-1
    'colsample_bynode': 0.75,                            # optimized for higher recall
    'colsample_bytree': 0.75,                            # 0-1  
    'seed': 5,
    'num_class': 15,
    'use_label_encoder': False,
    'random_state': random_state_xgb,
    'n_jobs': -1,    
}

vect_params_xgb = {
    'max_df': 1.0,                             
    'min_df': 5,    
    'analyzer': 'char',
    'ngram_range': (1,8), 
    'binary': True,
    'stop_words': None,
}

#vectorizer = TfidfVectorizer( **vect_params_xgb )
vectorizer = CountVectorizer( **vect_params_xgb )

clf = XGBClassifier( **clf_params_xgb )

In [None]:
# FIT AND TEST MODEL
model_xgb = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )
model_xgb.fit( X_train, y_train )
y_pred = model_xgb.predict( X_test )

In [None]:
# CLASSIFICATION REPORT
# Class ids are passed explicitly so that target_names always lines up with them,
# even when a class is absent from both y_test and y_pred.
class_ids = sorted(key_to_label)
labels = [key_to_label[k] for k in class_ids]
print( classification_report( y_test, y_pred, labels=class_ids, target_names=labels) )

In [None]:
# CONFUSION MATRIX
# The class order is pinned so the matrix has one row/column per known class
# and matches the tick labels even if a class never occurs in y_test / y_pred.
class_ids = sorted(key_to_label)
cm = confusion_matrix( y_test, y_pred, labels=class_ids )
plot_confusion_matrix( cm, [key_to_label[k] for k in class_ids], figsize=(10,10), )

In [44]:
file = 'model_binaries/20220122_xgboost_char_071_061.pkl'
with open(file,'wb') as f:
    pickle.dump(model_xgb, f)

# 5. Other classifiers

## SVM

In [17]:
sw = [ 'of', 'a','an','the', 'thank' ]

# VECTORIZER PARAMETERS
vect_params = {
    'max_df': 1.0,
    'min_df': 2,    
    'analyzer': 'word',
    'ngram_range': (1,2),
    'binary': False,
    'stop_words': None,#sw,
}

# CLASSIFIER PARAMETERS
clf_params = {
    
    'C': 1.0,                      # default=1.0
    'kernel': 'linear',            # default=’rbf’, {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}
    'degree': 3,                   # default=3, degree for polynomial f(x)
    'tol': 1e-3,                   # stopping criteria, default=1e-3
    'gamma': 'scale',              # default=’scale’, kernel coeff for ‘rbf’, ‘poly’ and ‘sigmoid’
                                   # 'scale' => 1 / (n_features * X.var()), ‘auto’ => 1 / n_features
    'coef0': 0.0,                  # default=0.0, independent term in kernel function in ‘poly’ and ‘sigmoid’
    'shrinking': True,             # default=True'
    'cache_size': 200,             # default=200,   size of the kernel cache (in MB)
    'decision_function_shape': 'ovr',    # default=’ovr’, {‘ovo’, ‘ovr’}, multiclass => always 'ovo'
    'break_ties': False,           # default=False, for decision_function_shape='ovr' and num classes>2 (longer)
    'max_iter': -1,                # default=-1,    limit on iterations
    'class_weight': None,          # default=None,  dict or ‘balanced'
    'verbose': 0,
    'random_state': random_state,
}

vectorizer = TfidfVectorizer( **vect_params )
#vectorizer = CountVectorizer( **vect_params )

clf = SVC( **clf_params )

In [None]:
# FIT AND TEST MODEL
model_svm = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )
model_svm.fit( X_train, y_train )

y_pred = model_svm.predict( X_test )
# class ids are passed explicitly so target_names always lines up with them
class_ids = sorted(key_to_label)
labels = [key_to_label[k] for k in class_ids]
print( classification_report( y_test, y_pred, labels=class_ids, target_names=labels) )

In [None]:
# pin the class order so the matrix always matches the tick labels
class_ids = sorted(key_to_label)
cm = confusion_matrix( y_test, y_pred, labels=class_ids )
plot_confusion_matrix( cm, [key_to_label[k] for k in class_ids], figsize=(10,10), )

## LOGISTIC REGRESSION

In [13]:
# sw, sw_prec, sw_prec2, all_15
sw2 = [ 'of', 'a','an','the', 'thank' ]

# VECTORIZER PARAMETERS
vect_params = {
    'max_df': 1.0,
    'min_df': 2,
    'analyzer': 'char',
    'ngram_range': (1,8),
    'binary': False,
    'stop_words': None,             # was sw2: stop words only apply with analyzer='word', so the
                                    # list was ignored (with a warning) for character n-grams
}

# CLASSIFIER PARAMETERS
# (previously stored under the copy-pasted name clf_params_rf, which the
#  random-forest section re-uses for its own, unrelated parameters)
clf_params_lr = {

    'solver': 'liblinear',          # default='lbfgs' {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'},
                                    # small dataset => 'liblinear', big dataset => 'sag' and 'saga' (faster);
                                    # multiclass => 'newton-cg', 'sag', 'saga', 'lbfgs'; 'liblinear' only for ovr
                                    # supported penalties by solver: 'newton-cg', 'lbfgs', 'sag' - ['l2', 'none'],
                                    # 'liblinear' - ['l1', 'l2'],
                                    # 'saga' - ['elasticnet', 'l1', 'l2', 'none']

    'penalty': 'l2',                # 'l1', 'l2', 'elasticnet' (both), 'none', default='l2' (not for all solvers)
    'class_weight': None,           # default=None, dict or 'balanced'

    'multi_class': 'auto',          # default='auto', {'auto', 'ovr', 'multinomial'},
                                    # 'ovr' => binary problem fit for each label
                                    # 'multinomial' => multinomial loss fit across the entire prob. distribution
                                    # 'auto' selects 'ovr' for binary classification or solver='liblinear',
                                    # otherwise 'multinomial'

    'max_iter': 500,                # default=100, iterations for solvers to converge
    'C': 1.0,                       # default 1.0, inverse regularization strength, smaller => stronger regularization
    'dual': False,                  # default=False (dual formulation only for l2 with the liblinear solver)
                                    # prefer dual=False when n_samples > n_features
    'tol': 1e-4,                    # stopping criteria, default=1e-4
    'fit_intercept': True,          # default True; whether a bias / intercept is added to the decision function
    'intercept_scaling': 1,         # default=1, for solver 'liblinear' and fit_intercept=True (additional term)
    'l1_ratio': None,               # default=None, elastic-net mixing param, [0,1],
                                    # only for penalty='elasticnet'. l1_ratio=0 => penalty='l2',
                                    # l1_ratio=1 => penalty='l1', combination of L1 and L2 if in between
    'verbose': 0,
    'warm_start': False,
    'random_state': random_state,
    'n_jobs': -1,
}

#vectorizer = TfidfVectorizer( **vect_params )
vectorizer = CountVectorizer( **vect_params )

clf = LogisticRegression( **clf_params_lr )

In [None]:
# FIT AND TEST MODEL
model_lr = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )
model_lr.fit( X_train, y_train )

# testset 1
y_pred = model_lr.predict( X_test )
# class ids are passed explicitly so target_names always lines up with them
class_ids = sorted(key_to_label)
labels = [key_to_label[k] for k in class_ids]
print( classification_report( y_test, y_pred, labels=class_ids, target_names=labels) )

In [None]:
# pin the class order so the matrix always matches the tick labels
class_ids = sorted(key_to_label)
cm = confusion_matrix( y_test, y_pred, labels=class_ids )
plot_confusion_matrix( cm, [key_to_label[k] for k in class_ids], figsize=(10,10), )

In [16]:
file = 'model_binaries/20220122_lr_char_15way_071_060.pkl'
with open(file,'wb') as f:
    pickle.dump(model_lr, f)

## Voting classifier

In [85]:
random_state = 47 
clf_params_xgb = {
    'n_estimators': 150,
    'max_depth': None,
    'learning_rate': 0.3,                                # eta
    'objective': 'multi:softmax',                        # multi:softmax, multi:softprob, rank:pairwise
    'eval_metric': 'mlogloss',                           # multiclass - merror, mlogloss
    'base_score': 0.25,
    'booster': 'gbtree',                                 # gbtree, dart
    'tree_method': 'approx',                             # auto, exact, approx, hist and gpu_hist
    'importance_type': 'gain',                           # default“gain”,“weight”,“cover”,“total_gain”,“total_cover”
    'gamma': 0.25,                                       # larger - more conservative, [0, inf], default 0
    'reg_alpha': 0.5,                                    # L1 reg., larger - more conservative, default 0
    'reg_lambda': 1,                                     # L2 rreg., larger - more conservative, default 1
    'sampling_method': 'uniform',                        # uniform, gradient_based
    'max_delta_step': 1,                                 # 1-10
    'min_child_weight': 1,
    'subsample': 0.75,                                   # 0-1  (lower values prevent overfitting)    
    'colsample_bylevel': 0.8,                            # 0-1
    'colsample_bynode': 0.75,                            # optimized for higher recall
    'colsample_bytree': 0.75,                            # 0-1  
    'seed': 5,
    'num_class': 15,
    'use_label_encoder': False,
    'random_state': random_state,
    'n_jobs': -1,    
}

vect_params_xgb = {
    'max_df': 1.0,                             
    'min_df': 5,    
    'analyzer': 'char',
    'ngram_range': (1,8), 
    'binary': True,
    'stop_words': None,
}

In [86]:
# 'sentence', CountVectorizer()
random_state    = 47
vect_params_lr1 = {
                    'max_df': 1.0,
                    'min_df': 2,    
                    'analyzer': 'word',
                    'ngram_range': (1,2),
                    'binary': True,
                    'stop_words': None,
}
clf_params_lr1 = {
                    'solver': 'liblinear',
                    'penalty': 'l1',        
                    'max_iter': 500,
                    'random_state': random_state,
                    'n_jobs': -1,
}
vect_params_lr2 = {
                    'max_df': 1.0,
                    'min_df': 2,    
                    'analyzer': 'char',
                    'ngram_range': (1,8),
                    'binary': False,
                    'stop_words': None,
}
clf_params_lr2 = {
                    'solver': 'liblinear',
                    'penalty': 'l2',        
                    'max_iter': 500,
                    'random_state': random_state,
                    'n_jobs': -1,
}

In [87]:
# 'sentence', TfidfVectorizer()
vect_params_svm = {
                    'max_df': 1.0,
                    'min_df': 2,    
                    'analyzer': 'word',
                    'ngram_range': (1,2),
                    'binary': False,
                    'stop_words': None,
}

# CLASSIFIER PARAMETERS
clf_params_svm = {    
                    'kernel': 'linear',
                    'random_state': random_state,
                    'probability': True,
}

In [93]:
# BUILD EACH CLASSIFIER
lr   = Pipeline( steps=[ ('vect', CountVectorizer( **vect_params_lr1 )),
                         ('clf',  LogisticRegression( **clf_params_lr1 )) ] )

lr2  = Pipeline( steps=[ ('vect', CountVectorizer( **vect_params_lr2 )),
                         ('clf',  LogisticRegression( **clf_params_lr2 )) ] )

svm  = Pipeline( steps=[ ('vect', TfidfVectorizer( **vect_params_svm )),
                         ('clf',  SVC( **clf_params_svm )) ] )

xgb  = Pipeline( steps=[ ('vect', CountVectorizer( **vect_params_xgb )),
                         ('clf',  XGBClassifier( **clf_params_xgb )) ] )

estimators = [ ('lr', lr), ('lr2', lr2), ('svm', svm), ('xgb', xgb), ]
weights    = [ 0.25, 0.25, 0.25, 0.25 ]

In [94]:
# BUILD VOTING CLASSIFIER ON TOP OF BASE CLASSIFIERS
model_vc = VotingClassifier( estimators        = estimators,
                             weights           = weights,
                             voting            = 'soft',              # soft, hard
                             flatten_transform = True,     
                             n_jobs            = -1,
                           )
model_vc.fit( X_train, y_train )

VotingClassifier(estimators=[('lr',
                              Pipeline(steps=[('vect',
                                               CountVectorizer(binary=True,
                                                               min_df=2,
                                                               ngram_range=(1,
                                                                            2))),
                                              ('clf',
                                               LogisticRegression(max_iter=500,
                                                                  n_jobs=-1,
                                                                  penalty='l1',
                                                                  random_state=47,
                                                                  solver='liblinear'))])),
                             ('lr2',
                              Pipeline(steps=[('vect',
                                          

In [None]:
y_pred = model_vc.predict( X_test )
# class ids are passed explicitly so target_names always lines up with them
class_ids = sorted(key_to_label)
labels = [key_to_label[k] for k in class_ids]
print( classification_report( y_test, y_pred, labels=class_ids, target_names=labels, digits=4) )

In [None]:
# pin the class order so the matrix always matches the tick labels
class_ids = sorted(key_to_label)
cm = confusion_matrix( y_test, y_pred, labels=class_ids )
plot_confusion_matrix( cm, [key_to_label[k] for k in class_ids], figsize=(10,10), )

In [92]:
file = 'model_binaries/20220125_voting_classifier_4_models_073_062.pkl'
with open(file,'wb') as f:
    pickle.dump(model_vc, f)

## Stacking classifier

In [114]:
# estimators = [ ('lr', lr), ('lr2', lr2), ('svm', svm), ('xgb', xgb), ]
estimators      = [ ('lr', lr), ('svm', svm) ]

# Exactly one final_estimator must be active, otherwise the StackingClassifier
# cell below fails with a NameError on a fresh kernel. The option with the best
# scores recorded in the trailing comments is enabled; swap as needed.
final_estimator = LogisticRegression(max_iter=1000)        # 0.69/0.54
#final_estimator = SVC()                                    # 0.68, 0.5
#final_estimator = RandomForestClassifier()                 # 0.65/0.4
#final_estimator = XGBClassifier()                          # 0.65, 0.39

#final_estimator = LinearDiscriminantAnalysis()
#final_estimator = KNeighborsClassifier()
#final_estimator = MultinomialNB()

In [115]:
model_sc = StackingClassifier( estimators=estimators,
                               final_estimator=final_estimator,
                               stack_method='auto',            #auto, predict_proba, decision_function, predict
                               cv=None,                        # num folds (5 if None) 
                               passthrough=False,              # whether pass data to final estimator or just preds
                               n_jobs=-1,
                              )
model_sc.fit( X_train, y_train )

StackingClassifier(estimators=[('lr',
                                Pipeline(steps=[('vect',
                                                 CountVectorizer(binary=True,
                                                                 min_df=2,
                                                                 ngram_range=(1,
                                                                              2))),
                                                ('clf',
                                                 LogisticRegression(max_iter=500,
                                                                    n_jobs=-1,
                                                                    penalty='l1',
                                                                    random_state=47,
                                                                    solver='liblinear'))])),
                               ('svm',
                                Pipeline(steps=[('vect',
              

In [None]:
y_pred = model_sc.predict( X_test )
# class ids are passed explicitly so target_names always lines up with them
class_ids = sorted(key_to_label)
labels = [key_to_label[k] for k in class_ids]
print( classification_report( y_test, y_pred, labels=class_ids, target_names=labels, digits=4, ) )

In [None]:
# pin the class order so the matrix always matches the tick labels
class_ids = sorted(key_to_label)
cm = confusion_matrix( y_test, y_pred, labels=class_ids )
plot_confusion_matrix( cm, [key_to_label[k] for k in class_ids], figsize=(10,10), )

## Naive Bayes

In [49]:
# VECTORIZER PARAMETERS
sw_orig  = ['a', 'an', 'the', 'thank',]
sw_prec  = [ 'achieve', 'one', 'do', 'success', 'team', 'continue', 'hard', 'beyond', 'focus', 'deliver',
             'project', 'much', 'and', 'thank', 'other', 'like', 'happy', 'make', 'say', 'want', 'of',
             'role', 'good', 'with', 'we', 'month', 'an', "'s", 'some', 'very', 'what', 'can', 'sure',
             'she', 'which', 'a', 'everything', 'again', 'you', 'move', 'your', 'change', 'at', 'ensure',
             'effort', 'best', 'by', 'take', 'the', 'get', 'work', 'lot', 'his', 'proud', 'for', ]     # all
sw_prec2 = [ 'a', 'an', 'the', 'thank', "'s", 'at', 'team', 'month', 'hard', 'work', 'your', 'can',
             'for', 'make', 'best', 'continue', 'of', 'very', 'what', 'we', 'one', 'she', 'want', ]    # short
all_15   = [ 'a', 'an', 'the', 'thank', 'you', 'with', 'to', 'time', 'this', 'that', 'take', 'person',
              'our', 'on', 'not', 'it', 'in', 'have', 'great', 'good', 'ellipsis', 'do', 'be', 'as', 'and', 'always',]

vect_params = {
    'max_df': 1.0,
    'min_df': 4,    
    'analyzer': 'char',
    'ngram_range': (1,8),
    'binary': False,
    'stop_words': all_15,
}

# CLASSIFIER PARAMETERS
clf_params = {
    'alpha': 1.0,
    'fit_prior': True,
}

#vectorizer = TfidfVectorizer( **vect_params )
vectorizer = CountVectorizer( **vect_params )

clf = MultinomialNB( **clf_params )
svd = TruncatedSVD(n_components=500)
lda = LinearDiscriminantAnalysis()    #solver='lsqr','eigen'

In [None]:
# TFIDF LOOKS SLIGHTLY BETTER? OR IS IT JUST NOT CROSS-VALIDATED?
model_nb = Pipeline( steps=[ ('vect', vectorizer), ('svd', svd),
                               ('lda', lda), ], )
model_nb.fit( X_train, y_train )

# testset 1
y_pred = model_nb.predict( X_test )
# class ids are passed explicitly so target_names always lines up with them
class_ids = sorted(key_to_label)
labels = [key_to_label[k] for k in class_ids]
print( classification_report( y_test, y_pred, labels=class_ids, target_names=labels) )

## Feature importance in NB

In [140]:
# TOTAL NUMBER OF CLASSES AND FEATURES
# Takes the pipeline step named 'clf' out of model_nb and displays the shape
# of its feature_log_prob_ attribute.
# NOTE(review): the model_nb pipeline built in the "Naive Bayes" section has only
# the steps 'vect', 'svd' and 'lda', so this lookup needs a pipeline that has a
# 'clf' step (presumably the MultinomialNB instance) - confirm before re-running.
classifier = model_nb['clf']
classifier.feature_log_prob_.shape

(15, 166143)

In [None]:
# PRINT MOST IMPORTANT FEATURES PER CATEGORY
print('PRINT 100 MOST IMPORTANT FEATURES FOR EACH CLASS:\n')
# the vocabulary list is the same for every class, so it is built once up front
feature_names = model_nb['vect'].get_feature_names()
for class_id, log_probs in zip(classifier.classes_, classifier.feature_log_prob_):
    this_class = key_to_label[class_id]
    print('**********' + this_class.upper() + '********************')
    prob_sorted = log_probs.argsort()                    # ascending, so the top features are at the end
    print(np.take(feature_names, prob_sorted[-100:]))
    print()

In [145]:
# GET N MOST IMPORTANT FEATURES FOR EACH CATEGORY AND FIND FEATURES OCCURRING IN MAX NUMBER OF CATEGORIES
n = 1000
# the vocabulary list is the same for every class, so it is built once up front
feature_names = model_nb['vect'].get_feature_names()
important_features = dict()
# keys come from the same fitted step whose log-probabilities are read
# (previously taken from the unrelated module-level `clf`)
for class_id, log_probs in zip(classifier.classes_, classifier.feature_log_prob_):
    prob_sorted = log_probs.argsort()                    # ascending, so the top n are the last n
    important_features[class_id] = np.take(feature_names, prob_sorted[-n:])

In [146]:
# FEATURES OCCURRING IN EACH CATEGORY
# For every feature, count in how many categories it is among the top features.
# Each category contributes at most 1 per feature (hence the set), and the Counter
# avoids re-scanning every category's array once per vocabulary entry.
category_counts = Counter()
for top_features in important_features.values():
    category_counts.update(set(top_features))

vocabulary = set(category_counts)
feature_frequency = [(count, feature) for feature, count in category_counts.items()]

In [None]:
# NUMBER IN EACH TUPLE MEANS THE NUMBER OF CATEGORIES IN WHICH THIS NGRAM OCCURS
sorted(feature_frequency, reverse=True)

## RANDOM FOREST

In [112]:
sw2 = [ 'of', 'a','an','the', 'thank' ]

# VECTORIZER PARAMETERS
vect_params = {
    'max_df': 1.0,
    'min_df': 2,    
    'analyzer': 'word',
    'ngram_range': (1,3),
    'binary': True,
    'stop_words': sw2,
}

# CLASSIFIER PARAMETERS
clf_params_rf = {
    'n_estimators': 100,
    'criterion': 'gini',                         # “gini”, “entropy”
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'auto',                      # “auto”, “sqrt”, “log2”
    'class_weight': 'balanced',                  # dict, 'balanced', 'balanced_subsample', None
    'random_state': random_state,
    'n_jobs': -1,
}

#vectorizer = TfidfVectorizer( **vect_params )
vectorizer = CountVectorizer( **vect_params )
svd = TruncatedSVD(n_components=100)

clf = RandomForestClassifier( **clf_params_rf )

In [None]:
# FIT AND TEST MODEL
model_rf = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )
model_rf.fit( X_train, y_train )

# testset 1
y_pred = model_rf.predict( X_test )
# class ids are passed explicitly so target_names always lines up with them
class_ids = sorted(key_to_label)
labels = [key_to_label[k] for k in class_ids]
print( classification_report( y_test, y_pred, labels=class_ids, target_names=labels) )

## kNN

In [37]:
# VECTORIZER PARAMETERS
vect_params = {
    'max_df': 1.0,
    'min_df': 5,
    'analyzer': 'char',
    'ngram_range': (1,5),
    'binary': True,
    'stop_words': None,       # was sw: stop words only apply with analyzer='word', so the
                              # list was ignored (with a warning) for character n-grams
}

# CLASSIFIER PARAMETERS
clf_params = {
    'n_neighbors': 5,
    'weights': 'distance',    # default='uniform', {'uniform', 'distance'}
    'algorithm': 'auto',      # default='auto', {'auto', 'ball_tree', 'kd_tree', 'brute'}
    'metric': 'minkowski',    # default='minkowski' { 'euclidean', 'cosine', } + sklearn.neighbors.VALID_METRICS['brute']
    'p': 2,                   # default=2, p for minkowski distance
    'n_jobs': -1,
}

#vectorizer = TfidfVectorizer( **vect_params )
vectorizer = CountVectorizer( **vect_params )

clf = KNeighborsClassifier( **clf_params )

In [None]:
# FIT AND TEST MODEL
model_knn = Pipeline( steps=[('vect', vectorizer), ('clf', clf)] )
model_knn.fit( X_train, y_train )

y_pred = model_knn.predict( X_test )
# class ids are passed explicitly so target_names always lines up with them
class_ids = sorted(key_to_label)
labels = [key_to_label[k] for k in class_ids]
print( classification_report( y_test, y_pred, labels=class_ids, target_names=labels) )

In [446]:
sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute'])

['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'precomputed']

# APPENDIX

## Save model

In [498]:
import joblib
import pickle

In [502]:
# 4MB
with open('model_pickle.pkl','wb') as f:
    pickle.dump(pipe, f)

In [503]:
# 4MB, saves slower than pickle (stackoverflow - better for numpy arrays which are part of the model)
joblib.dump(pipe, "model_joblib.pkl")

['model_joblib.pkl']

In [504]:
with open('model_pickle.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [505]:
# loads slightly slower than pickle
pipe = joblib.load("model_joblib.pkl")