In [27]:
# Import data handling libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import numpy as np
from scipy.sparse import csr_matrix, hstack

# Import functions from local scripts
import sys
sys.path.insert(1, './scripts/development')
import scripts.development.preprocessing as pre
import scripts.development.experiment as ex

# Import modeling libraries
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold

# Import metrics functions
from sklearn.metrics import classification_report, f1_score, recall_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Import function libraries
from functools import partial

# Import balancing libraries
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, AllKNN
from imblearn.combine import SMOTEENN, SMOTETomek

# Import I/O libraries
from pickle import dump

# Filter warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
tickets_df = pd.read_csv("../data/labeled_tickets.csv", index_col=0)
tickets_df.head()

Unnamed: 0,Description,is_about_order_status
0,"- __EMAIL__ Hi , I have just ordered a pair of...",0
1,I am missing a pair of shoes from my order. Co...,0
2,I didn'tget a my order - __EMAIL__,1
3,"Hello, I ordered two __PRODUCTS_NAMES and one ...",0
4,My shipment never was delivered. The tracking ...,1


In [3]:
# Rename column name from Description to text.
# I will keep this consistent throughout the functions
tickets_df = tickets_df.rename(columns={'Description': 'text'})
tickets_df.head(1)

Unnamed: 0,text,is_about_order_status
0,"- __EMAIL__ Hi , I have just ordered a pair of...",0


In [4]:
# Ensure that dtype is string
tickets_df['text'] = tickets_df['text'].astype('string')

In [5]:
# Translate all text into English
tickets_df['text'] = tickets_df['text'].apply(lambda x: pre.translate_to_en(x))

In [6]:
# Check how imbalanced the dataset is
tickets_df["is_about_order_status"].value_counts()

0    1249
1     184
Name: is_about_order_status, dtype: int64

### What is next?

We can see that our dataset is very imbalanced. We will deal with that later. For now, I just want to extract some samples for testing and the remaining files for training. I will go with a 20% split.

I will also define the preprocessing functions for the raw text in the python script files and test them accordingly.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(tickets_df['text'], tickets_df["is_about_order_status"],
                                                    stratify=tickets_df["is_about_order_status"],
                                                    test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1146,) (287,) (1146,) (287,)


In [8]:
# Convert y_train and y_test to int pandas.Series
y_train = y_train.astype('int8')
y_test = y_test.astype('int8')

### Raw text processing

1. I will want to check if a text is in a language different than English. If it is, I will translate it into English.
2. I will measure text length and store it as a feature.
3. I will measure average word length and store it as a feature.
4. I will measure number of words and store it as a feature.
4. I will use a library to reduce spelling errors.
5. I will use a library to expand words like can't to can not.
6. I will measure the numeric counts of each message (the count of number values present in each message)
7. I will convert all text to lowercase
8. I will get a set of all tokens in the texts (like __email__ and __company__ etc.) and I will create a column for each in which I will count their occurences. I will then remove these tokens from each text. The set of tokens will be stored in the script file to be used with the general processing function. It will be easy to later find new tokens and add them to the set.
9. I will count number of emails in each message and remove them.
10. I will count and remove any url from the text.
11. I will measure the number of stopwords and store it as a feature.
12. I will remove stopwords, single characters and special characters from text.
13. I will stem the remaining words.

All these steps will be wrapped into a function that preprocesses raw text. I will also create a function which uses the former function for a pandas series, so I can easily use it when testing ways to balance the dataset.

In [9]:
# Because ticket tokens are present in the raw text, we can extract here the token set and see if we spot anything wrong.
token_set = pre.get_tokens(X_train)
token_set

{'_ADDRESS_',
 '_COMPANNY_',
 '_COMPANY_',
 '_COMPANY__',
 '_COMPANY___',
 '_CREDIT_',
 '_DATE_',
 '_DATE__PRODUCT_',
 '_INVOICE_NUMBER_',
 '_ITEM_PHOTO_',
 '_LOCATION_',
 '_MONTH_',
 '_NAME_',
 '_NAME__ADRRESS_',
 '_ORDER_NUMBBER_',
 '_ORDER_NUMBER_',
 '_OTHER_PI_',
 '_PHONE_',
 '_PRICE_',
 '_PRODUCTS_',
 '_PRODUCTS_NAMES_',
 '_PRODUCT_',
 '_PRODUCT_NAME_',
 '_TRACKING_NUMBER_',
 '_URL_',
 '__ADDRESS__',
 '__AMOUNT__',
 '__COMPANY_NAME__',
 '__COMPANY__',
 '__CREDIT_CARD__',
 '__DATE__',
 '__DISCOUNT_CODE__',
 '__EMAIL__',
 '__INVOICE_NUMBER__',
 '__MAIN__',
 '__NAMES__',
 '__NAME__',
 '__NAME____',
 '__ORDER_NUMBER__',
 '__OTHER_PI__',
 '__PHONE__',
 '__PLACE__',
 '__PLACE____',
 '__PRODUCTS_NAMES__',
 '__PRODUCT_NAMES__',
 '__PRODUCT_NAME__',
 '__PRODUCT__NAMES__',
 '__PRODUCT__NAME__',
 '__REFERENCE_NUMBER__',
 '__TRACKING_NUMBER__',
 '__URL__'}

We can spot some mistakes from our regex which we will eliminate from the set:
\_COMPANY___, \_COMPANY__ \_NAME__ADRRESS_, \_NUMBER__, \__DATE____, \_DATE__PRODUCT_, \__NAME____, \__PLACE____

In order to extract features from these tokens it is best to define a dictionary where the key is the column and the value is a list of tokens corresponding to the column. For each token in the list, we will increase the corresponding column value by 1.
Example: column value is product_name and the token list is \[\_product_name_, \_product_names_]. Then for each token of value \_product_name_ or \_product_names_, we will increase the product_name column value by 1.

We will store the final dictionary into a file in order to be used later.

In [10]:
# Remove incorrect tokens and check all the tokens visually
token_set.difference({"_COMPANY__", "_COMPANY___", "_NAME__ADRRESS_", "_NUMBER__", "__DATE____",
                      "\_DATE__PRODUCT_", "\__NAME____", "\__PLACE____"})

{'_ADDRESS_',
 '_COMPANNY_',
 '_COMPANY_',
 '_CREDIT_',
 '_DATE_',
 '_DATE__PRODUCT_',
 '_INVOICE_NUMBER_',
 '_ITEM_PHOTO_',
 '_LOCATION_',
 '_MONTH_',
 '_NAME_',
 '_ORDER_NUMBBER_',
 '_ORDER_NUMBER_',
 '_OTHER_PI_',
 '_PHONE_',
 '_PRICE_',
 '_PRODUCTS_',
 '_PRODUCTS_NAMES_',
 '_PRODUCT_',
 '_PRODUCT_NAME_',
 '_TRACKING_NUMBER_',
 '_URL_',
 '__ADDRESS__',
 '__AMOUNT__',
 '__COMPANY_NAME__',
 '__COMPANY__',
 '__CREDIT_CARD__',
 '__DATE__',
 '__DISCOUNT_CODE__',
 '__EMAIL__',
 '__INVOICE_NUMBER__',
 '__MAIN__',
 '__NAMES__',
 '__NAME__',
 '__NAME____',
 '__ORDER_NUMBER__',
 '__OTHER_PI__',
 '__PHONE__',
 '__PLACE__',
 '__PLACE____',
 '__PRODUCTS_NAMES__',
 '__PRODUCT_NAMES__',
 '__PRODUCT_NAME__',
 '__PRODUCT__NAMES__',
 '__PRODUCT__NAME__',
 '__REFERENCE_NUMBER__',
 '__TRACKING_NUMBER__',
 '__URL__'}

In [11]:
# Define dictionary
token_dictionary = {"address_count": ['_ADDRESS_', '__ADDRESS__', '_LOCATION_', '__PLACE__'],
                    "company_name_count": ['_COMPANNY_', '_COMPANY_', '__COMPANY_NAME__', '__COMPANY__'],
                    "credit_card_count": ['_CREDIT_', '__CREDIT_CARD__'],
                    "date_count": ['_DATE_', '_MONTH_', '__DATE__'],
                    "invoice_count": ['_INVOICE_NUMBER_', '__INVOICE_NUMBER__'],
                    "photo_count": ['_ITEM_PHOTO_'],
                    "name_count": ['_NAME_', '__NAMES__', '__NAME__'],
                    "order_count": ['_ORDER_NUMBBER_', '_ORDER_NUMBER_', '__ORDER_NUMBER__'],
                    "other_pi_count": ['_OTHER_PI_', '__OTHER_PI__'],
                    "phone_count": ['_PHONE_', '__PHONE__'],
                    "price_count": ['_PRICE_', '__AMOUNT__'],
                    "product_count": ['_PRODUCT_', '_PRODUCT_NAME_', '__PRODUCTS_NAMES__', '__PRODUCT_NAMES__', 
                                      '__PRODUCT_NAME__', '__PRODUCT__NAMES__', '__PRODUCT__NAME__'],
                    "tracking_number_count": ['_TRACKING_NUMBER_', '__TRACKING_NUMBER__'],
                    "url_count": ['_URL_', '__URL__'],
                    "discount_code_count": ['__DISCOUNT_CODE__'],
                    "email_count": ['__EMAIL__'],
                    "reference_number_count": ['__REFERENCE_NUMBER__']
                   }

# However, because there are some tokens that are contained within other tokens, we found a way to use the dictionary above
# such that we can replace the biggest strings first. And so we want to obtain from the dictionary above something like
# {'_ADDRESS_': 'address_count', '__ADDRESS__': 'address_count', '_LOCATION_': 'address_count'} etc.
# We do this by converting our dictionary in the following way
new_token_dict = {}
for key, val in token_dictionary.items():
    for x in val:
        # We now have each value in the lists as key. And the value at that key is the original value of the key.
        # Or simply put: the group the item in the list is part of
        new_token_dict.setdefault(x, key)

# Save token dictionary to file
with open("../data/token_dictionary.json", "w") as file:
    json.dump(new_token_dict, file)
    file.close()

## Getting a first result

We can now use the preprocessing functions defined in the python scripts to get a first result on some classifiers.
I will use the tf-idf vectorizer from sklearn to achieve trainable data

In [12]:
def get_tfidf_vectorizer(cleaned_text):
    """
    Function which takes a dataframe containing preprocessed text, fits a tf-idf vectorizer through it 
    and returns the vectorizer.
    :param cleaned_text: The preprocessed text on which we will use tf-idf.
    :return: A scikit-learn.TfidfVectorizer instance that was fitted on the provided preprocessed text.
    """
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer.fit(cleaned_text)
    
    return tfidf_vectorizer

In [13]:
def get_scaler(manual_features):
    """
    Function which takes a dataframe containing preprocessed text, fits a scaler through it 
    and returns the scaler.
    :param manual_features: TA dataframe representing the manual features consisting of numeric values.
    :return: A scikit-learn.MinMaxScaler instance that was fitted on the provided dataframe.
    """
    scaler = MinMaxScaler()
    scaler.fit(manual_features)
    
    return scaler

In [14]:
def get_text_m_feats(dataset, token_dictionary, with_manual_features):
    """
    Function which takes a dataframe containing text, preprocesses it and returns two dataframes representing the
    cleaned text dataframe and the extracted manual features.
    :param dataset: The text dataset. It must be a pandas.Series containing rows of text.
    :param token_dictionary: dictionary of tokens, where the key is the general meaning of the token and what we will
    use for column naming and the value is a list of actual values found in the text.
    :param with_manual_features: Flag. If set to True, return the extracted manual features as dataframe. 
    If set to False, return a None value instead.
    :return: Two pandas.DataFrame s, the first representing the cleaned text and the second representing the manual features.
    """
    p_dataset = pre.preprocess_text_series(dataset, token_dictionary, with_manual_features)
    m_feats_dataset = p_dataset.drop(columns='text')
    text_dataset = p_dataset['text']
    
    if with_manual_features:
        return text_dataset, m_feats_dataset
    else:
        return text_dataset, None

In [15]:
def obtain_tfidf_matrix(token_dictionary, train, with_manual_features=True):
    """
    Function which takes a train dataframe containing text, preprocesses it and converts it into a sparse
    matrix through a tf-idf vectorizer.
    :param token_dictionary: dictionary of tokens, where the key is the general meaning of the token and what we will
    use for column naming and the value is a list of actual values found in the text.
    :param train: The train dataset. It must be a pandas.Series containing rows of text.
    :param with_manual_features: Flag. If set to False, do not append manual features to the resulting DataFrame.
    Append them if set to True. Default is True.
    :return: A sparse matrix, being the representation of the train dataset after tf-idf vectorization.
    """
    text_train, m_feats_train = get_text_m_feats(train, token_dictionary, with_manual_features)
    
    tfidf_vectorizer = get_tfidf_vectorizer(text_train)
    tfidf_train = tfidf_vectorizer.transform(text_train)
    
    if with_manual_features:
        scaler = get_scaler(m_feats_train)
        scaled_m_feats_train = pd.DataFrame(scaler.transform(m_feats_train), index = m_feats_train.index.values)
        tfidf_train = hstack((tfidf_train, csr_matrix(scaled_m_feats_train)))
    
    return tfidf_train

In [16]:
def transform_to_tfidf_matrix(token_dictionary, tfidf_vectorizer, scaler, test, with_manual_features=True):
    """
    Function which takes a test dataframe containing text, preprocesses it and converts it into a sparse
    matrix through a tf-idf vectorizer.
    :param token_dictionary: dictionary of tokens, where the key is the general meaning of the token and what we will
    use for column naming and the value is a list of actual values found in the text.
    :param tfidf_vectorizer: A scikit-learn.TfidfVectorizer that is already fit on data.
    :param scaler: A scikit-learn.MinMaxScaler that is already fit on data.
    :param test: The test dataset. It must be a pandas.Series containing rows of text.
    :param with_manual_features: Flag. If set to False, do not append manual features to the resulting DataFrame.
    Append them if set to True. Default is True.
    :return: A sparse matrix, being the representation of the test dataset after tf-idf vectorization.
    """
    text_test, m_feats_test = get_text_m_feats(test, token_dictionary, with_manual_features)
    
    tfidf_test = tfidf_vectorizer.transform(text_test)
    
    if with_manual_features:
        scaled_m_feats_test = pd.DataFrame(scaler.transform(m_feats_test), index = m_feats_test.index.values)
        tfidf_test = hstack((tfidf_test, csr_matrix(scaled_m_feats_test)))
    
    return tfidf_test

In [17]:
# We can use partial so we can only call this function with parameters that actually change
obtain_tfidf = partial(obtain_tfidf_matrix, new_token_dict)

In [30]:
def evaluate_model(train, model, model_name, target_train=y_train):
    """
    Function which trains a model on a train set and builds a dataframe containing a list of metrics based on testing
    the model through cross validation.
    :param train: The train dataset. It must be a pandas.Series containing rows of text.
    :param model: The scikit-learn model that will be trained on the train data.
    :param model_name: A string representing the name of the model to be trained and evaluated.
    :param with_manual_features: Flag. If set to False, do not append manual features to the resulting DataFrame.
    Append them if set to True. Default is True.
    :return: a dataframe containing values for the following columns: 
    [model_name, test_average_precision, train_average_precision, test_f1, train_f1, 
    test_recall, train_recall, test_accuracy, train_accuracy]
    """
    # Get the mean of every metric on a RepeatedStratifiedKFold that does 5 random splits, 5 times, meaning that
    # we fit the model 25 times and average the result
    results = cross_validate(model, train, target_train, 
                   scoring=['average_precision', 'f1', 'recall', 'accuracy'], return_estimator=True,
                             return_train_score=True, cv=RepeatedStratifiedKFold(n_repeats=5))
    # Model the results such that the object will be easy to work with
    results = pd.DataFrame(results)
    # We can drop the estimator because we have the model_name provided
    results = results.drop(columns=['fit_time', 'score_time', 'estimator']).mean(numeric_only=True)
    results = results.rename(model_name)
    
    # The mean function gives us a pandas.Series having the metric as indexes. We need the metrics as columns.
    # We also take the index (model_name) and add it as a column. This will make it easy for us to
    # see the results for each model type
    return pd.DataFrame(results).T.reset_index().rename(columns={'index': 'model_name'})

In [19]:
models = [MultinomialNB(), SVC(random_state=42, class_weight='balanced', probability=True, kernel='linear'),
          RandomForestClassifier(n_estimators=150, random_state=42), LogisticRegression(random_state=42), 
          SGDClassifier(loss='log_loss', random_state=42)]
model_names = ['Naive Bayes', 'Linear SVC', 'RandomForestClassifier', ' Logistic Regression', 'SGD Classifier']

In [31]:
def evaluate_on_all_models(train, model_names, models, target_train=y_train):
    """
    Function which trains a model on a train test and builds a dataframe containing a list of metrics based on testing
    the model through cross validation.
    :param train: The train dataset. It must be a pandas.Series containing rows of text.
    :param test: The test dataset. It must be a pandas.Series containing rows of text.
    :param model_names: A string list representing model names.
    :param models: A list of sci-kit learn model objects. Must be in same order and of same length as model_names.
    :param target_train: An iterable or pd.Series representing the target values of the train set. 
    This parameter is needed when resampling. Default: initial defined y_train from train-test splitting.
    :return: a pd.Dataframe having scores defined at the function evaluate_model for all the models defined at models
    and their names defined at model_names.
    """
    scores = pd.DataFrame(
        columns=["model_name", "test_average_precision", "train_average_precision", "test_f1",
                 "train_f1", "test_recall", "train_recall", "test_accuracy", "train_accuracy"])
    for model, model_name in list(zip(models, model_names)):
        row = evaluate_model(train, model, model_name, target_train)
        scores = pd.concat([scores, row])
    
    return scores

In [22]:
# Define list of sampling methods
sampling_method_names = ['oversampling_SMOTE', 'oversampling_ADASYN', 'oversampling_random',
                     'undersampling_Cluster_Centroids', 'undersampling_All_KNN', 'undersampling_random',
                     'mixed_SMOTEEN', 'mixed_SMOTETomek']
sampling_methods = [SMOTE(random_state=42), ADASYN(random_state=42), RandomOverSampler(random_state=42, sampling_strategy='minority'),
                 ClusterCentroids(random_state=42), AllKNN(), RandomUnderSampler(sampling_strategy='majority' ,random_state=42),
                 SMOTEENN(random_state=42), SMOTETomek(random_state=42)]

In [32]:
def evaluate_balancing(sampling_names, samplings, model_names, models, train, with_manual_features=True):
    """
    Function which returns a DataFrame containing the scores for all of the models on all of the sampling methods.
    :param sampling_names: A list containing the names of the sampling methods.
    :param samplings: A list containing sampling instances, in the same order as sampling_names so that 
    they correspond to each other
    :param model_names: A string list representing model names.
    :param models: A list of sci-kit learn model objects. Must be in same order and of same length as model_names.
    :param train: The train dataset. It must be a pandas.Series containing rows of text.
    :param with_manual_features: Whether to add manually defined features to the extracted tf-idf data extracted 
    at preprocessing phase.
    """
    train = obtain_tfidf(train, with_manual_features)
    results_df = evaluate_on_all_models(train, model_names, models)
    row_length_per_loop = results_df.shape[0]
    sampling_name_list = ['None'] * row_length_per_loop
    
    for sampling_name, sampling in list(zip(sampling_names, samplings)):
        X_resampled, y_resampled = sampling.fit_resample(train, y_train)
        temp_results_df = evaluate_on_all_models(X_resampled, model_names, models, y_resampled)
        sampling_name_list.extend([sampling_name] * row_length_per_loop)
        results_df = pd.concat([results_df, temp_results_df], ignore_index=True)
    
    return pd.concat([results_df, pd.Series(sampling_name_list, name='sampling_method')], axis=1)

In [33]:
%%time
# Evaluate all the 5 models on all of the sampling possibilities to check which one performs best, while using the text data
# with manual features.
with_manual_df = evaluate_balancing(sampling_method_names, sampling_methods, model_names, models, X_train, True)
with_manual_df.sort_values(by=['test_average_precision', 'test_f1', 'test_recall', 'test_accuracy'], ascending=False)

CPU times: total: 8min 18s
Wall time: 8min 16s


Unnamed: 0,model_name,test_average_precision,train_average_precision,test_f1,train_f1,test_recall,train_recall,test_accuracy,train_accuracy,sampling_method
36,Linear SVC,0.999989,1.0,0.998902,1.0,0.9998,1.0,0.998264,1.0,mixed_SMOTEEN
39,SGD Classifier,0.999964,1.0,0.998304,1.0,0.9998,1.0,0.997317,1.0,mixed_SMOTEEN
17,RandomForestClassifier,0.999943,1.0,0.979635,1.0,1.0,1.0,0.97918,1.0,oversampling_random
38,Logistic Regression,0.999887,0.999984,0.993453,0.997728,1.0,1.0,0.989583,0.996409,mixed_SMOTEEN
37,RandomForestClassifier,0.999881,1.0,0.99561,1.0,0.997997,1.0,0.993053,1.0,mixed_SMOTEEN
35,Naive Bayes,0.999851,0.999961,0.946979,0.961067,1.0,1.0,0.911595,0.936109,mixed_SMOTEEN
12,RandomForestClassifier,0.998049,1.0,0.972632,1.0,0.9908,1.0,0.972084,1.0,oversampling_ADASYN
7,RandomForestClassifier,0.997541,1.0,0.970513,1.0,0.983584,1.0,0.970067,1.0,oversampling_SMOTE
42,RandomForestClassifier,0.997461,1.0,0.971241,1.0,0.984984,1.0,0.970774,1.0,mixed_SMOTETomek
19,SGD Classifier,0.981908,0.998827,0.962239,0.989809,0.9982,0.9998,0.96076,0.98969,oversampling_random


In [34]:
%%time
# Evaluate all the 5 models on all of the sampling possibilities to check which one performs best, while using the text data
# without manual features.
wo_manual_df = evaluate_balancing(sampling_method_names, sampling_methods, model_names, models, X_train, False)
wo_manual_df.sort_values(by=['test_average_precision', 'test_f1', 'test_recall', 'test_accuracy'], ascending=False)

CPU times: total: 7min 50s
Wall time: 7min 51s


Unnamed: 0,model_name,test_average_precision,train_average_precision,test_f1,train_f1,test_recall,train_recall,test_accuracy,train_accuracy,sampling_method
36,Linear SVC,1.0,1.0,0.999401,1.0,1.0,1.0,0.999046,1.0,mixed_SMOTEEN
39,SGD Classifier,0.999997,1.0,0.999103,1.0,1.0,1.0,0.99857,1.0,mixed_SMOTEEN
38,Logistic Regression,0.999985,0.999999,0.996514,0.999199,1.0,1.0,0.994434,0.998727,mixed_SMOTEEN
37,RandomForestClassifier,0.999973,1.0,0.996699,1.0,0.997997,1.0,0.994749,1.0,mixed_SMOTEEN
35,Naive Bayes,0.999971,0.999996,0.950628,0.970087,1.0,1.0,0.917423,0.951034,mixed_SMOTEEN
17,RandomForestClassifier,0.999868,1.0,0.975426,1.0,0.999198,1.0,0.974774,1.0,oversampling_random
12,RandomForestClassifier,0.997845,1.0,0.968201,1.0,0.994563,1.0,0.96678,1.0,oversampling_ADASYN
7,RandomForestClassifier,0.997113,1.0,0.967231,1.0,0.990791,1.0,0.966369,1.0,oversampling_SMOTE
42,RandomForestClassifier,0.996964,1.0,0.966195,1.0,0.989791,1.0,0.965261,1.0,mixed_SMOTETomek
44,SGD Classifier,0.985193,0.998338,0.960896,0.987911,0.992593,0.999449,0.959558,0.987763,mixed_SMOTETomek


In [35]:
# Add column to differentiate between w/ and w/o manual features
with_manual_df['m_feats'] = True
wo_manual_df['m_feats'] = False
all_results_df = pd.concat([with_manual_df, wo_manual_df], ignore_index=True)
all_results_df.sort_values(by=['test_average_precision', 'test_f1', 'test_recall', 'test_accuracy'], ascending=False)

Unnamed: 0,model_name,test_average_precision,train_average_precision,test_f1,train_f1,test_recall,train_recall,test_accuracy,train_accuracy,sampling_method,m_feats
81,Linear SVC,1.0,1.0,0.999401,1.0,1.0,1.0,0.999046,1.0,mixed_SMOTEEN,False
84,SGD Classifier,0.999997,1.0,0.999103,1.0,1.0,1.0,0.99857,1.0,mixed_SMOTEEN,False
36,Linear SVC,0.999989,1.0,0.998902,1.0,0.9998,1.0,0.998264,1.0,mixed_SMOTEEN,True
83,Logistic Regression,0.999985,0.999999,0.996514,0.999199,1.0,1.0,0.994434,0.998727,mixed_SMOTEEN,False
82,RandomForestClassifier,0.999973,1.0,0.996699,1.0,0.997997,1.0,0.994749,1.0,mixed_SMOTEEN,False
...,...,...,...,...,...,...,...,...,...,...,...
2,RandomForestClassifier,0.661767,1.0,0.30053,1.0,0.185103,1.0,0.891977,1.0,,True
46,Linear SVC,0.64928,0.925489,0.63489,0.857634,0.727632,1.0,0.892846,0.957329,,False
1,Linear SVC,0.642544,0.928927,0.638268,0.858607,0.723632,1.0,0.895451,0.957722,,True
45,Naive Bayes,0.538275,0.769662,0.0,0.02129,0.0,0.010874,0.869111,0.872295,,False


## Evaluating the models

Now that all the models have been evaluated on the train set through cross-validation, we can check how the best models are doing on our hold out test set and then pick one of them to go forward with.

Our top 5 models are:

model_name	sampling_method	m_feats \
Linear SVC	mixed_SMOTEEN False \
SGD Classifier	mixed_SMOTEEN False \
Linear SVC	mixed_SMOTEEN True \
Logistic Regressionr  mixed_SMOTEEN	False \
Random Forest Classifier  mixed_SMOTEEN	False \

Therefore, we will fit these 5 models again on the given datasets and then compare their results and choose the best one for later predictions.

In [79]:
# Create the augmented dataset and scalers/vectorizers

# SMOTEEN with manual features
text_train, m_feats_train = get_text_m_feats(X_train, new_token_dict, True)

tfidf_vectorizer_with_mf = get_tfidf_vectorizer(text_train)
tfidf_train = tfidf_vectorizer_with_mf.transform(text_train)

scaler_with_mf = get_scaler(m_feats_train)
scaled_m_feats_train = pd.DataFrame(scaler_with_mf.transform(m_feats_train), index = m_feats_train.index.values)

train_with_mf = hstack((tfidf_train, csr_matrix(scaled_m_feats_train)))

X_SMOTEEN_with_mf, y_SMOTEEN_with_mf = SMOTEENN(random_state=42).fit_resample(train_with_mf, y_train)


# SMOTEENN without manual features
text_train, _ = get_text_m_feats(X_train, new_token_dict, False)

tfidf_vectorizer_wo_mf = get_tfidf_vectorizer(text_train)
tfidf_train_wo_mf = tfidf_vectorizer_wo_mf.transform(text_train)

X_SMOTEEN_wo_mf, y_SMOTEEN_wo_mf = SMOTEENN(random_state=42).fit_resample(tfidf_train_wo_mf, y_train)

In [82]:
# Process test data for both with and without manual features for testing purposes
X_test_with_mf = transform_to_tfidf_matrix(
    new_token_dict, tfidf_vectorizer_with_mf, scaler_with_mf, X_test, True)
X_test_wo_mf = transform_to_tfidf_matrix(new_token_dict, tfidf_vectorizer_wo_mf, None, X_test, False)

In [38]:
# Create the 5 models that performed the best (3 will be created actually, but 2 will be fitted on 2 types of datasets)
svc = SVC(random_state=42, class_weight='balanced', probability=True, kernel='linear')
sgd = SGDClassifier(random_state=42, loss='log_loss')
rf = RandomForestClassifier(n_estimators=150, random_state=42)
lr = LogisticRegression(random_state=42)

In [39]:
# Create function to fit a model on a training set and evaluate it on a test set
def fit_and_evaluate(train_set, y_train, test_set, y_test, model_name, model):
    """
    Function which trains a model on a train set and builds a dataframe containing a list of metrics based on testing
    the model on the test data.
    :param train_set: The train dataset. It must be a pandas.Series containing rows of text.
    :param y_train: The target values in the train set used for fitting the model.
    :param test_set: The test dataset. It must be a pandas.Series containing rows of text.
    :param y_test: The target values in the test set used for evaluating the reliability of the model.
    :param model_name: A string representing the name of the model to be trained and evaluated.
    :param model: The scikit-learn model that will be trained on the train data.
    :return: a list of 6 items representing: 
    [model name, precision-recall-auc score of the model, f1 score of the positive class, 
    recall score of the positive class, accuracy score]
    """
    model.fit(train_set, y_train)
    predicted = model.predict(test_set)

    acc = accuracy_score(y_test, predicted)
    f1_sc = f1_score(y_test, predicted, average=None)
    ap_auc = average_precision_score(y_test, model.predict_proba(test_set)[:, 1])
    recall_of_positive = recall_score(y_test, predicted, zero_division=0)
    return [model_name, ap_auc, f1_sc[1], recall_of_positive, acc]

In [40]:
# Create function to fit data on models and test the models on a test data set.
def fit_and_evaluate_all(train_set, y_train, test_set, y_test, model_names, models):
    """
    Function which trains a list of models on a train test and builds a dataframe containing a list of metrics 
    based on testing the models on the given test data.
    :param train_set: The train dataset. It must be a pandas.Series containing rows of text.
    :param y_train: The target values in the train set used for fitting the model.
    :param test_set: The test dataset. It must be a pandas.Series containing rows of text.
    :param y_test: The target values in the test set used for evaluating the reliability of the model.
    :param model_names: A list of strings representing the names of the models to be trained and evaluated.
    :param models: The a list of scikit-learn models that will be trained on the train data.
    :return: a pd.Dataframe having scores defined at the function fit_and_evaluate for all the models defined at models
    and their names defined at model_names.
    """
    scores = pd.DataFrame(columns=["model_name", "average_precision", "f1", "recall", "accuracy"])
    for model, model_name in list(zip(models, model_names)):
        row = fit_and_evaluate(train_set, y_train, test_set, y_test, model_name, model)
        scores.loc[len(scores)] = row
    
    return scores

In [83]:
# Fit and predict. Then print statistics.
final_results_with_mf = fit_and_evaluate_all(
    X_SMOTEEN_with_mf, y_SMOTEEN_with_mf, X_test_with_mf, y_test, ['Linear SVC'], [svc])
final_results_wo_mf = fit_and_evaluate_all(
    X_SMOTEEN_wo_mf, y_SMOTEEN_wo_mf, X_test_wo_mf, y_test, 
    ['Linear SVC', 'SGD', 'Logistic Regression', 'Random Forest Classifier'], [svc, sgd, lr, rf])
pd.concat([final_results_with_mf, final_results_wo_mf])

Unnamed: 0,model_name,average_precision,f1,recall,accuracy
0,Linear SVC,0.638826,0.585366,0.972973,0.8223
0,Linear SVC,0.533015,0.507692,0.891892,0.777003
1,SGD,0.541969,0.483221,0.972973,0.731707
2,Logistic Regression,0.538339,0.404494,0.972973,0.630662
3,Random Forest Classifier,0.580823,0.478873,0.918919,0.74216


It looks like we get the best performance when training a Linear Support Vector Machine on a dataset augmented by SMOTENN.
Let's use gridsearch to see if we can obtain even better results.

In [84]:
%%time
# Define parameter grid for svc
param_grid = [{'C': np.logspace(-4, 4, 30)}]

# Define the grid search
grid_search = GridSearchCV(SVC(random_state=42, class_weight='balanced', probability=True, kernel='linear'), param_grid=param_grid, 
                           scoring='average_precision', cv=StratifiedKFold(n_splits=5), n_jobs=-1)

# Do the actual searching
grid_search.fit(X_SMOTEEN_with_mf, y_SMOTEEN_with_mf)

CPU times: total: 1.42 s
Wall time: 53.2 s


In [85]:
# Output the results
pd.DataFrame(grid_search.cv_results_)[['param_C','mean_test_score']].sort_values(
    by='mean_test_score', ascending=False).head()

Unnamed: 0,param_C,mean_test_score
15,1.373824,1.0
16,2.592944,1.0
28,5298.316906,1.0
27,2807.216204,1.0
26,1487.352107,1.0


In [86]:
# Check top 5 results against test class

fit_and_evaluate_all(X_SMOTEEN_with_mf, y_SMOTEEN_with_mf, X_test_with_mf, y_test,
                     ['C=1.37', 'C=2.59', 'C=5298', 'C=2807', 'C=1487'], 
                     [SVC(random_state=42, class_weight='balanced', probability=True, kernel='linear', C=1.37),
                     SVC(random_state=42, class_weight='balanced', probability=True, kernel='linear', C=2.59),
                     SVC(random_state=42, class_weight='balanced', probability=True, kernel='linear', C=5298),
                     SVC(random_state=42, class_weight='balanced', probability=True, kernel='linear', C=2807),
                     SVC(random_state=42, class_weight='balanced', probability=True, kernel='linear', C=1487)])

Unnamed: 0,model_name,average_precision,f1,recall,accuracy
0,C=1.37,0.635812,0.585366,0.972973,0.8223
1,C=2.59,0.638826,0.585366,0.972973,0.8223
2,C=5298,0.638533,0.585366,0.972973,0.8223
3,C=2807,0.638533,0.585366,0.972973,0.8223
4,C=1487,0.638533,0.585366,0.972973,0.8223


There is a slight improvement for the model having C=2.59, so we will pick that one.

I will now save the model, the scaler and the vectorizer to file.

In [88]:
# We have the vectorizer already, but we need to create an SVC model object and fit it
svc = SVC(random_state=42, class_weight='balanced', probability=True, kernel='linear', C=2.59)
svc.fit(X_SMOTEEN_with_mf, y_SMOTEEN_with_mf)

In [90]:
# Save tfidf-vectorizer to file
with open("../data/modeling/tfidf_1.pkl", "wb") as tfidf_file:
    dump(tfidf_vectorizer_with_mf, tfidf_file)
    tfidf_file.close()

In [91]:
# Save tfidf-vectorizer to file
with open("../data/modeling/scaler_1.pkl", "wb") as scaler_file:
    dump(scaler_with_mf, scaler_file)
    scaler_file.close()

In [92]:
# Save the model to file
with open("../data/modeling/model_1_with_mf.pkl", "wb") as model_file:
    dump(svc, model_file)
    model_file.close()

## Experiment

What I would like to try is to artificially enhance my data set by translating the imbalanced data class into another language, then back to English.

As we have seen above, we have 1249 negative class counts and 184 positive class counts. That means that the majority class is 6.8 times bigger. In order to counterbalance this, we need to generate 6 times the data that we have now for the minority class. Or, in simpler terms, we need to translate our messages in 6 different languages, then back to English.

For this, I will create a function in a python script and call it in this notebook.

In [51]:
# Get the indexes for all the positive examples in the training set
positive_indexes = y_train.loc[y_train == 1].index.values

# Get the actual psoitive examples in the training set
X_train_pos = X_train[positive_indexes].astype('string')
X_train_pos.head(1)

501    - __EMAIL__
 

 Ordered on June 20th, order nu...
Name: text, dtype: string

In [52]:
%%time
# Augment the data
aug_X_train = ex.enhance_series(X_train_pos)
aug_X_train.head()

CPU times: total: 13.8 s
Wall time: 3min 58s


0    - __EMAIL__\n \n\n Ordered on June 20th, order...
1    - __EMAIL__\n \n\n Ordered on June 20, order n...
2    - __E-MAIL__\n \n\n Ordered on June 20, order ...
3    - __E-MAIL__\n \n\n Ordered June 20, order num...
4    - __EMAIL__\n \n\n Ordered June 20, order numb...
dtype: object

In [53]:
# Create index to add the data correctly; Last index from the dataset is 1432, so we'll start at 1433 and end at 1443 + 882
index_values = [x for x in range(1433, 1433+882)]
len(index_values)

882

In [54]:
# Index the augmented text series
aug_X_train.index = pd.Index(index_values)
aug_X_train.head()

1433    - __EMAIL__\n \n\n Ordered on June 20th, order...
1434    - __EMAIL__\n \n\n Ordered on June 20, order n...
1435    - __E-MAIL__\n \n\n Ordered on June 20, order ...
1436    - __E-MAIL__\n \n\n Ordered June 20, order num...
1437    - __EMAIL__\n \n\n Ordered June 20, order numb...
dtype: object

In [55]:
# Create indexed target series
aug_y_train = pd.Series([1]*len(index_values), index=index_values)
aug_y_train.head()

1433    1
1434    1
1435    1
1436    1
1437    1
dtype: int64

In [56]:
# Now mix them all together to form a final training df
final_X_train = pd.concat([X_train, aug_X_train])
final_y_train = pd.concat([y_train, aug_y_train])
print(final_X_train.shape, final_y_train.shape)

(2028,) (2028,)


In [57]:
# Check value counts to see if it's balanced
final_y_train.value_counts()

1    1029
0     999
dtype: int64

In [65]:
# Get the tfidf representation of train and test sets with manual features
text_train, m_feats_train = get_text_m_feats(final_X_train, new_token_dict, True)

tfidf_vectorizer_with_mf = get_tfidf_vectorizer(text_train)
tfidf_train = tfidf_vectorizer_with_mf.transform(text_train)

scaler_with_mf = get_scaler(m_feats_train)
scaled_m_feats_train = pd.DataFrame(scaler_with_mf.transform(m_feats_train), index = m_feats_train.index.values)
final_X_train_tfidf = hstack((tfidf_train, csr_matrix(scaled_m_feats_train)))

final_X_test_tfidf = transform_to_tfidf_matrix(
    new_token_dict, tfidf_vectorizer_with_mf, scaler_with_mf, X_test)

In [59]:
results = fit_and_evaluate_all(final_X_train_tfidf, final_y_train, final_X_test_tfidf, y_test, model_names, models)
results

Unnamed: 0,model_name,average_precision,f1,recall,accuracy
0,Naive Bayes,0.628295,0.653061,0.864865,0.881533
1,Linear SVC,0.730447,0.738095,0.837838,0.923345
2,RandomForestClassifier,0.797544,0.780488,0.864865,0.937282
3,Logistic Regression,0.706843,0.72093,0.837838,0.916376
4,SGD Classifier,0.758444,0.740741,0.810811,0.926829


In [66]:
# Get the tfidf representation of train and test sets without manual features
text_train, _ = get_text_m_feats(final_X_train, new_token_dict, False)

tfidf_vectorizer_wo_mf = get_tfidf_vectorizer(text_train)
final_X_train_tfidf = tfidf_vectorizer_wo_mf.transform(text_train)

final_X_test_tfidf = transform_to_tfidf_matrix(new_token_dict, tfidf_vectorizer_wo_mf, None, X_test, False)

In [67]:
results = fit_and_evaluate_all(final_X_train_tfidf, final_y_train, final_X_test_tfidf, y_test, model_names, models)
results

Unnamed: 0,model_name,average_precision,f1,recall,accuracy
0,Naive Bayes,0.6565,0.625,0.810811,0.874564
1,Linear SVC,0.698138,0.731707,0.810811,0.923345
2,RandomForestClassifier,0.790456,0.758621,0.891892,0.926829
3,Logistic Regression,0.674859,0.712644,0.837838,0.912892
4,SGD Classifier,0.730498,0.708861,0.756757,0.919861


It looks like we got a much better result using this type of data augmentation. Here we have obtained the best result by far with a RandomForestClassifier using the manual features as well.

I will see if hypertuning it will improve this result.

In [71]:
# Get dataset with manual features
text_train, m_feats_train = get_text_m_feats(final_X_train, new_token_dict, True)

tfidf_vectorizer_with_mf = get_tfidf_vectorizer(text_train)
tfidf_train = tfidf_vectorizer_with_mf.transform(text_train)

scaler_with_mf = get_scaler(m_feats_train)
scaled_m_feats_train = pd.DataFrame(scaler_with_mf.transform(m_feats_train), index = m_feats_train.index.values)
final_X_train_tfidf = hstack((tfidf_train, csr_matrix(scaled_m_feats_train)))

final_X_test_tfidf = transform_to_tfidf_matrix(
    new_token_dict, tfidf_vectorizer_with_mf, scaler_with_mf, X_test)

In [69]:
%%time
# Define parameter grid for svc
param_grid = [{'n_estimators' : [50, 100, 200],
              'criterion': ['gini', 'entropy', 'log_loss'],
              'max_depth': [None, 2, 6],
              'min_samples_split': [2, 4, 10]}]

# Define the grid search
grid_search = GridSearchCV(RandomForestClassifier(random_state=42, n_jobs=-1), param_grid=param_grid, 
                           scoring='average_precision', cv=StratifiedKFold(n_splits=5), n_jobs=-1)

# Do the actual searching
grid_search.fit(final_X_train_tfidf, final_y_train)

# Output the results
pd.DataFrame(grid_search.cv_results_)[['param_n_estimators', 'param_criterion', 'param_max_depth', 
                                       'param_min_samples_split','mean_test_score']].sort_values(
    by='mean_test_score', ascending=False).head(7)

CPU times: total: 3.98 s
Wall time: 1min 51s


Unnamed: 0,param_n_estimators,param_criterion,param_max_depth,param_min_samples_split,mean_test_score
2,200,gini,,2,0.981887
1,100,gini,,2,0.981221
29,200,entropy,,2,0.979862
56,200,log_loss,,2,0.979862
5,200,gini,,4,0.979398
4,100,gini,,4,0.978286
28,100,entropy,,2,0.978226


In [72]:
# Check top 5 results against test class

fit_and_evaluate_all(final_X_train_tfidf, final_y_train, final_X_test_tfidf, y_test,
                     ['gini-200', 'entropy-200', 'log_loss-200', 'gini-100-4', 'gini-100'], 
                     [RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, criterion='gini'),
                     RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, criterion='entropy'),
                     RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, criterion='log_loss'),
                     RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, criterion='gini'),
                     RandomForestClassifier(n_estimators=200, min_samples_split=4, random_state=42, n_jobs=-1, criterion='gini')])

Unnamed: 0,model_name,average_precision,f1,recall,accuracy
0,gini-200,0.80169,0.771084,0.864865,0.933798
1,entropy-200,0.806606,0.780488,0.864865,0.937282
2,log_loss-200,0.806606,0.780488,0.864865,0.937282
3,gini-100-4,0.802272,0.785714,0.891892,0.937282
4,gini-100,0.808916,0.746988,0.837838,0.926829


We can see that hypertuning improved the average_precision accuracy from 0.797 to 0.802 and the recall score from 0.864 to 0.891. We will store the scaler, vectorizers and model having the best parameters.

In [73]:
# We have the vectorizer already, but we need to create an RandomForestClassifier model object and fit it
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, criterion='entropy')
rf.fit(final_X_train_tfidf, final_y_train)

In [74]:
# Save tfidf-vectorizer to file
with open("../data/modeling/tfidf_2.pkl", "wb") as tfidf_file:
    dump(tfidf_vectorizer_with_mf, tfidf_file)
    tfidf_file.close()

In [75]:
# Save tfidf-vectorizer to file
with open("../data/modeling/scaler_2.pkl", "wb") as scaler_file:
    dump(scaler_with_mf, scaler_file)
    scaler_file.close()

In [76]:
# Save the model to file
with open("../data/modeling/model_2_with_mf.pkl", "wb") as model_file:
    dump(rf, model_file)
    model_file.close()