In [26]:
# Import data handling libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import numpy as np
from scipy.sparse import csr_matrix, hstack

# Import functions from local scripts
import sys
sys.path.insert(1, './scripts/development')
import scripts.development.preprocessing as pre
import scripts.development.experiment as ex

# Import modeling libraries
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Import metrics functions
from sklearn.metrics import classification_report, f1_score, precision_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Import function libraries
from functools import partial

# Import balancing libraries
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, AllKNN
from imblearn.combine import SMOTEENN, SMOTETomek

# Import I/O libraries
from pickle import dump

In [37]:
tickets_df = pd.read_csv("../data/labeled_tickets.csv", index_col=0)
tickets_df.head()

Unnamed: 0,Description,is_about_order_status
0,"- __EMAIL__ Hi , I have just ordered a pair of...",0
1,I am missing a pair of shoes from my order. Co...,0
2,I didn'tget a my order - __EMAIL__,1
3,"Hello, I ordered two __PRODUCTS_NAMES and one ...",0
4,My shipment never was delivered. The tracking ...,1


In [38]:
# Rename column name from Description to text.
# I will keep this consistent throughout the functions
tickets_df = tickets_df.rename(columns={'Description': 'text'})
tickets_df.head(1)

Unnamed: 0,text,is_about_order_status
0,"- __EMAIL__ Hi , I have just ordered a pair of...",0


In [39]:
# Ensure that dtype is string
tickets_df['text'] = tickets_df['text'].astype('string')

In [40]:
# Translate all text into English
tickets_df['text'] = tickets_df['text'].apply(lambda x: pre.translate_to_en(x))

In [41]:
# Check how imbalanced the dataset is
tickets_df["is_about_order_status"].value_counts()

0    1249
1     184
Name: is_about_order_status, dtype: int64

### What is next?

We can see that our dataset is very imbalanced. We will deal with that later. For now, I just want to extract some samples for testing and the remaining files for training. I will go with a 20% split.

I will also define the preprocessing functions for the raw text in the python script files and test them accordingly.

In [66]:
X_train, X_test, y_train, y_test = train_test_split(tickets_df['text'], tickets_df["is_about_order_status"],
                                                    stratify=tickets_df["is_about_order_status"],
                                                    test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1146,) (287,) (1146,) (287,)


In [67]:
# Convert y_train and y_test to int pandas.Series
y_train = y_train.astype('int8')
y_test = y_test.astype('int8')

### Raw text processing

1. I will want to check if a text is in a language different than English. If it is, I will translate it into English.
2. I will measure text length and store it as a feature.
3. I will measure average word length and store it as a feature.
4. I will measure number of words and store it as a feature.
4. I will use a library to reduce spelling errors.
5. I will use a library to expand words like can't to can not.
6. I will measure the numeric counts of each message (the count of number values present in each message)
7. I will convert all text to lowercase
8. I will get a set of all tokens in the texts (like __email__ and __company__ etc.) and I will create a column for each in which I will count their occurences. I will then remove these tokens from each text. The set of tokens will be stored in the script file to be used with the general processing function. It will be easy to later find new tokens and add them to the set.
9. I will count number of emails in each message and remove them.
10. I will count and remove any url from the text.
11. I will measure the number of stopwords and store it as a feature.
12. I will remove stopwords, single characters and special characters from text.
13. I will stem the remaining words.

All these steps will be wrapped into a function that preprocesses raw text. I will also create a function which uses the former function for a pandas series, so I can easily use it when testing ways to balance the dataset.

In [68]:
# Because ticket tokens are present in the raw text, we can extract here the token set and see if we spot anything wrong.
token_set = pre.get_tokens(X_train)
token_set

{'_ADDRESS_',
 '_COMPANY_',
 '_COMPANY__',
 '_COMPANY___',
 '_CREDIT_',
 '_DATE_',
 '_DATE__PRODUCT_',
 '_INVOICE_NUMBER_',
 '_ITEM_PHOTO_',
 '_LOCATION_',
 '_MONTH_',
 '_NAME_',
 '_NAME__ADRRESS_',
 '_NUMBER__',
 '_ORDER_NUMBBER_',
 '_ORDER_NUMBER_',
 '_OTHER_PI_',
 '_PHONE_',
 '_PRICE_',
 '_PRODUCTS_',
 '_PRODUCTS_NAMES_',
 '_PRODUCT_',
 '_PRODUCT_NAME_',
 '_TRACKING_NUMBER_',
 '_URL_',
 '__ADDRESS__',
 '__AMOUNT__',
 '__COMPANY_NAME__',
 '__COMPANY__',
 '__CREDIT_CARD__',
 '__DATE__',
 '__DATE____',
 '__DISCOUNT_CODE__',
 '__EMAIL__',
 '__INVOICE_NUMBER__',
 '__MAIN__',
 '__NAMES__',
 '__NAME__',
 '__NAME____',
 '__ORDER_NUMBER__',
 '__OTHER_PI__',
 '__PHONE__',
 '__PLACE__',
 '__PLACE____',
 '__PRODUCTS_NAMES__',
 '__PRODUCT_NAMES__',
 '__PRODUCT_NAME__',
 '__PRODUCT__NAMES__',
 '__PRODUCT__NAME__',
 '__REFERENCE_NUMBER__',
 '__TRACKING_NUMBER__',
 '__URL__'}

We can spot some mistakes from our regex which we will eliminate from the set:
\_COMPANY___, \_COMPANY__ \_NAME__ADRRESS_, \_NUMBER__, \__DATE____, \_DATE__PRODUCT_, \__NAME____, \__PLACE____

In order to extract features from these tokens it is best to define a dictionary where the key is the column and the value is a list of tokens corresponding to the column. For each token in the list, we will increase the corresponding column value by 1.
Example: column value is product_name and the token list is \[\_product_name_, \_product_names_]. Then for each token of value \_product_name_ or \_product_names_, we will increase the product_name column value by 1.

We will store the final dictionary into a file in order to be used later.

In [69]:
# Remove incorrect tokens
token_set.difference({"_COMPANY__", "_COMPANY___", "_NAME__ADRRESS_", "_NUMBER__", "__DATE____",
                      "\_DATE__PRODUCT_", "\__NAME____", "\__PLACE____"})

# Define dictionary
token_dictionary = {"address_count": ['_ADDRESS_', '__ADDRESS__', '_LOCATION_', '__PLACE__'],
                    "company_name_count": ['_COMPANNY_', '_COMPANY_', '__COMPANY_NAME__', '__COMPANY__'],
                    "credit_card_count": ['_CREDIT_', '__CREDIT_CARD__'],
                    "date_count": ['_DATE_', '_MONTH_', '__DATE__'],
                    "invoice_count": ['_INVOICE_NUMBER_', '__INVOICE_NUMBER__'],
                    "photo_count": ['_ITEM_PHOTO_'],
                    "name_count": ['_NAME_', '__NAMES__', '__NAME__'],
                    "order_count": ['_ORDER_NUMBBER_', '_ORDER_NUMBER_', '__ORDER_NUMBER__'],
                    "other_pi_count": ['_OTHER_PI_', '__OTHER_PI__'],
                    "phone_count": ['_PHONE_', '__PHONE__'],
                    "price_count": ['_PRICE_', '__AMOUNT__'],
                    "product_count": ['_PRODUCT_', '_PRODUCT_NAME_', '__PRODUCTS_NAMES__', '__PRODUCT_NAMES__', 
                                      '__PRODUCT_NAME__', '__PRODUCT__NAMES__', '__PRODUCT__NAME__'],
                    "tracking_number_count": ['_TRACKING_NUMBER_', '__TRACKING_NUMBER__'],
                    "url_count": ['_URL_', '__URL__'],
                    "discount_code_count": ['__DISCOUNT_CODE__'],
                    "email_count": ['__EMAIL__'],
                    "reference_number_count": ['__REFERENCE_NUMBER__']
                   }

# Save token dictionary to file
with open("../data/token_dictionary.json", "w") as file:
    json.dump(token_dictionary, file)
    file.close()

## Getting a first result

We can now use the preprocessing functions defined in the python scripts to get a first result on some classifiers.
I will use the tf-idf vectorizer from sklearn to achieve trainable data

In [10]:
def obtain_tfidf_matrices(token_dictionary, token_list, train, test, with_manual_features=True):
    """
    Function which takes train and test dataframes containing text, preprocesses them and converts them into sparse
    matrices through a tf-idf vectorizer
    :param token_dictionary: dictionary of tokens, where the key is the general meaning of the token and what we will
    use for column naming and the value is a list of actual values found in the text.
    :param token_list: The list of all token values.
    :param train: The train dataset. It must be a pandas.Series containing rows of text.
    :param test: The test dataset. It must be a pandas.Series containing rows of text.
    :param with_manual_features: Flag. If set to False, do not append manual features to the resulting DataFrame.
    Append them if set to True. Default is True.
    :return: two sparse matrices, the first one being the representation of the train dataset, while the second is the
    representation of the test dataset
    """
    p_train = pre.preprocess_text_series(train, token_dictionary, token_list, with_manual_features)
    m_feats_train = p_train.drop(columns='text')
    text_train = p_train['text']
    
    p_test = pre.preprocess_text_series(test, token_dictionary, token_list, with_manual_features)
    m_feats_test = p_test.drop(columns='text')
    text_test = p_test['text']
    
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_train = tfidf_vectorizer.fit_transform(text_train)
    tfidf_test = tfidf_vectorizer.transform(text_test)
    
    if with_manual_features:
        scaler = MinMaxScaler()
        scaled_m_feats_train = pd.DataFrame(scaler.fit_transform(m_feats_train), index = m_feats_train.index.values)
        scaled_m_feats_test = pd.DataFrame(scaler.transform(m_feats_test), index = m_feats_test.index.values)
        tfidf_train = hstack((tfidf_train, csr_matrix(scaled_m_feats_train)))
        tfidf_test = hstack((tfidf_test, csr_matrix(scaled_m_feats_test)))
    
    return tfidf_train, tfidf_test

In [11]:
# We can use partial so we can only call this function with parameters that actually change
obtain_tfidf = partial(obtain_tfidf_matrices, token_dictionary, token_set, X_train, X_test)

In [64]:
def evaluate_model(train, test, model, model_name, target_train=y_train):
    """
    Function which trains a model on a train test and builds a dataframe containing a list of metrics based on testing
    the model on the test data
    :param train: The train dataset. It must be a pandas.Series containing rows of text.
    :param test: The test dataset. It must be a pandas.Series containing rows of text.
    :param model: The scikit-learn model that will be trained on the train data.
    :param model_name: A string representing the name of the model to be trained and evaluated.
    :param with_manual_features: Flag. If set to False, do not append manual features to the resulting DataFrame.
    Append them if set to True. Default is True.
    :return: a list of 6 items representing: 
    [model name, accuracy score, balanced accuracy score, f1 score of the negative class, f1 score of the positive class, 
    precision score of the positive class, roc-auc score of the model, precision-recall-auc score of the model]
    """
    model.fit(train, target_train)
    predicted = model.predict(test)

    acc = accuracy_score(y_test, predicted)
    balanced_acc = balanced_accuracy_score(y_test, predicted)
    f1_sc = f1_score(y_test, predicted, average=None)
    roc_auc_sc = roc_auc_score(y_test, model.predict_proba(test)[:, 1])
    ap_auc = average_precision_score(y_test, model.predict_proba(test)[:, 1])
    precision_of_positive = precision_score(y_test, predicted, zero_division=0)
    return [model_name, acc, balanced_acc, f1_sc[0], f1_sc[1], precision_of_positive, roc_auc_sc, ap_auc]

In [13]:
models = [MultinomialNB(), SVC(random_state=42, class_weight='balanced', probability=True, kernel='linear'),
          RandomForestClassifier(n_estimators=150, random_state=42), LogisticRegression(random_state=42), 
          SGDClassifier(loss='log_loss', random_state=42)]
model_names = ['Naive Bayes', 'Linear SVC', 'RandomForestClassifier', ' Logistic Regression', 'SGD Classifier']

In [14]:
def evaluate_on_all_models(train, test, target_train=y_train):
    """
    Function which trains a model on a train test and builds a dataframe containing a list of metrics based on testing
    the model on the test data
    :param train: The train dataset. It must be a pandas.Series containing rows of text.
    :param test: The test dataset. It must be a pandas.Series containing rows of text.
    :param target_train: An iterable or pd.Series representing the target values of the train set. 
    This parameter is needed when resampling. Default: initial defined y_train from train-test splitting.
    :return: a pd.Dataframe having scores defined at the function evaluate_model for all the models defined at models
    and their names defined at model_names.
    """
    scores = pd.DataFrame(
        columns=["model_name", "accuracy", "balanced_accuracy", "f1_score_negative",
                 "f1_score_positive", "precision_positive", "roc_auc_score", "ap_auc_score"])
    for model, model_name in list(zip(models, model_names)):
        row = evaluate_model(train, test, model, model_name, target_train)
        scores.loc[len(scores)] = row
    
    return scores

In [15]:
# Define list of sampling methods
sampling_method_names = ['oversampling_SMOTE', 'oversampling_ADASYN', 'oversampling_random',
                     'undersampling_Cluster_Centroids', 'undersampling_All_KNN', 'undersampling_random',
                     'mixed_SMOTEEN', 'mixed_SMOTETomek']
sampling_methods = [SMOTE(random_state=42), ADASYN(random_state=42), RandomOverSampler(random_state=42, sampling_strategy='minority'),
                 ClusterCentroids(random_state=42), AllKNN(), RandomUnderSampler(sampling_strategy='majority' ,random_state=42),
                 SMOTEENN(random_state=42), SMOTETomek(random_state=42)]

In [16]:
def evaluate_balancing(sampling_names, samplings, with_manual_features=True):
    """
    Function which returns a DataFrame containing the scores for all of the models on all of the sampling methods.
    :param sampling_names: A list containing the names of the sampling methods.
    :param samplings: A list containing sampling instances, in the same order as sampling_names so that 
    they correspond to each other
    :param with_manual_features: Whether to add manually defined features to the extracted tf-idf data extracted 
    at preprocessing phase.
    """
    train, test = obtain_tfidf(with_manual_features)
    results_df = evaluate_on_all_models(train, test)
    row_length_per_loop = results_df.shape[0]
    sampling_name_list = ['None'] * row_length_per_loop
    
    for sampling_name, sampling in list(zip(sampling_names, samplings)):
        X_resampled, y_resampled = sampling.fit_resample(train, y_train)
        temp_results_df = evaluate_on_all_models(X_resampled, test, y_resampled)
        sampling_name_list.extend([sampling_name] * row_length_per_loop)
        results_df = pd.concat([results_df, temp_results_df], ignore_index=True)
    
    return pd.concat([results_df, pd.Series(sampling_name_list, name='sampling_method')], axis=1)

In [17]:
%%time
with_manual_df = evaluate_balancing(sampling_method_names, sampling_methods, True)
with_manual_df.sort_values(by=['ap_auc_score', 'f1_score_positive', 'precision_positive'], ascending=False)

CPU times: total: 32 s
Wall time: 29.2 s


Unnamed: 0,model_name,accuracy,balanced_accuracy,f1_score_negative,f1_score_positive,precision_positive,roc_auc_score,ap_auc_score,sampling_method
24,SGD Classifier,0.909408,0.913459,0.945833,0.723404,0.596491,0.968324,0.853625,undersampling_Cluster_Centroids
21,Linear SVC,0.919861,0.896432,0.952772,0.735632,0.64,0.971243,0.849615,undersampling_Cluster_Centroids
4,SGD Classifier,0.919861,0.72373,0.955513,0.596491,0.85,0.966811,0.824016,
9,SGD Classifier,0.926829,0.842865,0.957916,0.72,0.710526,0.963459,0.822563,oversampling_SMOTE
44,SGD Classifier,0.926829,0.842865,0.957916,0.72,0.710526,0.963459,0.822563,mixed_SMOTETomek
14,SGD Classifier,0.933798,0.846865,0.962076,0.739726,0.75,0.964973,0.822131,oversampling_ADASYN
22,RandomForestClassifier,0.902439,0.897946,0.941667,0.702128,0.578947,0.971676,0.821959,undersampling_Cluster_Centroids
7,RandomForestClassifier,0.923345,0.76027,0.957031,0.645161,0.8,0.966054,0.818,oversampling_SMOTE
42,RandomForestClassifier,0.923345,0.76027,0.957031,0.645161,0.8,0.966054,0.818,mixed_SMOTETomek
13,Logistic Regression,0.933798,0.881405,0.961616,0.759494,0.714286,0.965297,0.816872,oversampling_ADASYN


In [None]:
%%time
wo_manual_df = evaluate_balancing(sampling_method_names, sampling_methods, False)
wo_manual_df.sort_values(by=['ap_auc_score', 'f1_score_positive', 'precision_positive'], ascending=False)

## Evaluating the models

I evaluated my models multiple times using train test randomization. Most of the models do not achieve a good f1 score for the imbalanced class, even after using undersampling, oversampling or a mix of the two.
There was however a particular data split that got us an f1 score of the imbalanced data set of over 70% for most of the models. I will save that model and test it in another notebook to see if it works well.

What I want to achieve with this model is to accurately classify a message as "where-is-my-order" with great precision. Therefore, if two models have similar f-scores for the positive target variable, I would rather choose the one with higher precision.

Since the metrics were very different for all the random train test splits, it makes sense to try another for of balancing: translating training text into other languages and then back to English to artifically create more examples in the minority class.

In [18]:
# Firstly, I want to save the scaler, vectorizer and model that had the best results, in order to test it on real data.
def get_scaler_and_vectorizer(token_dictionary, token_list, train):
    """
    Function which takes train and test dataframes containing text, preprocesses them and returns the scaler and vectorizer
    used on the training set.
    :param token_dictionary: dictionary of tokens, where the key is the general meaning of the token and what we will
    use for column naming and the value is a list of actual values found in the text.
    :param token_list: The list of all token values.
    :param train: The train dataset. It must be a pandas.Series containing rows of text.
    :return: a MinMaxScaler instance and a TfidfVectorizer instance, in this order
    """
    p_train = pre.preprocess_text_series(train, token_dictionary, token_list)
    m_feats_train = p_train.drop(columns='text')
    text_train = p_train['text']
    
    tfidf_vectorizer = TfidfVectorizer(decode_error='ignore')
    tfidf_vectorizer.fit(text_train)
    
    scaler = MinMaxScaler()
    scaler.fit(m_feats_train)
    
    return scaler, tfidf_vectorizer

In [19]:
# Obtain scaler and vectorizer
scaler, tfidf_vectorizer = get_scaler_and_vectorizer(token_dictionary, token_set, X_train)

In [20]:
# Save them to files
with open("../data/modeling/scaler_1.pkl", "wb") as scaler_file:
    dump(scaler, scaler_file)
    scaler_file.close()
    
with open("../data/modeling/tfidf_1.pkl", "wb") as tfidf_file:
    dump(tfidf_vectorizer, tfidf_file)
    tfidf_file.close()

In [24]:
# Fit the best model to data and save it to file
train, test = obtain_tfidf()

X_resampled, y_resampled = ClusterCentroids(random_state=42).fit_resample(train, y_train)

sgd = SGDClassifier(random_state=42, loss='log_loss')
sgd.fit(X_resampled, y_resampled)

with open("../data/modeling/sgd_1.pkl", "wb") as model_file:
    dump(sgd, model_file)
    model_file.close()

## Experiment

What I would like to try is to artificially enhance my data set by translating the imbalanced data class into another language, then back to English.

As we have seen above, we have 1249 negative class counts and 184 positive class counts. That means that the majority class is 6.8 times bigger. In order to counterbalance this, we need to generate 6 times the data that we have now for the minority class. Or, in simpler terms, we need to translate our messages in 6 different languages, then back to English.

For this, I will create a function in a python script and call it in this notebook.

In [70]:
# Get the indexes for all the positive examples in the training set
positive_indexes = y_train.loc[y_train == 1].index.values

# Get the actual psoitive examples in the training set
X_train_pos = X_train[positive_indexes].astype('string')
X_train_pos.head(1)

98    Hi, 
 

 I still haven't received my trainers ...
Name: text, dtype: string

In [71]:
%%time
# Augment the data
aug_X_train = ex.enhance_series(X_train_pos)
aug_X_train.head()

CPU times: total: 18.3 s
Wall time: 3min 49s


0    Hi,\n \n\n I still haven't received my sneaker...
1    Hi,\n \n\n I still haven't received my trainer...
2    Hi,\n \n\n I still haven't received my trainer...
3    Hi,\n \n\n I still haven't received my sneaker...
4    Hey,\n \n\n I still haven't received my traine...
dtype: object

In [56]:
# Create index to add the data correctly; Last index from the dataset is 1432, so we'll start at 1433 and end at 1443 + 882
index_values = [x for x in range(1433, 1433+882)]
len(index_values)

882

In [72]:
# Index the augmented text series
aug_X_train.index = pd.Index(index_values)
aug_X_train.head()

1433    Hi,\n \n\n I still haven't received my sneaker...
1434    Hi,\n \n\n I still haven't received my trainer...
1435    Hi,\n \n\n I still haven't received my trainer...
1436    Hi,\n \n\n I still haven't received my sneaker...
1437    Hey,\n \n\n I still haven't received my traine...
dtype: object

In [73]:
# Create indexed target series
aug_y_train = pd.Series([1]*len(index_values), index=index_values)
aug_y_train.head()

1433    1
1434    1
1435    1
1436    1
1437    1
dtype: int64

In [74]:
# Now mix them all together to form a final training df
final_X_train = pd.concat([X_train, aug_X_train])
final_y_train = pd.concat([y_train, aug_y_train])
print(final_X_train.shape, final_y_train.shape)

(2028,) (2028,)


In [75]:
# Check value counts to see if it's balanced
final_y_train.value_counts()

1    1029
0     999
dtype: int64

In [76]:
# get the tfidf representation of train and test sets
final_X_train_tfidf, final_X_test_tfidf = obtain_tfidf_matrices(token_dictionary, token_set, final_X_train, X_test)

In [77]:
results = evaluate_on_all_models(final_X_train_tfidf, final_X_test_tfidf, final_y_train)
results

Unnamed: 0,model_name,accuracy,balanced_accuracy,f1_score_negative,f1_score_positive,precision_positive,roc_auc_score,ap_auc_score
0,Naive Bayes,0.84669,0.831405,0.906383,0.576923,0.447761,0.910703,0.557154
1,Linear SVC,0.885017,0.830378,0.931959,0.629213,0.538462,0.902378,0.650329
2,RandomForestClassifier,0.89547,0.847892,0.938272,0.659091,0.568627,0.92173,0.631479
3,Logistic Regression,0.87108,0.845405,0.922432,0.618557,0.5,0.907135,0.574426
4,SGD Classifier,0.885017,0.818865,0.932238,0.62069,0.54,0.907243,0.614693
