In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from statistics import mean
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler #fixed import
from sklearn import svm

import nltk
import re
from langdetect import detect
from scipy.sparse import hstack
import gensim
from gensim.models import Word2Vec, KeyedVectors
from nltk import word_tokenize

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np
np.random.seed(42)  # Set random seed for numpy

import random
random.seed(42)  # Set random seed for random module

In [2]:
df = pd.read_csv("D:\\MS DATA SCIENCE\\NLP TESE\\data\\data_processed_selected.csv")

In [34]:
# #convert reason to numbers
# reason_dict ={'Mau Serviço Prestado': 0, 'Condições de entrega': 1, 'Atraso de entrega': 2, 
#                'Enganos': 3}
# df['reason'].replace(reason_dict, inplace=True)
# df['reason'].unique()

In [35]:
y=df['reason']
X = df[['narrative_tfidf', 'tfidf_title', 'tfidf_keywords', 'events_tfidf', 'entities']]

In [36]:
# Split the data into a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Print the shape of each set
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (15701, 5)
y_train shape: (15701,)
X_test shape: (6729, 5)
y_test shape: (6729,)


# 1. BOW WITH NAIVE BAYES

In [37]:
def applyclf(X_train, y_train, vectorizer, model, feature_set):
    X_train_selected = X_train[feature_set].apply(lambda x: ' '.join(x.fillna('').astype(str)), axis=1)
    
    # Create a CountVectorizer and Multinomial Naive Bayes pipeline
    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', model)
    ])

    # Define cross-validation strategy (e.g., StratifiedKFold with 5 folds)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Lists to store predictions and true labels
    all_true_labels = []
    all_predicted_labels = []

    # Perform cross-validation and accumulate the confusion matrix
    for train_index, val_index in kf.split(X_train_selected, y_train):
        X_train_fold, X_val_fold = X_train_selected.iloc[train_index], X_train_selected.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # Fit the model on the training fold
        pipeline.fit(X_train_fold, y_train_fold)

        # Predict on the validation fold
        y_pred_val = pipeline.predict(X_val_fold)
        
        # Append true labels and predicted labels
        all_true_labels.extend(y_val_fold)
        all_predicted_labels.extend(y_pred_val)
        
    # Get the number of features from the vectorizer
    num_features = len(vectorizer.get_feature_names_out())
    # Print the number of features, confusion matrix, and classification report
    print(f"Number of Features: {num_features}")
            
    # Generate an overall confusion matrix
    confusion_mat = confusion_matrix(all_true_labels, all_predicted_labels, labels=[0,1,2,3])
    
    # Calculate the classification report
    class_report = classification_report(all_true_labels, all_predicted_labels, zero_division=1)

    # Print the confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_mat)

    print("\nClassification Report:")
    print(class_report)

In [46]:
feature_set=['narrative_tfidf']
applyclf(X_train, y_train, CountVectorizer(), MultinomialNB(), feature_set)

Number of Features: 18785
Confusion Matrix:
[[3461  151 1901   63]
 [ 944  115  462   32]
 [1270   23 6187   30]
 [ 649   24  292   97]]

Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.62      0.58      5576
           1       0.37      0.07      0.12      1553
           2       0.70      0.82      0.76      7510
           3       0.44      0.09      0.15      1062

    accuracy                           0.63     15701
   macro avg       0.51      0.40      0.40     15701
weighted avg       0.59      0.63      0.59     15701



# 2. TF-IDF WITH DIFFERENT ML MODELS

"The choice of n-gram size for the title feature depends on the specific characteristics of your text data and the requirements of your classification task. However, since the title feature is typically shorter and more concise than the description feature, you might consider using a smaller n-gram range, such as unigrams (ngram_size=(1,1)) or bigrams (ngram_size=(2,2)), to capture the most important keywords or phrases in the title.

Using a smaller n-gram range can help prevent overfitting and reduce the dimensionality of the feature space, while still capturing the key information in the title. However, it's also important to experiment with different n-gram ranges and compare their performance using cross-validation or hold-out validation to determine the most effective approach for your specific application.

Additionally, you might also consider using different TF-IDF configurations for the title feature compared to the description feature, such as a different minimum document frequency threshold or weighting scheme, to better capture the unique characteristics of each feature type."  

max_df=0.9 or 0.95 because the texts are short and have a similar structure,  
min_df=1 or 2 just to keep rare words but remove typos, eg,  
bigrams and trigrams because texts are short, using unigrams alone may not capture enough information?

### best tfidf configuration?

In [9]:
classifiers = [KNeighborsClassifier(),
               DecisionTreeClassifier(random_state=42), 
               RandomForestClassifier(random_state=42), 
               GradientBoostingClassifier(random_state=42),
               XGBClassifier(random_state=42), 
               MultinomialNB(),
               LinearSVC()]

In [10]:
X_train_selected = X_train[['narrative_tfidf']].apply(lambda x: ' '.join(x.fillna('').astype(str)), axis=1)

# Create a parameter grid for TF-IDF configurations
param_grid = {
    'vectorizer__ngram_range': [(1, 2), (1, 3)],
    'vectorizer__max_df': [0.7, 0.8, 0.9, 0.95],
    'vectorizer__min_df': [0.01, 0.02, 0.03]
}

# Create an empty dictionary to store the best configurations and their scores for each classifier
best_configs = {}

# Loop through each classifier
for clf in classifiers:
    # Create a pipeline for the current classifier
    pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('classifier', clf)
    ])

    # Perform grid search
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(pipeline, param_grid, cv=kf, n_jobs=-1, scoring='f1_macro')
    grid_search.fit(X_train_selected, y_train)

    # Get the best parameters and the associated accuracy
    best_params = grid_search.best_params_
    best_f1 = grid_search.best_score_

    # Store the best configuration and accuracy for this classifier
    best_configs[clf.__class__.__name__] = (best_params, best_f1)

# Print the best configurations and their f1 for each classifier
for clf_name, (best_params, best_f1) in best_configs.items():
    print(f"Best Configuration for {clf_name}:")
    print(best_params)
    print(f"Best F1-score: {best_f1}")
    print()

Best Configuration for KNeighborsClassifier:
{'vectorizer__max_df': 0.7, 'vectorizer__min_df': 0.01, 'vectorizer__ngram_range': (1, 2)}
Best F1-score: 0.3866297429328453

Best Configuration for DecisionTreeClassifier:
{'vectorizer__max_df': 0.7, 'vectorizer__min_df': 0.02, 'vectorizer__ngram_range': (1, 2)}
Best F1-score: 0.3565817113615028

Best Configuration for RandomForestClassifier:
{'vectorizer__max_df': 0.7, 'vectorizer__min_df': 0.02, 'vectorizer__ngram_range': (1, 3)}
Best F1-score: 0.3562689961823028

Best Configuration for GradientBoostingClassifier:
{'vectorizer__max_df': 0.8, 'vectorizer__min_df': 0.01, 'vectorizer__ngram_range': (1, 2)}
Best F1-score: 0.4095637717573499

Best Configuration for XGBClassifier:
{'vectorizer__max_df': 0.8, 'vectorizer__min_df': 0.01, 'vectorizer__ngram_range': (1, 2)}
Best F1-score: 0.42781269338094674

Best Configuration for MultinomialNB:
{'vectorizer__max_df': 0.8, 'vectorizer__min_df': 0.01, 'vectorizer__ngram_range': (1, 3)}
Best F1-scor

best results is with linear svc and xgboost both with {'vectorizer__max_df': 0.8, 'vectorizer__min_df': 0.01, 'vectorizer__ngram_range': (1, 2)}, so I'm going to choose this config.

## description

In [31]:
feature_set=['narrative_tfidf']
config= {'ngram_range': (1,2), 'max_df':0.80, 'min_df': 0.01}
for clf in classifiers:
    print(clf.__class__.__name__)
    applyclf(X_train, y_train, TfidfVectorizer(**config), clf, feature_set)

MultinomialNB
Number of Features: 1191
Confusion Matrix:
[[3428    2 2137    9]
 [1003    0  547    3]
 [1080    1 6428    1]
 [ 662    0  376   24]]

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.61      0.58      5576
           1       0.00      0.00      0.00      1553
           2       0.68      0.86      0.76      7510
           3       0.65      0.02      0.04      1062

    accuracy                           0.63     15701
   macro avg       0.47      0.37      0.35     15701
weighted avg       0.57      0.63      0.57     15701

KNeighborsClassifier
Number of Features: 1191
Confusion Matrix:
[[3296  304 1835  141]
 [ 901  169  431   52]
 [1979  189 5286   56]
 [ 588   81  268  125]]

Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.59      0.53      5576
           1       0.23      0.11      0.15      1553
           2       0.68      0.70      0.69    

## description + title

In [32]:
feature_set=['narrative_tfidf','tfidf_title']
config= {'ngram_range': (1,2), 'max_df':0.80, 'min_df': 0.01}
for clf in classifiers:
    print(clf.__class__.__name__)
    applyclf(X_train, y_train, TfidfVectorizer(**config), clf, feature_set)

MultinomialNB
Number of Features: 1240
Confusion Matrix:
[[3512    1 2043   20]
 [1028    0  515   10]
 [1040    2 6462    6]
 [ 678    0  321   63]]

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.63      0.59      5576
           1       0.00      0.00      0.00      1553
           2       0.69      0.86      0.77      7510
           3       0.64      0.06      0.11      1062

    accuracy                           0.64     15701
   macro avg       0.47      0.39      0.37     15701
weighted avg       0.57      0.64      0.58     15701

KNeighborsClassifier
Number of Features: 1240
Confusion Matrix:
[[3308  293 1800  175]
 [ 902  189  414   48]
 [1773  186 5492   59]
 [ 579   80  222  181]]

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.59      0.55      5576
           1       0.25      0.12      0.16      1553
           2       0.69      0.73      0.71    

will other kernels perform better?

In [40]:
feature_set=['narrative_tfidf','tfidf_title']
config= {'ngram_range': (1,2), 'max_df':0.80, 'min_df': 0.01}
classifiers = [SVC(kernel='rbf'),SVC(kernel='poly')]
for clf in classifiers:
    print(clf.__class__.__name__)
    applyclf(X_train, y_train, TfidfVectorizer(**config), clf, feature_set)

SVC
Number of Features: 1240
Confusion Matrix:
[[3744    8 1756   68]
 [1067   10  450   26]
 [1087    4 6401   18]
 [ 696    0  219  147]]

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.67      0.62      5576
           1       0.45      0.01      0.01      1553
           2       0.73      0.85      0.78      7510
           3       0.57      0.14      0.22      1062

    accuracy                           0.66     15701
   macro avg       0.58      0.42      0.41     15701
weighted avg       0.63      0.66      0.61     15701

SVC
Number of Features: 1240
Confusion Matrix:
[[3478    8 2056   34]
 [ 999   16  526   12]
 [ 891    5 6607    7]
 [ 694    0  285   83]]

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.62      0.60      5576
           1       0.55      0.01      0.02      1553
           2       0.70      0.88      0.78      7510
           3       0

# 3. EMBEDDINGS WITH DIFFERENT ML MODELS

In [11]:
y=df['reason']
X = df[['narrative_embeddings', 'embeddings_title']]

In [12]:
# Split the data into a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Print the shape of each set
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (15701, 2)
y_train shape: (15701,)
X_test shape: (6729, 2)
y_test shape: (6729,)


In [13]:
def embeddings_transformation(X, model):
    '''returns the mean of the embeddings of the document'''
    # initiating a sentence with all zeros
    embedding_size = 600  
    X_transformed = np.zeros((len(X), embedding_size))
   
    # Loop over each string in X
    for i, sentence in enumerate(X):
        # Loop over each word in the sentence and, if it is in the model's vocabulary, add its feature vector to the total
        embeddings = [model[word] for word in sentence.split() if word in model]
        if embeddings:
            X_transformed[i] = np.mean(embeddings, axis=0)
    return X_transformed

In [14]:
isinstance(MultinomialNB(), MultinomialNB)

True

In [15]:
def applyclf_emb(X_train, y_train, model, feature_set, emb_model):
    X_train_selected = X_train[feature_set].apply(lambda x: ' '.join(x.fillna('').astype(str)), axis=1)
    # Transform the text data into embeddings
    X_train_embeddings = embeddings_transformation(X_train_selected, emb_model)
    
    # Conditionally add MinMaxScaler if the model is MultinomialNB
    if isinstance(model, MultinomialNB):
        # Add MinMaxScaler before the classifier
        pipeline = Pipeline([
            ('scaler', MinMaxScaler()),
            ('classifier', model)
        ])
    else:
        pipeline = Pipeline([
            ('classifier', model)
        ])
    
    # Define cross-validation strategy (e.g., StratifiedKFold with 5 folds)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Lists to store predictions and true labels
    all_true_labels = []
    all_predicted_labels = []

    # Perform cross-validation and accumulate the confusion matrix
    for train_index, val_index in kf.split(X_train_embeddings, y_train):
        X_train_fold, X_val_fold = X_train_embeddings[train_index], X_train_embeddings[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        # Fit the model on the training fold
        pipeline.fit(X_train_fold, y_train_fold)

        # Predict on the validation fold
        y_pred_val = pipeline.predict(X_val_fold)
        
        # Append true labels and predicted labels
        all_true_labels.extend(y_val_fold)
        all_predicted_labels.extend(y_pred_val)
        
    # Generate an overall confusion matrix
    confusion_mat = confusion_matrix(all_true_labels, all_predicted_labels, labels=[0,1,2,3])
    
    # Calculate the classification report
    class_report = classification_report(all_true_labels, all_predicted_labels, zero_division=1)

    # Print the confusion matrix and classification report
    print("Confusion Matrix:")
    print(confusion_mat)

    print("\nClassification Report:")
    print(class_report)

In [16]:
classifiers = [MultinomialNB(),
               KNeighborsClassifier(),
               DecisionTreeClassifier(random_state=42), 
               RandomForestClassifier(random_state=42), 
               GradientBoostingClassifier(random_state=42),
               XGBClassifier(random_state=42),
               LinearSVC()]

## 3.1. WORD2VEC

In [17]:
w2v= KeyedVectors.load_word2vec_format('D:\\MS DATA SCIENCE\\NLP TESE\\embeddings\\skip_s600_word2vec.txt')

In [18]:
w2v

<gensim.models.keyedvectors.KeyedVectors at 0x1880f26afa0>

## description

In [19]:
feature_set=['narrative_embeddings']
for clf in classifiers:
    print(clf.__class__.__name__)
    applyclf_emb(X_train, y_train, clf, feature_set, w2v)

MultinomialNB
Confusion Matrix:
[[2814    0 2762    0]
 [ 856    0  697    0]
 [1123    0 6387    0]
 [ 543    0  519    0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.50      0.52      5576
           1       1.00      0.00      0.00      1553
           2       0.62      0.85      0.71      7510
           3       1.00      0.00      0.00      1062

    accuracy                           0.59     15701
   macro avg       0.79      0.34      0.31     15701
weighted avg       0.65      0.59      0.52     15701

KNeighborsClassifier
Confusion Matrix:
[[4266  150 1087   73]
 [1193   59  268   33]
 [3755  159 3533   63]
 [ 818   42  138   64]]

Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.77      0.55      5576
           1       0.14      0.04      0.06      1553
           2       0.70      0.47      0.56      7510
           3       0.27      0.06      0.10

## description + title

In [20]:
feature_set=['narrative_embeddings', 'embeddings_title']
for clf in classifiers:
    print(clf.__class__.__name__)
    applyclf_emb(X_train, y_train, clf, feature_set, w2v)

MultinomialNB
Confusion Matrix:
[[2986    0 2589    1]
 [ 891    0  662    0]
 [1166    0 6344    0]
 [ 578    0  484    0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.54      0.53      5576
           1       1.00      0.00      0.00      1553
           2       0.63      0.84      0.72      7510
           3       0.00      0.00      0.00      1062

    accuracy                           0.59     15701
   macro avg       0.54      0.35      0.31     15701
weighted avg       0.59      0.59      0.53     15701

KNeighborsClassifier
Confusion Matrix:
[[4237  141 1112   86]
 [1152   81  277   43]
 [3440  135 3864   71]
 [ 799   40  128   95]]

Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.76      0.56      5576
           1       0.20      0.05      0.08      1553
           2       0.72      0.51      0.60      7510
           3       0.32      0.09      0.14

## 3.2 GLOVE

In [21]:
glove= KeyedVectors.load_word2vec_format('D:\\MS DATA SCIENCE\\NLP TESE\\embeddings\\glove_s600.txt')

## description

In [22]:
feature_set=['narrative_embeddings']
for clf in classifiers:
    print(clf.__class__.__name__)
    applyclf_emb(X_train, y_train, clf, feature_set, glove)

MultinomialNB
Confusion Matrix:
[[2791    0 2785    0]
 [ 857    0  696    0]
 [1277    0 6233    0]
 [ 546    0  516    0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.50      0.51      5576
           1       1.00      0.00      0.00      1553
           2       0.61      0.83      0.70      7510
           3       1.00      0.00      0.00      1062

    accuracy                           0.57     15701
   macro avg       0.78      0.33      0.30     15701
weighted avg       0.64      0.57      0.52     15701

KNeighborsClassifier
Confusion Matrix:
[[4205  174 1116   81]
 [1179   72  268   34]
 [3795  162 3482   71]
 [ 780   43  169   70]]

Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.75      0.54      5576
           1       0.16      0.05      0.07      1553
           2       0.69      0.46      0.56      7510
           3       0.27      0.07      0.11

## description + title

In [23]:
feature_set=['narrative_embeddings', 'embeddings_title']
for clf in classifiers:
    print(clf.__class__.__name__)
    applyclf_emb(X_train, y_train, clf, feature_set, glove)

MultinomialNB
Confusion Matrix:
[[3003    0 2570    3]
 [ 893    0  660    0]
 [1330    0 6179    1]
 [ 589    0  471    2]]

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.54      0.53      5576
           1       1.00      0.00      0.00      1553
           2       0.63      0.82      0.71      7510
           3       0.33      0.00      0.00      1062

    accuracy                           0.58     15701
   macro avg       0.62      0.34      0.31     15701
weighted avg       0.60      0.58      0.53     15701

KNeighborsClassifier
Confusion Matrix:
[[4225  167 1107   77]
 [1159   79  274   41]
 [3552  152 3726   80]
 [ 770   61  155   76]]

Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.76      0.55      5576
           1       0.17      0.05      0.08      1553
           2       0.71      0.50      0.58      7510
           3       0.28      0.07      0.11