# 1. Bag of words

## A. Data loading

In [1]:
# General Packages #
import os
import pandas as pd
import numpy as np
import string
import re
from scipy.stats import randint

# Load TQDM to Show Progress Bars #
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# Sklearn Packages #
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_predict, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import Lasso, LassoCV, SGDClassifier, LogisticRegression, RidgeCV, RidgeClassifierCV, HuberRegressor, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

# NLTK Packages #
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob, Word

# Import necessary libraries for handling imbalanced data
from imblearn.combine import SMOTETomek
from imblearn.metrics import geometric_mean_score
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import smote
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE



In [2]:
# Turn off warnings, just to avoid pesky messages that might cause confusion here
# Remove when testing your own code #
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Change to Working Directory with Training Data # 
os.chdir("/Users/Artur/Desktop/thesis_HIR_versie5/coding")

# Load Training Data #
df_train = pd.read_csv("./data/train_adjusted.csv", header = 0)

# inspect the data
df_train.head(5)

Unnamed: 0,id,Headline,category
0,194578,Head Line: US Patent granted to BASF SE (Delaw...,
1,564295,Societe Generale Launches a Next-Generation Ca...,
2,504138,BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...,
3,91379,ASML: 4Q Earnings Snapshot,
4,265750,Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...,


## B. Handling class imbalance

Here, I would handle the class imbalance issue in my dataset. I would perform this step before tuning any hyperparameters. I would solve the class imbalance in my dataset by using SMOTE and the Tomek links. This algorithm generates synthetic examples of the minority class. On the other hand, Tomek links are used to remove examples that are near the decision boundary between two classes. Together, they oversample the minority classes and undersample the majority class.

## C. Define the vectorizer

Define a function to clean and tokenize the text

In [4]:
# Define Tokenizer
def textblob_tokenizer(str_input):
    
    # Convert list to string
    input_str = str_input
    
    if isinstance(input_str, list):
        input_str = ' '.join(input_str)
    
    # Remove punctuation
    str_input = str_input.translate(str.maketrans('', '', string.punctuation))
    
    # Define stop words
    stop_words = set(stopwords.words('english'))

    # Tokenize and lemmatize text
    blob = TextBlob(str_input.lower())
    tokens = [Word(token).lemmatize() for token in blob.words]

    # Remove numbers and stop words
    words = [token for token in tokens if not re.match('^\d+$', token) and token not in stop_words]

    return words

In [6]:
# inspect the cleaned data
df_train['cleaned_headline'] = df_train['Headline'].apply(textblob_tokenizer)

# check the data
df_train[["cleaned_headline", "Headline"]]

Unnamed: 0,cleaned_headline,Headline
0,"[head, line, u, patent, granted, basf, se, del...",Head Line: US Patent granted to BASF SE (Delaw...
1,"[societe, generale, launch, nextgeneration, ca...",Societe Generale Launches a Next-Generation Ca...
2,"[barclays, plc, form, eutelsat, communication]",BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...
3,"[asml, 4q, earnings, snapshot]",ASML: 4Q Earnings Snapshot
4,"[form, axa, investment, manager, booker, group...",Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...
...,...,...
43249,"[tomra, system, asa, tom, purchase, share]",Tomra Systems ASA: TOM: Purchase of own shares
43250,"[swiss, federal, institute, intellectual, prop...",Swiss Federal Institute of Intellectual Proper...
43251,"[icon, pfizer, roche, join, addplan, df, conso...",ICON: Pfizer and Roche Join ADDPLAN DF Consort...
43252,"[rio, tinto, plc, transaction, share]",Rio Tinto PLC Transaction in Own Shares -3-


Defining model parameters to use in a grid search:
1. MINDF = Minimum Document Frequency -- Minimum number of times a word needs to occur to be considered
2. MAXDF = Maximum Document Frequency -- Maximum share of documents where a word needs to occur to be considered
3. MF = Maximum number of features we would want to consider -- ranked by most frequently occurring
4. NGrams -- Number of Word Pairs. Takes the form (Min, Max). E.g. (1, 2) means single words and word pairs

In [7]:
# Define the hyperparameter search space for the TfidfVectorizer
tfidf_param_dist = {'max_features': randint(500, 2000),
                    'max_df': [0.5, 0.6, 0.7, 0.8, 0.9],
                    'min_df': [1, 2, 5, 10],
                    'ngram_range': [(1,1), (1,2), (1,3)]}


Define the vectorizer

In [8]:
# Create a TfidfVectorizer with default parameters
tfidf_vectorizer = TfidfVectorizer(tokenizer=textblob_tokenizer)

# Convert the text data to a matrix of TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(df_train['Headline'])

#### Hyper tune the vectorizer 

Now, we still need to hypertune the vectorizer. Therefore, I first define a random forest classifier with default parameters. This rf classifier is hypertuned by cross validating its performance in terms of the G_mean score, while using the default parameters of the vectorizer.

After I tuned the rf classifier, I use this classifier to hypertune the vectorizer by cross validating its performance in terms of the G_mean score. Then, these parameters will be used to train the other models.

In [9]:
# Create a Random Forest classifier with default parameters
rfc = RandomForestClassifier()

In [10]:
# Define the custom scoring function
def custom_scoring(estimator, X, y):
    # Define a RandomForestClassifier with the same parameters as the estimator
    rfc = RandomForestClassifier(**estimator.get_params())
    
    # Use cross_val_score to calculate the mean accuracy of the classifier
    scores = cross_val_score(rfc, X, y, cv=5, scoring='G_mean')
    
    # Return the mean accuracy
    return scores.mean()

In [23]:
# Define the hyperparameter search space for the Random Forest classifier
rfc_param_dist = {'n_estimators': randint(50, 100),
                  'max_features': randint(1, 10),
                  'max_depth': randint(1, 10)}

# Create a randomized search object with cross-validation
random_search = RandomizedSearchCV(estimator=rfc,
                                   param_distributions=rfc_param_dist,
                                   n_iter=10,
                                   cv=5,
                                   scoring=custom_scoring)

In [24]:
# Fit the randomized search object to the training data
random_search.fit(X_tfidf, df_train['category'])

# Print the best hyperparameters found
print('Best hyperparameters for Random Forest: ', random_search.best_params_)


Best hyperparameters for Random Forest:  {'max_depth': 5, 'max_features': 2, 'n_estimators': 84}


In [25]:
# Use the best hyperparameters for Random Forest to fit a model on the TF-IDF matrix
best_rfc = RandomForestClassifier(**random_search.best_params_)
best_rfc.fit(X_tfidf, df_train['category'])

# Use the best hyperparameters for TfidfVectorizer to fit a model on the training data
random_search = RandomizedSearchCV(estimator=tfidf_vectorizer,
                                   param_distributions=tfidf_param_dist,
                                   n_iter=10,
                                   cv=5,
                                   scoring=custom_scoring)

# Fit the randomized search object to the training data
random_search.fit(df_train['Headline'], df_train['category'])

# Print the best hyperparameters found
print('Best hyperparameters for TfidfVectorizer: ', random_search.best_params_)

Best hyperparameters for TfidfVectorizer:  {'max_df': 0.9, 'max_features': 1347, 'min_df': 10, 'ngram_range': (1, 3)}


## D. Hyper parameting tuning of the models

## E. Evaluating the different models with the optimal parameters

In [17]:
# Store Data in Lists for Text Classification #
IDs = np.array(df_train['id'].values.tolist())
headlines = np.array(df_train['Headline'].values.tolist())
Classes = df_train['category'].values.tolist()



In [18]:
# Perform vectorization and extract feature names #
Abstract_Vectors = vec.fit_transform(headlines)
FEATURENAMES = vec.get_feature_names_out()

In [9]:
# define a function to calculate classification metrics

def get_classification_metrics(y_true, y_pred, y_pred_proba):
    # Calculate Model Performance Metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    cm = confusion_matrix(y_true, y_pred)

    # Calculate the extended G-mean
    g_mean = geometric_mean_score(y_true, y_pred, average='weighted')

    # Calculate the MAUC score
    mauc = np.round(roc_auc_score(y_true, y_pred_proba, multi_class='ovo', average='macro'), 3)

    return accuracy, precision, recall, f1, cm, g_mean, mauc


In [None]:
# Define the Set of Classifiers and their parameters
CLASSIFIERS = [
               ["RUSBoost", RUSBoostClassifier(random_state=1)],
               ["SMOTEBoost (AdaBoost)", make_pipeline(SMOTE(random_state=1), AdaBoostClassifier(random_state=1))],
               ["SMOTEBoost (Gradient Boosting)", make_pipeline(SMOTE(random_state=1), GradientBoostingClassifier(random_state=1))],
               ["Random Forest", RandomForestClassifier(n_estimators=100)],
              ]

# Number of Folds (Splits) for Cross Validation #
NUM_OF_SPLITS = 5

# Define whether you want to manually reweight the sample by oversampling the smaller class 
Reweight = False

# Define arrays in which to store classification outputs # 
RESULTS = []
Classified_Values =[]
Classified_Values_p =[]


# Loop Through Different Classifiers #
for CL in tqdm_notebook(CLASSIFIERS, desc = "Evaluating Classifiers"):

    # Extract Classifier Names & Model #
    name  = CL[0]
    Model = CL[1]

    # Define Arrays to store Actual, Predicted and Ids variables (Because we are shuffling them in next step) # 
    y_actual = []
    y_predicted = []
    y_predicted_proba = []

    id_s = []

    # Loop through K Folds and Repeat Cross Validation #
    KFoldSplitter = StratifiedKFold(n_splits=NUM_OF_SPLITS, shuffle=True, random_state=1)
    
    for train_i, test_i in tqdm_notebook(KFoldSplitter.split(Abstract_Vectors, Classes), 
                                         desc = 'Cross-Validating',
                                         leave = False,
                                         total = NUM_OF_SPLITS):

        # Select Rows in Data Based on Indexes [train_i, test_i]
        Y = np.asarray(Classes)

        train_X, test_X = Abstract_Vectors[train_i], Abstract_Vectors[test_i]
        train_y, test_y = Y[train_i], Y[test_i]
        Train_IDs, Test_IDs = IDs[train_i], IDs[test_i]

        # solving class imbalance issues      
        temp_y = list(train_y)
        temp_X = train_X.todense().tolist()

        if Reweight == True:
            
            # Use SMOTE and Tomek links to handle class imbalance
            smt = SMOTETomek(random_state=1)
            X_train_balanced, y_train_balanced = smt.fit_resample(train_X, train_y)
            
        else:
            X_train_balanced, y_train_balanced = train_X, train_y


        # Train Model #
        Results = Model.fit(X_train_balanced, y_train_balanced)
        
        # Perform Prediction on Holdout Sample # 
        y_pred = Model.predict(test_X.toarray())
        y_pred_proba = Model.predict_proba(test_X.toarray())

        # Add to List with Final Results # 
        y_actual = y_actual + list(test_y)
        y_predicted = y_predicted + list(y_pred)
        y_predicted_proba = y_predicted_proba + list(y_pred_proba)
        id_s = id_s + list(Test_IDs)

    # ---------------------------------------------------------- #
    # This runs only after all of the folds have been classified # 
    # ---------------------------------------------------------- #

    # Calculate classification metrics
    Accuracy, Precision, Recall, F1, CM, G_mean, MAUC = get_classification_metrics(y_actual, y_predicted, y_predicted_proba)


    # Round to 3 Decimal Places # 
    #FN = np.round(CM[0][0]/CM[0].sum(), 3)
    #FP = np.round(CM[0][1]/CM[0].sum(), 3)
    #TN = np.round(CM[1][0]/CM[1].sum(), 3)
    #TP = np.round(CM[1][1]/CM[1].sum(), 3)

    FN = np.round(CM[0][0]/(CM[0][0] + CM[1][0]), 3)
    FP = np.round(CM[0][1]/(CM[0][1] + CM[1][1]), 3)
    TN = np.round(CM[1][0]/(CM[0][0] + CM[1][0]), 3)
    TP = np.round(CM[1][1]/(CM[0][1] + CM[1][1]), 3)


    FN = np.round(CM[0][0]/(CM[0][0] + CM[1][0]), 3)
    FP = np.round(CM[0][1]/(CM[0][1] + CM[1][1]), 3)
    TN = np.round(CM[1][0]/(CM[0][0] + CM[1][0]), 3)
    TP = np.round(CM[1][1]/(CM[0][1] + CM[1][1]), 3)

    # Add Classification Performance Metrics to List #
    RESULTS.append([name, TP, FN, FP, TN, 
                          np.round(Accuracy, 3),
                          np.round(Precision, 3),
                          np.round(Recall, 3),
                          np.round(F1, 3),
                          G_mean,
                          MAUC])

    # Add Classification Results to List # 
    Classified_Values.append(list(zip(len(id_s)*[name],id_s, y_actual, y_predicted, 
                                       len(id_s)*[G_mean], len(id_s)*[MAUC])))


In [None]:
# Convert List of Model Performance Metrics to Dataframe #
RESULTS_TABLE = pd.DataFrame(RESULTS, columns = ["Name", "True-Positives", 
                                                 "False-Negatives", "False-Positives", 
                                                 "True-Negatives","Accuracy", 
                                                 "Precision", "Recall", "F1", "G_mean", "MAUC"] )
RESULTS_TABLE["Type"] = "Bag of Words"
RESULTS_TABLE = RESULTS_TABLE[["Name", "True-Positives", 
                                                 "False-Negatives", "False-Positives", 
                                                 "True-Negatives","Accuracy", 
                                                 "Precision", "Recall", "F1", "G_mean", "MAUC"]]

# Output Results #
RESULTS_TABLE.sort_values("Accuracy", ascending = False ).to_csv("./Output/Model Performance/BOW Model Classification Performance.csv")

# Display Results -- Out of Sample (Holdout) prediction -- Sorted by Accuracy #
RESULTS_TABLE.sort_values("Accuracy", ascending = False )

# New try

In [48]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

# Define the set of classifiers and their parameters
CLASSIFIERS = [
    ["RUSBoost", RUSBoostClassifier(random_state=1)],
    ["Random Forest", RandomForestClassifier(random_state=1)],
]


# Number of Folds (Splits) for Cross Validation #
NUM_OF_SPLITS = 5

# Define whether you want to manually reweight the sample by oversampling the smaller class 
Reweight = False

# Define arrays in which to store classification outputs # 
RESULTS = []
Classified_Values =[]
Classified_Values_p =[]


In [49]:
# Define the pipeline with the vectorizer and classifier
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=textblob_tokenizer)),
    ('classifier', None)
])


# Define the grid of parameters for the vectorizer and classifiers
vectorizer_params = {
    'max_features': [200, 2000],
    'max_df': [0.7, 0.8],
    'ngram_range': [(1, 1), (1, 2)]
}

classifier_params = {
    'RUSBoost': {
        'n_estimators': [50, 100],
        'learning_rate': [0.1, 0.2]
    },
    'Random Forest': {
        'n_estimators': [50, 100],
        'max_depth': [1, 5, 10]
    }
}

param_grid = {
        'vectorizer__max_features': vectorizer_params['max_features'],
        'vectorizer__max_df': vectorizer_params['max_df'],
        'vectorizer__ngram_range': vectorizer_params['ngram_range'],
        'classifier': [CL[1] for CL in CLASSIFIERS],
    }


# Define Randomized Search #
rs = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=10, 
                            cv=KFoldSplitter, verbose=1, n_jobs= 1, scoring=scorer)

In [13]:
# Store Data in Lists for Text Classification #
IDs = np.array(df_train['id'].values.tolist())
headlines = np.array(df_train['Headline'].values.tolist())
Classes = df_train['category'].values.tolist()

In [33]:
print(df_train['Headline'][8].todense().tolist())


AttributeError: 'str' object has no attribute 'todense'

In [53]:
from sklearn.metrics import make_scorer
from imblearn.metrics import geometric_mean_score
from sklearn.model_selection import RandomizedSearchCV





# Define best model and metric variables
best_model = None
best_metric = 0.0

# Loop Through Different Classifiers #
for CL in tqdm_notebook(CLASSIFIERS, desc = "Evaluating Classifiers"):

    # Extract Classifier Names & Model #
    name  = CL[0]
    Model = CL[1]
    # Combine the vectorizer and classifier parameter grids
    
    param_grid = {
        'vectorizer__max_features': vectorizer_params['max_features'],
        'vectorizer__max_df': vectorizer_params['max_df'],
        'vectorizer__ngram_range': vectorizer_params['ngram_range'],
        'classifier': [
            RUSBoostClassifier(random_state=1, **classifier_params['RUSBoost']),
            RandomForestClassifier(random_state=1, **classifier_params['Random Forest'])
        ],
    }


    # Define Arrays to store Actual, Predicted and Ids variables (Because we are shuffling them in next step) # 
    y_actual = []
    y_predicted = []
    y_predicted_proba = []

    id_s = []

    # Loop through K Folds and Repeat Cross Validation #
    KFoldSplitter = StratifiedKFold(n_splits=NUM_OF_SPLITS, shuffle=True, random_state=1)
    
    # Define the custom scorer
    scorer = make_scorer(geometric_mean_score)

    # Define Randomized Search #
    rs = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=10, 
                            cv=KFoldSplitter, verbose=1, n_jobs= 1, scoring=scorer)

    # Define variable to store best model for this classifier
    best_model_cl = None
    
    for train_i, test_i in tqdm_notebook(KFoldSplitter.split(headlines, Classes), 
                                         desc = 'Cross-Validating',
                                         leave = False,
                                         total = NUM_OF_SPLITS):

        # Select Rows in Data Based on Indexes [train_i, test_i]
        Y = np.asarray(Classes)

        train_X, test_X = df_train['Headline'][train_i], df_train['Headline'][test_i]
        train_y, test_y = Y[train_i], Y[test_i]
        Train_IDs, Test_IDs = IDs[train_i], IDs[test_i]



        if Reweight == True:
            
            # Use SMOTE and Tomek links to handle class imbalance
            smt = SMOTETomek(random_state=1)
            X_train_balanced, y_train_balanced = smt.fit_resample(train_X, train_y)
            
        else:
            X_train_balanced, y_train_balanced = train_X, train_y


        # Train Model #
        Results = rs.fit(X_train_balanced, y_train_balanced)
        
        # Perform Prediction on Holdout Sample # 
        y_pred = Model.predict(test_X.toarray())
        y_pred_proba = Model.predict_proba(test_X.toarray())

        # Add to List with Final Results # 
        y_actual = y_actual + list(test_y)
        y_predicted = y_predicted + list(y_pred)
        y_predicted_proba = y_predicted_proba + list(y_pred_proba)
        id_s = id_s + list(Test_IDs)

    # ---------------------------------------------------------- #
    # This runs only after all of the folds have been classified # 
    # ---------------------------------------------------------- #

    # Calculate classification metrics
    Accuracy, Precision, Recall, F1, CM, G_mean, MAUC = get_classification_metrics(y_actual, y_predicted, y_predicted_proba)

    # Check if this classifier performed better than the previous best
    if G_mean > best_metric:
        best_metric = G_mean
        best_model = rs.best_estimator_
        best_model_cl = name

    # Add Classification Performance Metrics to List #
    RESULTS.append([name, np.round(Accuracy, 3),
                          np.round(Precision, 3),
                          np.round(Recall, 3),
                          np.round(F1, 3),
                          G_mean,
                          MAUC])

    # Add Classification Results to List # 
    Classified_Values.append(list(zip(len(id_s)*[name],id_s, y_actual, y_predicted, 
                                       len(id_s)*[G_mean], len(id_s)*[MAUC])))


Evaluating Classifiers:   0%|          | 0/2 [00:00<?, ?it/s]

Cross-Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_depth' parameter of RandomForestClassifier must be an int in the range [1, inf) or None. Got [1, 5, 10] instead.

--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/imblearn/ensemble/_weight_boosting.py", line 244, in fit
    self._validate_params()
  File "/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'learning_rate' parameter of RUSBoostClassifier must be a float in the range (0, inf). Got [0.1, 0.2] instead.
