# Vectorization

## 1. Data loading

In [18]:
# General Packages #
import os
import pandas as pd
import numpy as np
import string
import re
from scipy.stats import randint
import random

# Load TQDM to Show Progress Bars #
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# Sklearn Packages #
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import Lasso, LassoCV, SGDClassifier, LogisticRegression, RidgeCV, RidgeClassifierCV, HuberRegressor, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import CalibratedClassifierCV

# NLTK Packages #
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob, Word

# Import necessary libraries for handling imbalanced data
from imblearn.combine import SMOTETomek
from imblearn.metrics import geometric_mean_score
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import smote
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE



In [19]:
# Turn off warnings, just to avoid pesky messages that might cause confusion here
# Remove when testing your own code #
import warnings
warnings.filterwarnings("ignore")

In [20]:
# Change to Working Directory with Training Data # 
os.chdir("/Users/Artur/Desktop/thesis_HIR_versie5/coding")

# Load Training Data #
df_train = pd.read_csv("./data/train_adjusted.csv", header = 0)
df_test = pd.read_csv("./data/test_adjusted.csv", header = 0)

# inspect the data
df_train.head(5)

Unnamed: 0,id,Headline,category
0,194578,Head Line: US Patent granted to BASF SE (Delaw...,
1,564295,Societe Generale Launches a Next-Generation Ca...,
2,504138,BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...,
3,91379,ASML: 4Q Earnings Snapshot,
4,265750,Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...,


# 1.1 Bag of words

## 1.1.2 Define the BOW vectorizer

Define the tokenizer that we will use for the vectorizers.

*** to do: Dollar signs + names of the company ***

In [21]:
def textblob_tokenizer(str_input):
    # Convert list to string
    input_str = str_input
    if isinstance(input_str, list):
        input_str = ' '.join(input_str)
    
    # Remove punctuation
    str_input = str_input.translate(str.maketrans('', '', string.punctuation))
    
    # Define stop words
    stop_words = set(stopwords.words('english'))

    # Tokenize text
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    
    # Remove numbers, stop words, and words with one character
    words = [word for word in tokens if not re.match('^\d+$', word) and word not in stop_words and len(word) > 1]
    
    # Lemmatize words
    words = [Word(word).lemmatize() for word in words]
    
    return words

In [22]:
# inspect the cleaned data
df_train['cleaned_headline'] = df_train['Headline'].apply(textblob_tokenizer)

# check the data
#df_train[["cleaned_headline", "Headline"]]

Define the parameters of the vectorizer

In [23]:
# Minimum Document Frequency -- Minimum number of times a word needs to occur to be considered #
MINDF = 10

# Maximum Document Frequency -- Maximum share of documents where a word needs to occur to be considered #
MAXDF = 0.9

# Maximum number of features we would want to consider -- ranked by most frequently occuring #
MF= 800

# NGrams -- Number of Word Pairs. Takes the form (Min, Max). E.g. (1, 2) means single words and word pairs # 
NGrams = (1,2)

Define the vectorizer itself

In [24]:
# define the vectorizer
vec_bow = CountVectorizer(max_features= MF,
                          min_df = MINDF,
                          max_df = MAXDF,
                          ngram_range=(1, 2),
                          tokenizer=textblob_tokenizer)

Define the dependent variables

In [25]:
# create independent variable with BOW
X_train_bow = vec_bow.fit_transform(df_train['Headline'])

# Transform the test set using the same vectorizer
X_test_bow = vec_bow.transform(df_test['Headline'])

In [26]:
# inspect the vectorizer and the cleaned data
print(vec_bow.vocabulary_)

{'head': 312, 'line': 407, 'u': 751, 'patent': 524, 'granted': 296, 'basf': 85, 'se': 646, 'delaware': 177, 'may': 443, 'titled': 730, 'composition': 143, 'comprising': 145, 'method': 451, 'using': 768, 'head line': 313, 'line u': 408, 'u patent': 752, 'patent granted': 527, 'basf se': 86, 'societe': 677, 'generale': 282, 'launch': 398, 'security': 650, 'societe generale': 678, 'barclays': 82, 'plc': 548, 'form': 259, 'communication': 139, 'barclays plc': 83, 'plc form': 553, 'earnings': 206, 'axa': 74, 'investment': 366, 'booker': 102, 'group': 300, 'booker group': 103, 'group plc': 303, 'glencore': 289, 'transaction': 742, 'share': 660, 'glencore plc': 290, 'plc transaction': 561, 'transaction share': 743, 'trust': 748, 'corporation': 158, 'annual': 43, 'financial': 251, 'report': 612, 'plc annual': 550, 'annual financial': 44, 'financial report': 252, 'infineon': 340, 'technology': 714, 'ag': 16, 'applies': 49, 'sensor': 655, 'system': 710, 'infineon technology': 341, 'applies u': 5

## 1.1.3 Classification for each category

In [37]:
# Create a list with the different categories
categories = df_train["category"].unique()
print(categories)

['None' 'Financing' 'Production-related actions' 'Merger & \nacquisitions'
 'Corporate \ngovernance' 'Strategic alliance'
 'Expansion in existing market (product/service/geographical)'
 'Divestiture' 'Human resources'
 'New product introduction/\nservice offering' 'External venturing'
 'Marketing' 'Product/\nservice improvement'
 'New geographical market entry' 'R&D-related actions']


In [38]:
def create_results_df():
    # Create an empty dataframe with the categories to store the results of the rfc
    results_df = pd.DataFrame(index=categories)

    # Add columns for the metrics
    columns = ['accuracy', 'precision', 'recall', 'f1', 'FN', 'FP', 'TN', 'TP']
    for col in columns:
        results_df[col] = 0

    return results_df


In [49]:
def get_classification_metrics(y_true, y_pred, category):
    
    # Calculate Model Performance Metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label = category)
    recall = recall_score(y_true, y_pred, pos_label = category)
    f1 = f1_score(y_true, y_pred, pos_label = category)
    CM = confusion_matrix(y_true, y_pred, labels= [category, 'other'])
    
    TP = np.round(CM[0][0], 3)
    FP = np.round(CM[0][1], 3)
    FN = np.round(CM[1][0], 3)
    TN = np.round(CM[1][1], 3) 
    
    return accuracy, precision, recall, f1, FN, FP, TN, TP

In [50]:
# Define the number of folds for cross-validation
n_splits = 5

# Initialize the stratified k-fold object
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) # ensures class balances are kept


In [57]:
def evaluate_categories(model, results_df):
    
    # Empty the dictionary with optimal parameters for each category
    best_params_dict = {}
    
    for category in tqdm_notebook(categories, desc="Evaluating categories"):
        
        # Set a random seed for reproducibility
        random.seed(123)
        
        # Create a new column in the training and test dataframes to store the modified category
        df_train['modified_category'] = df_train['category'].apply(lambda x: x if x == category else 'other')
        df_test['modified_category'] = df_test['category'].apply(lambda x: x if x == category else 'other')

        # Get the indices of the category observations and non-category observations
        cat_i = np.where(df_train['modified_category'] == category)[0]
        noncat_i = np.where(df_train['modified_category'] != category)[0]

        # Shuffle the non-category indices and select the same number as the category observations
        np.random.shuffle(noncat_i)
        noncat_i = noncat_i[:len(cat_i)]

        # Combine the category and non-category indices into one list
        indices = np.concatenate((cat_i, noncat_i))

        # Shuffle the indices to ensure randomness
        np.random.shuffle(indices)

        # Extract the features and dependent variable for the selected indices
        X_train = X_train_bow[indices]
        y_train = df_train['modified_category'][indices]

        # Perform the grid search using cross-validation
        grid_search = GridSearchCV(model, param_grid, cv=skf, scoring='accuracy')
        grid_search.fit(X_train, y_train)

        # Get the best model and its hyperparameters
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        # Store the best parameters for the current category in the dictionary
        best_params_dict[category] = best_params

        # Retrain the best model with the whole training set
        best_model.fit(X_train, y_train)
        
        # Calibrate the model using Platt scaling
        #calibrated_model = CalibratedClassifierCV(best_model, cv='prefit', method='sigmoid')
        #calibrated_model.fit(X_train, y_train)

        # Make predictions on the test set
        #y_pred = calibrated_model.predict(X_test_bow)

        # Make predictions on the test set
        y_pred = best_model.predict(X_test_bow)
        y_pred_proba = best_model.predict_proba(X_test_bow)

        # define the dep variable of the test set
        y_test = df_test['modified_category']

        # Calculate the classification metrics
        accuracy, precision, recall, f1, FN, FP, TN, TP = get_classification_metrics(y_test, y_pred, category)

        # Store the results in a dataframe
        results_df.loc[category] = [accuracy, precision, recall, f1, FN, FP, TN, TP]
        
    # return the dictionary with the best parameters
    return best_params_dict, best_params, y_pred_proba
    

### A. Random forest classifier

In [58]:
# Create an empty dataframe with the categories to store the results of the rfc
results_rfc_df = create_results_df()

In [59]:
# Initialize the classifier
rfc = RandomForestClassifier(random_state = 1234)

# Define the parameter grid for the grid search
param_grid = {
    'n_estimators': [100, 500],
    'max_depth': [None],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt']
}


In [61]:

#
best_params_dict, best_params, y_pred_proba = evaluate_categories(rfc, results_rfc_df)

Evaluating categories:   0%|          | 0/15 [00:00<?, ?it/s]

In [62]:
print(y_pred_proba)

[[0.32756061 0.67243939]
 [0.         1.        ]
 [0.017      0.983     ]
 ...
 [0.06821429 0.93178571]
 [0.00625    0.99375   ]
 [0.5499881  0.4500119 ]]


In [67]:
# Create a new dataframe to store the best hyperparameters for each category
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient='index')

# Rename the columns of the best_params_df
best_params_df.columns = [f'{param}' for param in best_params.keys()]

# Display the best parameters
print("\nBest hyperparameters for each category:")
display(best_params_df)


Best hyperparameters for each category:


Unnamed: 0,C,penalty
,1.0,l2
Financing,10.0,l2
Production-related actions,1.0,l2
Merger & \nacquisitions,1.0,l2
Corporate \ngovernance,1.0,l2
Strategic alliance,1.0,l2
Expansion in existing market (product/service/geographical),1.0,l2
Divestiture,1.0,l2
Human resources,0.1,l2
New product introduction/\nservice offering,1.0,l2


In [68]:
# display the results of classification
results_rfc_df

Unnamed: 0,accuracy,precision,recall,f1,FN,FP,TN,TP
,0.921768,0.932119,0.984671,0.957675,697,149,397,9571
Financing,0.910856,0.054563,0.833333,0.102421,953,11,9795,55
Production-related actions,0.831977,0.027331,0.962264,0.053153,1815,2,8946,51
Merger & \nacquisitions,0.927594,0.116092,0.878261,0.205076,769,14,9930,101
Corporate \ngovernance,0.925652,0.149733,0.939597,0.258303,795,9,9870,140
Strategic alliance,0.843259,0.099143,0.929648,0.179177,1681,14,8934,185
Expansion in existing market (product/service/geographical),0.706029,0.018238,0.951613,0.03579,3176,3,7576,59
Divestiture,0.796468,0.020508,0.92,0.040122,2197,4,8567,46
Human resources,0.618088,0.009357,0.975,0.018536,4129,1,6645,39
New product introduction/\nservice offering,0.863233,0.077646,0.953846,0.143602,1473,6,9211,124


In [None]:
# add the model we used to the results
results_rfc_df['Model'] = 'BOW + rf'

In [26]:
# write away the results
results_rfc_df.to_excel('./Output/Model performance/BOW_rf.xlsx', index = False, header = True)
best_params_df.to_excel('./Output/parameters/BOW_rf.xlsx', index = False, header = True)

### B. Logistic regression

In [63]:
# Create an empty dataframe with the categories to store the results of the logistic regression
results_log_df = create_results_df()

In [64]:
# Initialize the logistic regression classifier
logreg = LogisticRegression()

# Define the parameter grid
param_grid = {
    'penalty': ['None','l1', 'l2'], # normal, lasso or ridge
    'C': [0.1, 1, 10]               # The inverse penalization term (smaller is higher penalization)
}


In [65]:
# Set a random seed for reproducibility
random.seed(123)

# 
best_params_dict, best_params, y_pred_proba = evaluate_categories(logreg, results_log_df)

Evaluating categories:   0%|          | 0/15 [00:00<?, ?it/s]

In [66]:
y_pred_proba

array([[0.41902674, 0.58097326],
       [0.1114994 , 0.8885006 ],
       [0.20779439, 0.79220561],
       ...,
       [0.27172306, 0.72827694],
       [0.22462194, 0.77537806],
       [0.48671195, 0.51328805]])

In [30]:
# Create a new dataframe to store the best hyperparameters for each category
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient='index')

# Rename the columns of the best_params_df
best_params_df.columns = [f'{param}' for param in best_params.keys()]

# Display the best parameters
print("\nBest hyperparameters for each category:")
display(best_params_df)


Best hyperparameters for each category:


Unnamed: 0,C,penalty
,1.0,l2
Financing,1.0,l2
Production-related actions,1.0,l2
Merger & \nacquisitions,1.0,l2
Corporate \ngovernance,1.0,l2
Strategic alliance,1.0,l2
Expansion in existing market (product/service/geographical),1.0,l2
Divestiture,0.1,l2
Human resources,1.0,l2
New product introduction/\nservice offering,0.1,l2


In [31]:
# display the results of classification
results_log_df

Unnamed: 0,accuracy,precision,recall,f1,FN,FP,TN,TP
,0.921491,0.938507,0.976646,0.957197,622,227,472,9493
Financing,0.932402,0.070968,0.833333,0.130797,720,11,10028,55
Production-related actions,0.84705,0.030499,0.981132,0.059158,1653,1,9108,52
Merger & \nacquisitions,0.926114,0.113995,0.878261,0.201798,785,14,9914,101
Corporate \ngovernance,0.944239,0.190736,0.939597,0.317101,594,9,10071,140
Strategic alliance,0.859441,0.103303,0.864322,0.184549,1493,27,9122,172
Expansion in existing market (product/service/geographical),0.746162,0.020379,0.919355,0.039874,2740,5,8012,57
Divestiture,0.924635,0.04947,0.84,0.093437,807,8,9957,42
Human resources,0.72739,0.012408,0.925,0.024487,2945,3,7829,37
New product introduction/\nservice offering,0.927224,0.131313,0.9,0.229187,774,13,9910,117


In [32]:
# add the model we used to the results
results_log_df['Model'] = 'BOW + log'

# write away the results
results_log_df.to_excel('./Output/Model performance/BOW_log.xlsx', index = False, header = True)
best_params_df.to_excel('./Output/parameters/BOW_log.xlsx', index = False, header = True)

### C. Support Vector Machine

In [33]:
# Create an empty dataframe with the categories to store the results of the svm
results_svm_df = create_results_df()

In [34]:
# Initialize the SVM classifier
svm = SVC(random_state = 123)

# Create a parameter grid for the SVM
param_grid = {
    'C': [0.1, 1, 10, 100], # inverse regularization parameter
    'kernel': ['linear', 'poly', 'rbf'], # what type of kernel need to be used
}


In [None]:
# Set a random seed for reproducibility
random.seed(123)

# 
best_params_dict, best_params = evaluate_categories(svm, results_svm_df)

Evaluating categories:   0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# Create a new dataframe to store the best hyperparameters for each category
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient='index')

# Rename the columns of the best_params_df
best_params_df.columns = [f'{param}' for param in best_params.keys()]

# Display the best parameters
print("\nBest hyperparameters for each category:")
display(best_params_df)

In [None]:
# display the results of classification
results_svm_df

In [None]:
# add the model we used to the results
results_svm_df['Model'] = 'BOW + svm'

# write away the results
results_svm_df.to_excel('./Output/Model performance/BOW_svm.xlsx', index = False, header = True)
best_params_df.to_excel('./Output/parameters/BOW_svm.xlsx', index = False, header = True)

### D. Adaboost classifier

In [None]:
# Create an empty dataframe with the categories to store the results of the ada
results_ada_df = create_results_df()

In [None]:
# Initialize decision tree base estimator for AdaBoost
base_estimator = DecisionTreeClassifier()

# Initialize AdaBoost classifier
ada = AdaBoostClassifier(base_estimator=base_estimator, random_state = 123)

# Define parameter grid for AdaBoost
param_grid = {
    'n_estimators': [50, 100, 250],   # the maximum number of estimators before boosting is terminated
    'learning_rate': [0.1, 0.5, 1.0], # weight applied to each classifier at boosting iteration
                                      # A higher learning rate increases the contribution of each classifier. 
}

In [None]:
# Set a random seed for reproducibility
random.seed(123)

# 
best_params_dict, best_params = evaluate_categories(ada, results_ada_df)

In [None]:
# Create a new dataframe to store the best hyperparameters for each category
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient='index')

# Rename the columns of the best_params_df
best_params_df.columns = [f'{param}' for param in best_params.keys()]

# Display the best parameters
print("\nBest hyperparameters for each category:")
display(best_params_df)

In [None]:
# display the results of classification
results_ada_df

In [None]:
# add the model we used to the results
results_ada_df['Model'] = 'BOW + ada'

# write away the results
results_ada_df.to_excel('./Output/Model performance/BOW_ada.xlsx', index = False, header = True)
best_params_df.to_excel('./Output/parameters/BOW_ada.xlsx', index = False, header = True)

### E. Gradient boosting classifier

In [None]:
# Create an empty dataframe with the categories to store the results of the svm
results_gbc_df = create_results_df()

In [None]:
# Initialize the classifier
gbc = GradientBoostingClassifier(random_state = 123)

param_grid = {
    'n_estimators': [100, 250], # the number of boosting stages to perform
    'learning_rate': [0.1, 0.5, 1], # the contribution of each tree
    'max_depth': [1, 3, 5], # The maximum depth of individual estimators
    'subsample': [0.3, 0.6, 1],    # The fraction of samples used of fitting, if smaller than one it becomes stochastic gradient boosting
    'max_features': ['sqrt']
}

In [None]:
# Set a random seed for reproducibility
random.seed(123)

# 
best_params_dict, best_params = evaluate_categories(gbc, results_gbc_df)

In [None]:
# Create a new dataframe to store the best hyperparameters for each category
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient='index')

# Rename the columns of the best_params_df
best_params_df.columns = [f'{param}' for param in best_params.keys()]

# Display the best parameters
print("\nBest hyperparameters for each category:")
display(best_params_df)

In [None]:
# display the results of classification
results_gbc_df

In [None]:
# add the model we used to the results
results_gbc_df['Model'] = 'BOW + gbc'

# write away the results
results_gbc_df.to_excel('./Output/Model performance/BOW_gbc.xlsx', index = False, header = True)
best_params_df.to_excel('./Output/parameters/BOW_gbc.xlsx', index = False, header = True)

# 1.2 TFIDTF vectorizer

In [None]:
# Define TF IDF Vectorizer #
vec_tf_idf = TfidfVectorizer(max_features= MF,
                      max_df = MAXDF,
                      ngram_range=(1, 2),
                      tokenizer=textblob_tokenizer)

In [None]:
# create independent variable with TF IDF
X_tf_idf = vec_tf_idf.fit_transform(df_train['Headline'])
print(X_tf_idf[:2])