# Vectorization

## 1. Data loading

In [74]:
# General Packages #
import os
import pandas as pd
import numpy as np
import string
import re
from scipy.stats import randint
import random

# Load TQDM to Show Progress Bars #
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# Sklearn Packages #
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import Lasso, LassoCV, SGDClassifier, LogisticRegression, RidgeCV, RidgeClassifierCV, HuberRegressor, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

# NLTK Packages #
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob, Word

# Import necessary libraries for handling imbalanced data
from imblearn.combine import SMOTETomek
from imblearn.metrics import geometric_mean_score
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import smote
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE



In [75]:
# Turn off warnings, just to avoid pesky messages that might cause confusion here
# Remove when testing your own code #
import warnings
warnings.filterwarnings("ignore")

In [76]:
# Change to Working Directory with Training Data # 
os.chdir("/Users/Artur/Desktop/thesis_HIR_versie5/coding")

# Load Training Data #
df_train = pd.read_csv("./data/train_adjusted.csv", header = 0)
df_test = pd.read_csv("./data/test_adjusted.csv", header = 0)

# inspect the data
df_train.head(5)

Unnamed: 0,id,Headline,category
0,194578,Head Line: US Patent granted to BASF SE (Delaw...,
1,564295,Societe Generale Launches a Next-Generation Ca...,
2,504138,BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...,
3,91379,ASML: 4Q Earnings Snapshot,
4,265750,Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...,


# 1.1 Bag of words

## 1.1.2 Define the BOW vectorizer

Define the tokenizer that we will use for the vectorizers.

In [77]:
def textblob_tokenizer(str_input):
    # Convert list to string
    input_str = str_input
    if isinstance(input_str, list):
        input_str = ' '.join(input_str)
    
    # Remove punctuation
    str_input = str_input.translate(str.maketrans('', '', string.punctuation))
    
    # Define stop words
    stop_words = set(stopwords.words('english'))

    # Tokenize text
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    
    # Remove numbers, stop words, and words with one character
    words = [word for word in tokens if not re.match('^\d+$', word) and word not in stop_words and len(word) > 1]
    
    # Lemmatize words
    words = [Word(word).lemmatize() for word in words]
    
    return words

In [78]:
# inspect the cleaned data
df_train['cleaned_headline'] = df_train['Headline'].apply(textblob_tokenizer)

# check the data
#df_train[["cleaned_headline", "Headline"]]

Define the parameters of the vectorizer

In [79]:
# Minimum Document Frequency -- Minimum number of times a word needs to occur to be considered #
MINDF = 10

# Maximum Document Frequency -- Maximum share of documents where a word needs to occur to be considered #
MAXDF = 0.9

# Maximum number of features we would want to consider -- ranked by most frequently occuring #
MF= 800

# NGrams -- Number of Word Pairs. Takes the form (Min, Max). E.g. (1, 2) means single words and word pairs # 
NGrams = (1,2)

Define the vectorizer itself

In [80]:
# define the vectorizer
vec_bow = CountVectorizer(max_features= MF,
                          min_df = MINDF,
                          max_df = MAXDF,
                          ngram_range=(1, 2),
                          tokenizer=textblob_tokenizer)

Define the dependent variables

In [81]:
# create independent variable with BOW
X_train_bow = vec_bow.fit_transform(df_train['Headline'])

# Transform the test set using the same vectorizer
X_test_bow = vec_bow.transform(df_test['Headline'])

In [82]:
# inspect the vectorizer and the cleaned data
print(vec_bow.vocabulary_)

{'head': 312, 'line': 407, 'u': 751, 'patent': 524, 'granted': 296, 'basf': 85, 'se': 646, 'delaware': 177, 'may': 443, 'titled': 730, 'composition': 143, 'comprising': 145, 'method': 451, 'using': 768, 'head line': 313, 'line u': 408, 'u patent': 752, 'patent granted': 527, 'basf se': 86, 'societe': 677, 'generale': 282, 'launch': 398, 'security': 650, 'societe generale': 678, 'barclays': 82, 'plc': 548, 'form': 259, 'communication': 139, 'barclays plc': 83, 'plc form': 553, 'earnings': 206, 'axa': 74, 'investment': 366, 'booker': 102, 'group': 300, 'booker group': 103, 'group plc': 303, 'glencore': 289, 'transaction': 742, 'share': 660, 'glencore plc': 290, 'plc transaction': 561, 'transaction share': 743, 'trust': 748, 'corporation': 158, 'annual': 43, 'financial': 251, 'report': 612, 'plc annual': 550, 'annual financial': 44, 'financial report': 252, 'infineon': 340, 'technology': 714, 'ag': 16, 'applies': 49, 'sensor': 655, 'system': 710, 'infineon technology': 341, 'applies u': 5

## 1.1.3 Classification for each category

In [83]:
# Create a list with the different categories
categories = df_train["category"].unique()
print(categories)

['None' 'Financing' 'Production-related actions' 'Merger & \nacquisitions'
 'Corporate \ngovernance' 'Strategic alliance'
 'Expansion in existing market (product/service/geographical)'
 'Divestiture' 'Human resources'
 'New product introduction/\nservice offering' 'External venturing'
 'Marketing' 'Product/\nservice improvement'
 'New geographical market entry' 'R&D-related actions']


In [84]:
def get_classification_metrics(y_true, y_pred, category):
    
    # Calculate Model Performance Metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label = category)
    recall = recall_score(y_true, y_pred, pos_label = category)
    f1 = f1_score(y_true, y_pred, pos_label = category)
    CM = confusion_matrix(y_true, y_pred, labels= [category, 'other'])
    
    TP = np.round(CM[0][0], 3)
    FP = np.round(CM[0][1], 3)
    FN = np.round(CM[1][0], 3)
    TN = np.round(CM[1][1], 3) 
    
    return accuracy, precision, recall, f1, FN, FP, TN, TP

### A. Random forest classifier

In [85]:
# Create an empty dataframe with the categories to store the results of the rfc
results_rfc_df = pd.DataFrame(index=categories)

# Add columns for the metrics
columns = ['accuracy', 'precision', 'recall', 'f1', 'FN', 'FP', 'TN', 'TP']
for col in columns:
    results_rfc_df[col] = 0
    
# Define an empty dictionary to store the best parameters for each category
best_params_dict = {}

In [86]:
# Initialize the classifier
rfc = RandomForestClassifier()

# Define the parameter grid for the grid search
param_grid = {
    'n_estimators': [100, 500],
    'max_depth': [None],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt']
}


In [87]:
# Define the number of folds for cross-validation
n_splits = 5

# Initialize the stratified k-fold object
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) # ensures class balances are kept

In [88]:
# Set a random seed for reproducibility
random.seed(1234)

# Loop through each category
for category in tqdm_notebook(categories, desc="Evaluating categories"):

    # Create a new column in the training and test dataframes to store the modified category
    df_train['modified_category'] = df_train['category'].apply(lambda x: x if x == category else 'other')
    df_test['modified_category'] = df_test['category'].apply(lambda x: x if x == category else 'other')

    # Get the indices of the category observations and non-category observations
    cat_i = np.where(df_train['modified_category'] == category)[0]
    noncat_i = np.where(df_train['modified_category'] != category)[0]

    # Shuffle the non-category indices and select the same number as the category observations
    np.random.shuffle(noncat_i)
    noncat_i = noncat_i[:len(cat_i)]

    # Combine the category and non-category indices into one list
    indices = np.concatenate((cat_i, noncat_i))

    # Shuffle the indices to ensure randomness
    np.random.shuffle(indices)

    # Extract the features and dependent variable for the selected indices
    X_train = X_train_bow[indices]
    y_train = df_train['modified_category'][indices]

    # Perform the grid search using cross-validation
    grid_search = GridSearchCV(rfc, param_grid, cv=skf, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best model and its hyperparameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Store the best parameters for the current category in the dictionary
    best_params_dict[category] = best_params

    # Retrain the best model with the whole training set
    best_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = best_model.predict(X_test_bow)
    y_pred_proba = best_model.predict_proba(X_test_bow)

    # define the dep variable of the test set
    y_test = df_test['modified_category']

    # Calculate the classification metrics
    accuracy, precision, recall, f1, FN, FP, TN, TP = get_classification_metrics(y_test, y_pred, category)

    # Store the results in a dataframe
    results_rfc_df.loc[category] = [accuracy, precision, recall, f1, FN, FP, TN, TP]



Evaluating categories:   0%|          | 0/15 [00:00<?, ?it/s]

In [89]:
# Create a new dataframe to store the best hyperparameters for each category
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient='index')

# Rename the columns of the best_params_df
best_params_df.columns = [f'{param} ({category})' for param in best_params.keys()]

# Display the best parameters
print("\nBest hyperparameters for each category:")
display(best_params_df)


Best hyperparameters for each category:


Unnamed: 0,max_depth (R&D-related actions),max_features (R&D-related actions),min_samples_leaf (R&D-related actions),n_estimators (R&D-related actions)
,,sqrt,2,500
Financing,,sqrt,2,100
Production-related actions,,sqrt,2,100
Merger & \nacquisitions,,sqrt,2,100
Corporate \ngovernance,,sqrt,2,100
Strategic alliance,,sqrt,1,500
Expansion in existing market (product/service/geographical),,sqrt,1,500
Divestiture,,sqrt,2,500
Human resources,,sqrt,2,100
New product introduction/\nservice offering,,sqrt,1,100


In [72]:
# display the results of classification
results_rfc_df

Unnamed: 0,accuracy,precision,recall,f1,FN,FP,TN,TP
,0.922878,0.932957,0.984979,0.958262,688,146,406,9574
Financing,0.903643,0.049815,0.818182,0.093913,1030,12,9718,54
Production-related actions,0.792399,0.022648,0.981132,0.044274,2244,1,8517,52
Merger & \nacquisitions,0.942482,0.141443,0.869565,0.243309,607,15,10092,100
Corporate \ngovernance,0.924542,0.147835,0.939597,0.255474,807,9,9858,140
Strategic alliance,0.835953,0.093443,0.909548,0.169476,1756,18,8859,181
Expansion in existing market (product/service/geographical),0.735343,0.01989,0.935484,0.038952,2858,4,7894,58
Divestiture,0.89902,0.038121,0.86,0.073005,1085,7,9679,43
Human resources,0.68513,0.010759,0.925,0.02127,3402,3,7372,37
New product introduction/\nservice offering,0.891992,0.092622,0.907692,0.168091,1156,12,9528,118


#### With a lower threshold

In [91]:
# Create an empty dataframe with the categories to store the results of the rfc
results_rfc_threshold_df = pd.DataFrame(index=categories)

# Add columns for the metrics
columns = ['accuracy', 'precision', 'recall', 'f1', 'FN', 'FP', 'TN', 'TP']
for col in columns:
    results_rfc_threshold_df[col] = 0
    
# Define an empty dictionary to store the best parameters for each category
best_params_dict_threshold = {}

In [None]:
# Set a random seed for reproducibility
random.seed(1234)

# Loop through each category
for category in tqdm_notebook(categories, desc="Evaluating categories"):

    # Create a new column in the training and test dataframes to store the modified category
    df_train['modified_category'] = df_train['category'].apply(lambda x: x if x == category else 'other')
    df_test['modified_category'] = df_test['category'].apply(lambda x: x if x == category else 'other')

    # Get the indices of the category observations and non-category observations
    cat_i = np.where(df_train['modified_category'] == category)[0]
    noncat_i = np.where(df_train['modified_category'] != category)[0]

    # Shuffle the non-category indices and select the same number as the category observations
    np.random.shuffle(noncat_i)
    noncat_i = noncat_i[:len(cat_i)]

    # Combine the category and non-category indices into one list
    indices = np.concatenate((cat_i, noncat_i))

    # Shuffle the indices to ensure randomness
    np.random.shuffle(indices)

    # Extract the features and dependent variable for the selected indices
    X_train = X_train_bow[indices]
    y_train = df_train['modified_category'][indices]

    # Perform the grid search using cross-validation
    grid_search = GridSearchCV(rfc, param_grid, cv=skf, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best model and its hyperparameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Store the best parameters for the current category in the dictionary
    best_params_dict_threshold[category] = best_params

    # Retrain the best model with the whole training set
    best_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred_proba = best_model.predict_proba(X_test_bow)
    
    # Set a threshold to identify more observations of the category
    threshold = 0.4
    
    # Convert the probabilities to binary predictions using the threshold
    y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

    # Define the dep variable of the test set
    y_test = df_test['modified_category']

    # Calculate the classification metrics
    accuracy, precision, recall, f1, FN, FP, TN, TP = get_classification_metrics(y_test, y_pred, category)

    # Store the results in a dataframe
    results_rfc_threshold_df.loc[category] = [accuracy, precision, recall, f1, FN, FP, TN, TP]



Evaluating categories:   0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# Create a new dataframe to store the best hyperparameters for each category
best_params_df_dictionary = pd.DataFrame.from_dict(best_params_dict_threshold, orient='index')

# Rename the columns of the best_params_df
best_params_df_dictionary.columns = [f'{param} ({category})' for param in best_params.keys()]

# Display the best parameters
print("\nBest hyperparameters for each category:")
display(best_params_df)

In [None]:
# inspect the results
results_rfc_threshold_df

### B. Logistic regression

In [None]:
# Initialize the logistic regression classifier
logreg = LogisticRegression()

# Define the parameter grid
param_grid = {
    'penalty': ['None','l1', 'l2'], # normal, lasso or ridge
    'C': [0.1, 1, 10]               # The inverse penalization term (smaller is higher penalization)
}


In [None]:
# Set a random seed for reproducibility
random.seed(1234)

# Loop through each category
for category in tqdm_notebook(categories, desc="Evaluating categories"):

    # Create a new column in the training and test dataframes to store the modified category
    df_train['modified_category'] = df_train['category'].apply(lambda x: x if x == category else 'other')
    df_test['modified_category'] = df_test['category'].apply(lambda x: x if x == category else 'other')

    # Get the indices of the category observations and non-category observations
    cat_i = np.where(df_train['modified_category'] == category)[0]
    noncat_i = np.where(df_train['modified_category'] != category)[0]

    # Shuffle the non-category indices and select the same number as the category observations
    np.random.shuffle(noncat_i)
    noncat_i = noncat_i[:len(cat_i)]

    # Combine the category and non-category indices into one list
    indices = np.concatenate((cat_i, noncat_i))

    # Shuffle the indices to ensure randomness
    np.random.shuffle(indices)

    # Extract the features and dependent variable for the selected indices
    X_train = X_train_bow[indices]
    y_train = df_train['modified_category'][indices]

    # Perform the grid search using cross-validation
    grid_search = GridSearchCV(rfc, param_grid, cv=skf, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best model and its hyperparameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Store the best parameters for the current category in the dictionary
    best_params_dict[category] = best_params

    # Retrain the best model with the whole training set
    best_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = best_model.predict(X_test_bow)
    y_pred_proba = best_model.predict_proba(X_test_bow)

    # define the dep variable of the test set
    y_test = df_test['modified_category']

    # Calculate the classification metrics
    accuracy, precision, recall, f1, FN, FP, TN, TP = get_classification_metrics(y_test, y_pred, category)

    # Store the results in a dataframe
    results_rfc_df.loc[category] = [accuracy, precision, recall, f1, FN, FP, TN, TP]



# 1.2 TFIDTF vectorizer

In [None]:
# Define TF IDF Vectorizer #
vec_tf_idf = TfidfVectorizer(max_features= MF,
                      max_df = MAXDF,
                      ngram_range=(1, 2),
                      tokenizer=textblob_tokenizer)

In [None]:
# create independent variable with TF IDF
X_tf_idf = vec_tf_idf.fit_transform(df_train['Headline'])
print(X_tf_idf[:2])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Get the indices of the category observations and non-category observations
    cat_i = np.where(df_train['category'] == category)[0]
    noncat_i = np.where(df_train['category'] != category)[0]
    
    # Shuffle the non-category indices and select the same number as the category observations
    np.random.shuffle(noncat_i)
    noncat_i = noncat_i[:len(cat_i)]
    
    # Combine the category and non-category indices into one list
    indices = np.concatenate((cat_i, noncat_i))
    
    # Shuffle the indices to ensure randomness
    np.random.shuffle(indices)
    
    # Extract the features and dependent variable for the selected indices
    X_train = X_train_bow[indices]
    y_train = df_train['category'][indices]



In [None]:
 # Loop through each fold of the stratified k-fold object
    for train_index, test_index in tqdm_notebook(skf.split(X_train, y_train), desc = 'k-fold'):
        
        # convert the dependent variable to an array
        Y = np.asarray(y_train)
        
        # Split the data into training and validation sets using the current fold indices
        X_train_c, X_val_c = X_train[train_index], X_train[test_index]
        y_train_c, y_val_c = Y[train_index], Y[test_index]
        
        # Train a random forest classifier on the training set
        rf = RandomForestClassifier()
        rf.fit(X_train_c, y_train_c)
        
        # Evaluate the model on the validation set
        score = rf.score(X_val_c, y_val_c)
        print(f"Category {category},: Accuracy = {score}")