# Vectorization

## 1. Data loading

In [1]:
# General Packages #
import os
import pandas as pd
import numpy as np
import string
import re
from scipy.stats import randint
import random

# Load TQDM to Show Progress Bars #
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# Sklearn Packages #
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import Lasso, LassoCV, SGDClassifier, LogisticRegression, RidgeCV, RidgeClassifierCV, HuberRegressor, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import CalibratedClassifierCV

# NLTK Packages #
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob, Word

# Import necessary libraries for handling imbalanced data
from imblearn.combine import SMOTETomek
from imblearn.metrics import geometric_mean_score
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import smote
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE



In [2]:
# Turn off warnings, just to avoid pesky messages that might cause confusion here
# Remove when testing your own code #
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Change to Working Directory with Training Data # 
os.chdir("/Users/Artur/Desktop/thesis_HIR_versie5/coding")

# Load Training Data #
df_train = pd.read_csv("./data/train_adjusted.csv", header = 0)
df_test = pd.read_csv("./data/test_adjusted.csv", header = 0)

# inspect the data
df_train.head(5)

Unnamed: 0,id,Headline,category
0,194578,Head Line: US Patent granted to BASF SE (Delaw...,
1,564295,Societe Generale Launches a Next-Generation Ca...,
2,504138,BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...,
3,91379,ASML: 4Q Earnings Snapshot,
4,265750,Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...,


In [4]:
# read in the data about the different companies
companies = pd.read_excel("./data/companies.xlsx", header = 0)
companies.head(5)

Unnamed: 0,NameCompany
0,Andritz AG
1,ams AG
2,voestalpine AG
3,OMV AG
4,Wienerberger AG
5,Verbund AG
6,Telekom Austria AG
7,BUWOG Group
8,Immofinanz AG
9,Raiffeisen Bank International


# 1.1 Bag of words

## 1.1.2 Define the BOW vectorizer

Define the tokenizer that we will use for the vectorizers.

In [5]:
# define a list with the companies names to use in the tokenizer
# Do not use this information in your model
company_info = set()

for name in companies['NameCompany']:
    name = name.lower()
    words = name.split()
    company_info.update(words)

len(company_info) 

1137

Determine a set of words that does not contain any information about the company, such as legal prefixes. Therefore, we inspect which words are most frequently used in the names of the companies. When we inspected the results, we saw 1021 of the 1113 words were only used once in all the company names. These words can be seen as to company specific and will be removed from our headlines.

In [6]:
# Define a list with only the companies names
def company_name_tokenizer(name):
    # Remove special characters and digits
    name = re.sub(r"[^\w\s]", "", name)
    
    # Convert the name to lowercase
    name = name.lower()
    
    # Split the name into individual words
    words = name.split()
    
    return words

companies['cleaned_name'] = companies['NameCompany'].apply(company_name_tokenizer)
companies.head(5)

Unnamed: 0,NameCompany,cleaned_name
0,Andritz AG,"[andritz, ag]"
1,ams AG,"[ams, ag]"
2,voestalpine AG,"[voestalpine, ag]"
3,OMV AG,"[omv, ag]"
4,Wienerberger AG,"[wienerberger, ag]"


In [7]:
# Assuming 'column' is the column name containing the arrays
company_names_array = np.concatenate(companies['cleaned_name'].values)

# Perform value counts on the resulting array
frequent_company_info = np.unique(company_names_array, return_counts=True)

# Create a dataframe from the value counts
df_value_counts = pd.DataFrame({'Word': frequent_company_info[0], 
                                'Count': frequent_company_info[1]})

# Sort the dataframe in descending order by the 'Count' column
df_value_counts = df_value_counts.sort_values('Count', ascending=False)
#df_value_counts['Count'].value_counts()

Create a set of these words

In [8]:
# Filter the dataframe based on the count threshold
general_voc = df_value_counts[df_value_counts['Count'] >= 2]['Word'].tolist()

# Create a set from the filtered values
general_voc = set(general_voc)

In [9]:
# remove these words from the set company info
company_info = company_info.difference(general_voc)

In [10]:
def textblob_tokenizer(str_input):
    # Convert list to string
    input_str = str_input
    if isinstance(input_str, list):
        input_str = ' '.join(input_str)
        
    # Remove currency symbols
    str_input = re.sub(r'\$|£|€', '', str_input)
    
    # Remove punctuation
    str_input = str_input.translate(str.maketrans('', '', string.punctuation))
    
    # Define stop words
    stop_words = set(stopwords.words('english'))

    # Tokenize text
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    
    # Remove numbers, stop words, company information and words with one character
    words = [word for word in tokens if not re.match('^\d+$', word) and word not in stop_words
                                        and word not in company_info and len(word) > 1]

    
    # Lemmatize words
    words = [Word(word).lemmatize() for word in words]
    
    return words

In [11]:
# inspect the cleaned data
df_train['cleaned_headline'] = df_train['Headline'].apply(textblob_tokenizer)

# check the data
#df_train[["cleaned_headline", "Headline"]]

Define the parameters of the vectorizer

In [12]:
# Minimum Document Frequency -- Minimum number of times a word needs to occur to be considered #
MINDF = 10

# Maximum Document Frequency -- Maximum share of documents where a word needs to occur to be considered #
MAXDF = 0.9

# Maximum number of features we would want to consider -- ranked by most frequently occuring #
MF= 800

# NGrams -- Number of Word Pairs. Takes the form (Min, Max). E.g. (1, 2) means single words and word pairs # 
NGrams = (1,2)

Define the vectorizer itself

In [13]:
# define the vectorizer
vec_bow = CountVectorizer(max_features= MF,
                          min_df = MINDF,
                          max_df = MAXDF,
                          ngram_range=(1, 2),
                          tokenizer=textblob_tokenizer)

Define the dependent variables

In [14]:
# inspect the vectorizer and the cleaned data
#print(vec_bow.vocabulary_)

## 1.1.3 Classification for each category

In [15]:
# Create a list with the different categories
categories = df_train["category"].unique()
print(categories)

['None' 'Financing' 'Production-related actions' 'Merger & \nacquisitions'
 'Corporate \ngovernance' 'Strategic alliance'
 'Expansion in existing market (product/service/geographical)'
 'Divestiture' 'Human resources'
 'New product introduction/\nservice offering' 'External venturing'
 'Marketing' 'Product/\nservice improvement'
 'New geographical market entry' 'R&D-related actions']


In [16]:
# Define the prior probabilities for each category

# Calculate the prior probabilities
prior_probabilities_df = df_train['category'].value_counts() / len(df_train)

# Convert prior probabilities to a dictionary
prior_probabilities_dict = prior_probabilities_df.to_dict()


In [17]:
def create_results_df():
    # Create an empty dataframe with the categories to store the results of the rfc
    results_df = pd.DataFrame(index=categories)

    # Add columns for the metrics
    columns = ['accuracy', 'precision', 'recall', 'f1', 'auc', 'FN', 'TN', 'FP', 'TP']
    for col in columns:
        results_df[col] = 0

    return results_df

In [18]:
def get_classification_metrics(y_true, y_pred, y_pred_proba, category):
    
    # Calculate Model Performance Metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label = category)
    recall = recall_score(y_true, y_pred, pos_label = category)
    f1 = f1_score(y_true, y_pred, pos_label = category)
    auc = roc_auc_score(y_true, y_pred_proba)
    
    CM = confusion_matrix(y_true, y_pred, labels= ['other', category])
    TN, FP, FN, TP = CM.ravel()
    
    return accuracy, precision, recall, f1, auc, TN, FP, FN, TP

In [19]:
# Define the number of folds for cross-validation
n_splits = 5

# Initialize the stratified k-fold object
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) # ensures class balances are kept


In [20]:
# Create an empty dataframe with the categories to store the results of the logistic regression
results_log_df = create_results_df()

In [21]:
# Initialize the logistic regression classifier
logreg = LogisticRegression()

# Define the parameter grid
param_grid = {
    'penalty': ['None','l1', 'l2'], # normal, lasso or ridge
    'C': [0.1, 1, 10]               # The inverse penalization term (smaller is higher penalization)
}


In [22]:
from sklearn.metrics import make_scorer, f1_score

# Empty the dictionary with optimal parameters for each category
best_params_dict = {}
    
# Initialize a list to store the final predictions
final_predictions = []

# Create an empty dataframe with columns for each category
proba_df = pd.DataFrame(columns=categories)

    
for category in tqdm_notebook(categories, desc="Evaluating categories"):
    
    # Set a random seed for reproducibility
    random.seed(123)
        
    # Create a new column in the training and test dataframes to store the modified category
    df_train['modified_category'] = df_train['category'].apply(lambda x: x if x == category else 'other')
    df_test['modified_category'] = df_test['category'].apply(lambda x: x if x == category else 'other')

    # Get the indices of the category observations and non-category observations
    cat_i = np.where(df_train['modified_category'] == category)[0]
    noncat_i = np.where(df_train['modified_category'] != category)[0]
    
    # Shuffle the non-category indices and select the same number as the category observations
    """
    if category == 'None':
        np.random.shuffle(cat_i)         # randomly shuffle category none
        cat_i = cat_i[:len(noncat_i)] # get the same nr of random observations as the others
    else:
        np.random.shuffle(noncat_i)
        noncat_i = noncat_i[:len(cat_i)]
    """
    
    # if the same for None category
    np.random.shuffle(noncat_i)
    rus_n = int(len(cat_i))
    noncat_i = noncat_i[:rus_n]
    
    
    # Combine the category and non-category indices into one list
    indices = np.concatenate((cat_i, noncat_i))

    # Shuffle the indices to ensure randomness
    np.random.shuffle(indices)
    
    # Extract the features and dependent variable for the selected indices
    X_train = df_train['Headline'][indices]
    y_train = df_train['modified_category'][indices]
    
    # create independent variable with BOW
    X_train_vec = vec_bow.fit_transform(X_train)
    #print(f'{category}: {X_train_vec.shape}')

    # Transform the test set using the same vectorizer
    X_test_vec = vec_bow.transform(df_test['Headline'])
    
    # create a training to train the model and on to train the calibration classifier
    #X_train_model, X_train_cal, y_train_model, y_train_cal = train_test_split(df_train, df_train['category'], test_size= 0.2, stratify = df_train['category'], random_state = 123)
    # 
    
    # Define the scoring metric
    scoring = make_scorer(f1_score, average= 'macro')

    # Perform the grid search using cross-validation
    grid_search = GridSearchCV(logreg, param_grid, cv=skf, scoring= scoring)
    grid_search.fit(X_train_vec, y_train)

    # Get the best model and its hyperparameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Store the best parameters for the current category in the dictionary
    best_params_dict[category] = best_params

    # Retrain the best model with the whole training set
    best_model.fit(X_train_vec, y_train)
        
    # Calibrate the model using Platt scaling
    #calibrated_model = CalibratedClassifierCV(best_model, cv= 3, method='sigmoid')
        
    # Fit the calibrated model
    #calibrated_model.fit(X_train_bow, df_train['modified_category'])
    #calibrated_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = best_model.predict(X_test_vec)
    y_pred_proba = best_model.predict_proba(X_test_vec)
    #y_pred = calibrated_model.predict(X_test_vec)
    #y_pred_proba = calibrated_model.predict_proba(X_test_vec)

    # Store the predicted probabilities for the current category
    cat_proba = y_pred_proba[:, 1]
    #print(f'Category: {category}: {y_pred_proba}')
    #proba_df.loc[category, 'predicted_proba'] = cat_proba
    
    # Add the probabilities as a column to the dataframe
    for j in range(len(cat_proba)):
        proba_df.loc[j, category] = cat_proba[j]
        
    # define the dep variable of the test set
    y_test = df_test['modified_category']
    
    # Calculate the classification metrics
    accuracy, precision, recall, f1, auc, TN, FP, FN, TP = get_classification_metrics(y_test, y_pred, cat_proba, category)

    # Store the results in a dataframe
    results_log_df.loc[category] = [accuracy, precision, recall, f1, auc, FN, TN, FP, TP]
    

    

Evaluating categories:   0%|          | 0/15 [00:00<?, ?it/s]

In [23]:
# met precision als metric
results_log_df

Unnamed: 0,accuracy,precision,recall,f1,auc,FN,TN,FP,TP
,0.926947,0.943396,0.977366,0.960081,0.947804,220,524,570,9500
Financing,0.948308,0.092562,0.848485,0.166915,0.948237,10,10199,549,56
Production-related actions,0.937581,0.061972,0.830189,0.115334,0.952931,9,10095,666,44
Merger & \nacquisitions,0.954966,0.172535,0.852174,0.286969,0.96541,17,10229,470,98
Corporate \ngovernance,0.954041,0.224684,0.95302,0.363636,0.986721,7,10175,490,142
Strategic alliance,0.913538,0.157356,0.849246,0.265515,0.961665,30,9710,905,169
Expansion in existing market (product/service/geographical),0.655909,0.015107,0.919355,0.029726,0.891282,5,7036,3716,57
Divestiture,0.957463,0.081633,0.8,0.148148,0.944151,10,10314,450,40
Human resources,0.956908,0.044872,0.525,0.082677,0.895609,19,10327,447,21
New product introduction/\nservice offering,0.929536,0.135104,0.9,0.23494,0.972504,13,9935,749,117


In [24]:
# precision as metric
proba_df

Unnamed: 0,None,Financing,Production-related actions,Merger & \nacquisitions,Corporate \ngovernance,Strategic alliance,Expansion in existing market (product/service/geographical),Divestiture,Human resources,New product introduction/\nservice offering,External venturing,Marketing,Product/\nservice improvement,New geographical market entry,R&D-related actions
0,0.007527,0.838715,0.520561,0.141265,0.895209,0.705289,0.551509,0.782902,0.465889,0.919948,0.785517,0.846584,0.574004,0.515255,0.611305
1,0.000081,0.980899,0.99561,0.996054,0.998535,0.996482,0.956249,0.999355,0.901563,0.997754,0.994763,0.996572,0.94439,0.813921,0.902922
2,0.000023,0.969062,0.961693,0.974006,0.996641,0.957173,0.886668,0.995808,0.826499,0.982801,0.993548,0.981523,0.86565,0.723175,0.830439
3,0.000289,0.785079,0.578781,0.692923,0.90507,0.471874,0.410404,0.775043,0.531743,0.257556,0.834339,0.440935,0.389497,0.390609,0.455401
4,0.000206,0.81545,0.736584,0.838023,0.507327,0.935119,0.570805,0.823792,0.564356,0.897649,0.922844,0.60636,0.598257,0.515255,0.631191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10809,0.000986,0.980899,0.99561,0.996054,0.998535,0.99393,0.956249,0.999355,0.901563,0.996711,0.994763,0.996572,0.94439,0.813921,0.902922
10810,0.000039,0.785079,0.926938,0.937819,0.998166,0.937258,0.569825,0.534713,0.503365,0.976878,0.834339,0.440935,0.444777,0.476437,0.483994
10811,0.000002,0.964013,0.975288,0.976916,0.999896,0.9879,0.873114,0.993994,0.768882,0.977274,0.993617,0.964747,0.83224,0.628846,0.730144
10812,0.000069,0.969283,0.969485,0.980339,0.997048,0.96774,0.844689,0.996929,0.806792,0.988647,0.976395,0.964538,0.829759,0.691023,0.781968


In [25]:
# calculate class frequencies to normalize
class_frequencies = df_train['category'].value_counts()/len(df_train['category'])

In [26]:
# Create a new dataframe to store the scaled probabilities
scaled_proba_df = pd.DataFrame(columns=proba_df.columns)

# Iterate over the columns of proba_df
for column in proba_df.columns:
    # Multiply the column with the corresponding class frequency
    scaled_proba_df[column] = proba_df[column] * class_frequencies[column]

scaled_proba_df


Unnamed: 0,None,Financing,Production-related actions,Merger & \nacquisitions,Corporate \ngovernance,Strategic alliance,Expansion in existing market (product/service/geographical),Divestiture,Human resources,New product introduction/\nservice offering,External venturing,Marketing,Product/\nservice improvement,New geographical market entry,R&D-related actions
0,0.006765,0.00508,0.002551,0.001502,0.012356,0.012996,0.003175,0.003602,0.001734,0.011017,0.00454,0.004012,0.003371,0.000905,0.00195
1,0.000073,0.005942,0.00488,0.010593,0.013782,0.018361,0.005505,0.004598,0.003356,0.011949,0.00575,0.004723,0.005546,0.00143,0.002881
2,0.000021,0.00587,0.004714,0.010358,0.013756,0.017637,0.005104,0.004581,0.003076,0.01177,0.005743,0.004652,0.005083,0.001271,0.002649
3,0.00026,0.004755,0.002837,0.007369,0.012492,0.008695,0.002363,0.003566,0.001979,0.003084,0.004822,0.00209,0.002287,0.000686,0.001453
4,0.000185,0.004939,0.00361,0.008912,0.007002,0.017231,0.003286,0.00379,0.002101,0.01075,0.005334,0.002874,0.003513,0.000905,0.002014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10809,0.000887,0.005942,0.00488,0.010593,0.013782,0.018314,0.005505,0.004598,0.003356,0.011936,0.00575,0.004723,0.005546,0.00143,0.002881
10810,0.000035,0.004755,0.004543,0.009974,0.013777,0.01727,0.00328,0.00246,0.001874,0.011699,0.004822,0.00209,0.002612,0.000837,0.001544
10811,0.000002,0.005839,0.00478,0.010389,0.013801,0.018203,0.005026,0.004573,0.002862,0.011704,0.005743,0.004572,0.004887,0.001105,0.002329
10812,0.000062,0.005871,0.004752,0.010426,0.013761,0.017832,0.004863,0.004587,0.003003,0.01184,0.005643,0.004571,0.004873,0.001214,0.002495


In [35]:
# Convert the dtype of the dataframe to float
scaled_proba_df = scaled_proba_df.astype(float)

# Extract the highest value in each row
highest_values = scaled_proba_df.max(axis=1)

# Find the column name with the highest value in each row
category_predictions = scaled_proba_df.idxmax(axis=1)

# Create the final_predictions DataFrame
final_predictions = pd.DataFrame({'category': category_predictions, 'probability': highest_values})


In [36]:
y_true = df_test['category']
type(y_true)

pandas.core.series.Series

In [38]:
# calculate the accuracy of the model
accuracy = accuracy_score(y_true, final_predictions['category'])
accuracy

0.2341409284261143

In [41]:
# calculate the f1 score of the model
f1 = f1_score(y_true, final_predictions['category'], average = 'macro')
f1

0.02531303891429857

In [42]:
# inspect the final predictions
final_predictions['category'].value_counts()

Strategic alliance                             6841
None                                           3617
Corporate \ngovernance                          354
New product introduction/\nservice offering       2
Name: category, dtype: int64

#### Make the same model and calibrate the probabilities with Platt scaling

In [28]:
# split the dataset into a training set for the model and the calibration
train_model, train_cal = train_test_split(df_train, test_size = 0.25, stratify=df_train['category'],
                                          random_state = 7, shuffle = True)
# reset the index to extract indices in code
train_model = train_model.reset_index(drop=True)

In [30]:
from sklearn.metrics import make_scorer, f1_score

# Empty the dictionary with optimal parameters for each category
best_params_dict = {}
    
# Initialize a list to store the final predictions
final_predictions = []

# Create an empty dataframe with columns for each category
proba_df = pd.DataFrame(columns=categories)

    
for category in tqdm_notebook(categories, desc="Evaluating categories"):
    
    # Set a random seed for reproducibility
    random.seed(123)
        
    # Create a new column in the training and test dataframes to store the modified category
    train_model['modified_category'] = train_model['category'].apply(lambda x: x if x == category else 'other')
    df_test['modified_category'] = df_test['category'].apply(lambda x: x if x == category else 'other')
    train_cal['modified_category'] = train_cal['category'].apply(lambda x: x if x == category else 'other')

    
    # Get the indices of the category observations and non-category observations
    cat_i = np.where(train_model['modified_category'] == category)[0]
    noncat_i = np.where(train_model['modified_category'] != category)[0]
    
    # Shuffle the non-category indices and select the same number as the category observations
    """
    if category == 'None':
        np.random.shuffle(cat_i)         # randomly shuffle category none
        cat_i = cat_i[:len(noncat_i)] # get the same nr of random observations as the others
    else:
        np.random.shuffle(noncat_i)
        noncat_i = noncat_i[:len(cat_i)]
    """
    
    # if the same for None category
    np.random.shuffle(noncat_i)
    noncat_i = noncat_i[:len(cat_i)]
    
    # Combine the category and non-category indices into one list
    indices = np.concatenate((cat_i, noncat_i))

    # Shuffle the indices to ensure randomness
    np.random.shuffle(indices)
    
    # Extract the features and dependent variable for the selected indices
    X_train = train_model['Headline'][indices]
    y_train = train_model['modified_category'][indices]
    
    # create independent variable with BOW and transform the test set with the same vectorizer
    X_train_vec = vec_bow.fit_transform(X_train)
    X_test_vec = vec_bow.transform(df_test['Headline'])
    
    # Do the same for the calibration model
    X_train_cal = train_cal['Headline']
    X_train_cal_vec = vec_bow.transform(X_train_cal)
    
    # Define the scoring metric
    #scoring = make_scorer(f1_score, average= 'macro')

    # Perform the grid search using cross-validation
    grid_search = GridSearchCV(logreg, param_grid, cv=skf, scoring= 'accuracy')
    grid_search.fit(X_train_vec, y_train)

    # Get the best model and its hyperparameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Store the best parameters for the current category in the dictionary
    best_params_dict[category] = best_params

    # Retrain the best model with the whole training set
    best_model.fit(X_train_vec, y_train)
        
    # Calibrate the model using Platt scaling
    calibrated_model = CalibratedClassifierCV(best_model, cv= 3, method='sigmoid')
        
    # Fit the calibrated model
    calibrated_model.fit(X_train_cal_vec, train_cal['modified_category'])
    #calibrated_model.fit(X_train, y_train)

    # Make predictions on the test set
    #y_pred = best_model.predict(X_test_vec)
    #y_pred_proba = best_model.predict_proba(X_test_vec)
    y_pred = calibrated_model.predict(X_test_vec)
    y_pred_proba = calibrated_model.predict_proba(X_test_vec)

    # Store the predicted probabilities for the current category
    cat_proba = y_pred_proba[:, 1]
    #print(f'Category: {category}: {y_pred_proba}')
    #proba_df.loc[category, 'predicted_proba'] = cat_proba
    
    # Add the probabilities as a column to the dataframe
    for j in range(len(cat_proba)):
        proba_df.loc[j, category] = cat_proba[j]
        
    # define the dep variable of the test set
    y_test = df_test['modified_category']
    
    # Calculate the classification metrics
    accuracy, precision, recall, f1, auc, TN, FP, FN, TP = get_classification_metrics(y_test, y_pred, cat_proba, category)

    # Store the results in a dataframe
    results_log_df.loc[category] = [accuracy, precision, recall, f1, auc, FN, TN, FP, TP]
    

    

Evaluating categories:   0%|          | 0/15 [00:00<?, ?it/s]

In [31]:
# probabilities after calibration
proba_df

Unnamed: 0,None,Financing,Production-related actions,Merger & \nacquisitions,Corporate \ngovernance,Strategic alliance,Expansion in existing market (product/service/geographical),Divestiture,Human resources,New product introduction/\nservice offering,External venturing,Marketing,Product/\nservice improvement,New geographical market entry,R&D-related actions
0,0.052131,0.992427,0.995764,0.873953,0.98703,0.98644,0.997206,0.997632,0.996234,0.995293,0.987353,0.99865,0.996847,0.999139,0.99858
1,0.00559,0.99879,0.999941,0.999494,0.99941,0.999871,0.999872,0.999841,0.999953,0.999788,0.999432,0.999935,0.99986,0.999899,0.999972
2,0.00814,0.999144,0.99971,0.998688,0.997538,0.998991,0.999623,0.999681,0.999871,0.999323,0.999513,0.999843,0.999633,0.999857,0.999905
3,0.054388,0.995522,0.995707,0.995028,0.99063,0.983714,0.991596,0.996627,0.995642,0.981226,0.996521,0.993134,0.991898,0.990637,0.995189
4,0.006516,0.996525,0.995764,0.984774,0.986209,0.995085,0.997167,0.997632,0.998192,0.996568,0.999842,0.997534,0.997236,0.999139,0.997571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10809,0.014506,0.99879,0.999941,0.999494,0.999317,0.999871,0.999872,0.999841,0.999953,0.999788,0.999432,0.999935,0.99986,0.999899,0.999972
10810,0.00145,0.995522,0.995707,0.995044,0.99986,0.998918,0.993752,0.996361,0.995642,0.992045,0.996521,0.993134,0.991898,0.997702,0.995189
10811,0.000075,0.999201,0.99987,0.999448,0.99983,0.999731,0.999405,0.999403,0.999468,0.998913,0.999471,0.999544,0.99938,0.997702,0.999549
10812,0.009067,0.998853,0.999714,0.999191,0.999066,0.998895,0.999363,0.999545,0.999785,0.999063,0.997743,0.999614,0.999296,0.99973,0.999824


### A. Random forest classifier

In [35]:
# Create an empty dataframe with the categories to store the results of the rfc
results_rfc_df = create_results_df()

In [36]:
# Initialize the classifier
rfc = RandomForestClassifier(random_state = 1234)

# Define the parameter grid for the grid search
param_grid = {
    'n_estimators': [100, 500],
    'max_depth': [None],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt']
}


In [37]:

#
best_params_dict, best_params, final_predictions = evaluate_categories(rfc, results_rfc_df)

Evaluating categories:   0%|          | 0/15 [00:00<?, ?it/s]

ValueError: Found input variables with inconsistent numbers of samples: [10814, 162210]

In [38]:
print(final_predictions)

NameError: name 'final_predictions' is not defined

In [67]:
# Create a new dataframe to store the best hyperparameters for each category
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient='index')

# Rename the columns of the best_params_df
best_params_df.columns = [f'{param}' for param in best_params.keys()]

# Display the best parameters
print("\nBest hyperparameters for each category:")
display(best_params_df)


Best hyperparameters for each category:


Unnamed: 0,C,penalty
,1.0,l2
Financing,10.0,l2
Production-related actions,1.0,l2
Merger & \nacquisitions,1.0,l2
Corporate \ngovernance,1.0,l2
Strategic alliance,1.0,l2
Expansion in existing market (product/service/geographical),1.0,l2
Divestiture,1.0,l2
Human resources,0.1,l2
New product introduction/\nservice offering,1.0,l2


In [68]:
# display the results of classification
results_rfc_df

Unnamed: 0,accuracy,precision,recall,f1,FN,FP,TN,TP
,0.921768,0.932119,0.984671,0.957675,697,149,397,9571
Financing,0.910856,0.054563,0.833333,0.102421,953,11,9795,55
Production-related actions,0.831977,0.027331,0.962264,0.053153,1815,2,8946,51
Merger & \nacquisitions,0.927594,0.116092,0.878261,0.205076,769,14,9930,101
Corporate \ngovernance,0.925652,0.149733,0.939597,0.258303,795,9,9870,140
Strategic alliance,0.843259,0.099143,0.929648,0.179177,1681,14,8934,185
Expansion in existing market (product/service/geographical),0.706029,0.018238,0.951613,0.03579,3176,3,7576,59
Divestiture,0.796468,0.020508,0.92,0.040122,2197,4,8567,46
Human resources,0.618088,0.009357,0.975,0.018536,4129,1,6645,39
New product introduction/\nservice offering,0.863233,0.077646,0.953846,0.143602,1473,6,9211,124


In [None]:
# add the model we used to the results
results_rfc_df['Model'] = 'BOW + rf'

In [26]:
# write away the results
results_rfc_df.to_excel('./Output/Model performance/BOW_rf.xlsx', index = False, header = True)
best_params_df.to_excel('./Output/parameters/BOW_rf.xlsx', index = False, header = True)

### B. Logistic regression

In [104]:
# Create an empty dataframe with the categories to store the results of the logistic regression
results_log_df = create_results_df()

In [105]:
# Initialize the logistic regression classifier
logreg = LogisticRegression()

# Define the parameter grid
param_grid = {
    'penalty': ['None','l1', 'l2'], # normal, lasso or ridge
    'C': [0.1, 1, 10]               # The inverse penalization term (smaller is higher penalization)
}


In [65]:
# Set a random seed for reproducibility
random.seed(123)

# 
best_params_dict, best_params, y_pred_proba = evaluate_categories(logreg, results_log_df)

Evaluating categories:   0%|          | 0/15 [00:00<?, ?it/s]

In [66]:
y_pred_proba

array([[0.41902674, 0.58097326],
       [0.1114994 , 0.8885006 ],
       [0.20779439, 0.79220561],
       ...,
       [0.27172306, 0.72827694],
       [0.22462194, 0.77537806],
       [0.48671195, 0.51328805]])

In [30]:
# Create a new dataframe to store the best hyperparameters for each category
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient='index')

# Rename the columns of the best_params_df
best_params_df.columns = [f'{param}' for param in best_params.keys()]

# Display the best parameters
print("\nBest hyperparameters for each category:")
display(best_params_df)


Best hyperparameters for each category:


Unnamed: 0,C,penalty
,1.0,l2
Financing,1.0,l2
Production-related actions,1.0,l2
Merger & \nacquisitions,1.0,l2
Corporate \ngovernance,1.0,l2
Strategic alliance,1.0,l2
Expansion in existing market (product/service/geographical),1.0,l2
Divestiture,0.1,l2
Human resources,1.0,l2
New product introduction/\nservice offering,0.1,l2


In [31]:
# display the results of classification
results_log_df

Unnamed: 0,accuracy,precision,recall,f1,FN,FP,TN,TP
,0.921491,0.938507,0.976646,0.957197,622,227,472,9493
Financing,0.932402,0.070968,0.833333,0.130797,720,11,10028,55
Production-related actions,0.84705,0.030499,0.981132,0.059158,1653,1,9108,52
Merger & \nacquisitions,0.926114,0.113995,0.878261,0.201798,785,14,9914,101
Corporate \ngovernance,0.944239,0.190736,0.939597,0.317101,594,9,10071,140
Strategic alliance,0.859441,0.103303,0.864322,0.184549,1493,27,9122,172
Expansion in existing market (product/service/geographical),0.746162,0.020379,0.919355,0.039874,2740,5,8012,57
Divestiture,0.924635,0.04947,0.84,0.093437,807,8,9957,42
Human resources,0.72739,0.012408,0.925,0.024487,2945,3,7829,37
New product introduction/\nservice offering,0.927224,0.131313,0.9,0.229187,774,13,9910,117


In [32]:
# add the model we used to the results
results_log_df['Model'] = 'BOW + log'

# write away the results
results_log_df.to_excel('./Output/Model performance/BOW_log.xlsx', index = False, header = True)
best_params_df.to_excel('./Output/parameters/BOW_log.xlsx', index = False, header = True)

### C. Support Vector Machine

In [33]:
# Create an empty dataframe with the categories to store the results of the svm
results_svm_df = create_results_df()

In [34]:
# Initialize the SVM classifier
svm = SVC(random_state = 123)

# Create a parameter grid for the SVM
param_grid = {
    'C': [0.1, 1, 10, 100], # inverse regularization parameter
    'kernel': ['linear', 'poly', 'rbf'], # what type of kernel need to be used
}


In [None]:
# Set a random seed for reproducibility
random.seed(123)

# 
best_params_dict, best_params = evaluate_categories(svm, results_svm_df)

Evaluating categories:   0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
# Create a new dataframe to store the best hyperparameters for each category
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient='index')

# Rename the columns of the best_params_df
best_params_df.columns = [f'{param}' for param in best_params.keys()]

# Display the best parameters
print("\nBest hyperparameters for each category:")
display(best_params_df)

In [None]:
# display the results of classification
results_svm_df

In [None]:
# add the model we used to the results
results_svm_df['Model'] = 'BOW + svm'

# write away the results
results_svm_df.to_excel('./Output/Model performance/BOW_svm.xlsx', index = False, header = True)
best_params_df.to_excel('./Output/parameters/BOW_svm.xlsx', index = False, header = True)

### D. Adaboost classifier

In [None]:
# Create an empty dataframe with the categories to store the results of the ada
results_ada_df = create_results_df()

In [None]:
# Initialize decision tree base estimator for AdaBoost
base_estimator = DecisionTreeClassifier()

# Initialize AdaBoost classifier
ada = AdaBoostClassifier(base_estimator=base_estimator, random_state = 123)

# Define parameter grid for AdaBoost
param_grid = {
    'n_estimators': [50, 100, 250],   # the maximum number of estimators before boosting is terminated
    'learning_rate': [0.1, 0.5, 1.0], # weight applied to each classifier at boosting iteration
                                      # A higher learning rate increases the contribution of each classifier. 
}

In [None]:
# Set a random seed for reproducibility
random.seed(123)

# 
best_params_dict, best_params = evaluate_categories(ada, results_ada_df)

In [None]:
# Create a new dataframe to store the best hyperparameters for each category
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient='index')

# Rename the columns of the best_params_df
best_params_df.columns = [f'{param}' for param in best_params.keys()]

# Display the best parameters
print("\nBest hyperparameters for each category:")
display(best_params_df)

In [None]:
# display the results of classification
results_ada_df

In [None]:
# add the model we used to the results
results_ada_df['Model'] = 'BOW + ada'

# write away the results
results_ada_df.to_excel('./Output/Model performance/BOW_ada.xlsx', index = False, header = True)
best_params_df.to_excel('./Output/parameters/BOW_ada.xlsx', index = False, header = True)

### E. Gradient boosting classifier

In [None]:
# Create an empty dataframe with the categories to store the results of the svm
results_gbc_df = create_results_df()

In [None]:
# Initialize the classifier
gbc = GradientBoostingClassifier(random_state = 123)

param_grid = {
    'n_estimators': [100, 250], # the number of boosting stages to perform
    'learning_rate': [0.1, 0.5, 1], # the contribution of each tree
    'max_depth': [1, 3, 5], # The maximum depth of individual estimators
    'subsample': [0.3, 0.6, 1],    # The fraction of samples used of fitting, if smaller than one it becomes stochastic gradient boosting
    'max_features': ['sqrt']
}

In [None]:
# Set a random seed for reproducibility
random.seed(123)

# 
best_params_dict, best_params = evaluate_categories(gbc, results_gbc_df)

In [None]:
# Create a new dataframe to store the best hyperparameters for each category
best_params_df = pd.DataFrame.from_dict(best_params_dict, orient='index')

# Rename the columns of the best_params_df
best_params_df.columns = [f'{param}' for param in best_params.keys()]

# Display the best parameters
print("\nBest hyperparameters for each category:")
display(best_params_df)

In [None]:
# display the results of classification
results_gbc_df

In [None]:
# add the model we used to the results
results_gbc_df['Model'] = 'BOW + gbc'

# write away the results
results_gbc_df.to_excel('./Output/Model performance/BOW_gbc.xlsx', index = False, header = True)
best_params_df.to_excel('./Output/parameters/BOW_gbc.xlsx', index = False, header = True)

# 1.2 TFIDTF vectorizer

In [None]:
# Define TF IDF Vectorizer #
vec_tf_idf = TfidfVectorizer(max_features= MF,
                      max_df = MAXDF,
                      ngram_range=(1, 2),
                      tokenizer=textblob_tokenizer)

In [None]:
# create independent variable with TF IDF
X_tf_idf = vec_tf_idf.fit_transform(df_train['Headline'])
print(X_tf_idf[:2])