# B. Vectorization

# 0. Data loading

In [1]:
# General Packages #
import os
import pandas as pd
import numpy as np
import string
import re
from scipy.stats import randint
import random
import json
from collections import Counter

# Load TQDM to Show Progress Bars #
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# Sklearn Packages #
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# NLTK Packages #
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from nltk.tokenize import word_tokenize

# Import necessary libraries for handling imbalanced data
from imblearn.metrics import geometric_mean_score
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler



In [2]:
# Turn off warnings, just to avoid pesky messages that might cause confusion here
# Remove when testing your own code #
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Change to Working Directory with Training Data # 
os.chdir("/Users/Artur/Desktop/thesis_HIR_versie5/coding")
#os.chdir("/Users/juarel/Desktop/studies artur/thesis_HIR/coding")


# Load Training Data #
df_train = pd.read_csv("./data/silver_data/train.csv", header = 0)
df_test = pd.read_csv("./data/silver_data/test.csv", header = 0)

# inspect the data
df_train.head(5)

Unnamed: 0,id,Headline,category
0,194578,Head Line: US Patent granted to BASF SE (Delaw...,
1,564295,Societe Generale Launches a Next-Generation Ca...,
2,504138,BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...,
3,91379,ASML: 4Q Earnings Snapshot,
4,265750,Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...,


In [4]:
# read in the data about the different companies
companies = pd.read_excel("./data/companies.xlsx", header = 0)
companies.head(5)

Unnamed: 0,NameCompany
0,Andritz AG
1,ams AG
2,voestalpine AG
3,OMV AG
4,Wienerberger AG


# 1. Data cleaning

## 1.1 Define custom tokenizer

The first step when building our model is to clean the data. To perform this step, we need to define a custom tokenizer that will serve as input in our vectorizers. This tokenizer needs to fulfill the following criteria:

1. We convert the strings to lowercase
2. We remove currency symbols
3. We remove punctuation from the text
4. We remove English stopwords from the text as these do not provide any information to our model
5. We remove numbers from the text. Note that numbers in combination with text are kept as they can provide information. For example, 4q gives an indication about an action in the fourth quarter.
6. We remove words that with information linked to a specific company
7. We remove words that consist out of a single character
8. We lemmatize the words to reduce the words to their base form

#### Create a list with words linked to a specific company

First, we create a set of all the unique words that could be linked to a company. These are the full company names (including legal suffixes) of which a part were used to retrieve the newswires and press releases from the NexisUni database. 

In [5]:
# define a list with the companies names to use in the tokenizer
# Do not use this information in your model
company_info = set()

for name in companies['NameCompany']:
    name = name.lower()
    words = name.split()
    company_info.update(words)

len(company_info) 

1137

Then, we want to determine the set of words that does not contain any information about the company, such as legal prefixes. Therefore, we inspect which words are most frequently used in the names of the companies. When we inspected the results, we saw 1021 of the 1113 words were only used once over all the company names. These words can be seen as too company specific and will be removed from our headlines.

Determine a set of words that does not contain any information about the company:

1. First, I convert the company names into lowercase and split the words. These results are stored in the column 'cleaned_name'. 
2. Then, I retrieve all these words and store them in the array 'company_names_array'. Note that these are not unique words, but just all the cleaned words from each company joined into one array.
3. Next, I store the frequency of each word in a dataframe. 
4. Finally, I can determine which words are to company specific to be included in the data.

In [6]:
# Define a function to clean the company names
def company_name_tokenizer(name):
    # Remove special characters and digits
    name = re.sub(r"[^\w\s]", "", name)
    
    # Convert the name to lowercase
    name = name.lower()
    
    # Split the name into individual words
    words = name.split()
    
    return words

In [7]:
# Create a variable with the cleaned company names
companies['cleaned_name'] = companies['NameCompany'].apply(company_name_tokenizer)
companies.head(5)

Unnamed: 0,NameCompany,cleaned_name
0,Andritz AG,"[andritz, ag]"
1,ams AG,"[ams, ag]"
2,voestalpine AG,"[voestalpine, ag]"
3,OMV AG,"[omv, ag]"
4,Wienerberger AG,"[wienerberger, ag]"


In [8]:
# Concatenate all the values from the cleaned company names and store them in an array
company_names_array = np.concatenate(companies['cleaned_name'].values)

# Count the frequency of each word in the array
frequent_company_info = np.unique(company_names_array, return_counts=True)

# Store the results in da dataframe
word_frequencies = pd.DataFrame({'Word': frequent_company_info[0], 
                                'Count': frequent_company_info[1]})

# Sort the dataframe in descending order by the 'Count' column
word_frequencies = word_frequencies.sort_values('Count', ascending=False)
word_frequencies['Count'].value_counts().head(5)

1    1021
2      61
3      20
4       5
6       4
Name: Count, dtype: int64

Create a set of the words that are not linked to a specific company. In other terms, words that were used more than once in a company description.

In [9]:
# Filter the dataframe based on the count threshold
general_voc = word_frequencies[word_frequencies['Count'] >= 2]['Word'].tolist()

# Create a set from the filtered values
general_voc = set(general_voc)

Remove these words from the initial set of words with all the company information

In [10]:
# remove these words from the set company info
company_info = company_info.difference(general_voc)

#### Define custom tokenizer

Next, define the customer tokenizer that will be used as input in our vectorizers. Therefore, we use the information in company_info that we have just defined.

In [11]:
def textblob_tokenizer(str_input):
    
    # Convert list to string
    input_str = str_input
    if isinstance(input_str, list):
        input_str = ' '.join(input_str)
        
    # Remove currency symbols
    str_input = re.sub(r'\$|£|€', '', str_input)
    
    # Remove punctuation
    str_input = str_input.translate(str.maketrans('', '', string.punctuation))
    
    # Define stop words
    stop_words = set(stopwords.words('english'))

    # Tokenize text
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    
    # Remove numbers, stop words, company information and words with one character
    words = [word for word in tokens if not re.match('^\d+$', word) and word not in stop_words
                                        and word not in company_info and len(word) > 1]

    # Lemmatize words
    words = [Word(word).lemmatize() for word in words]
    
    return ' '.join(words)

In [12]:
# Clean the text data
df_train['cleaned_headline'] = df_train['Headline'].apply(textblob_tokenizer)
df_test['cleaned_headline'] = df_test['Headline'].apply(textblob_tokenizer)


# check the cleaned data
df_train[["cleaned_headline", "Headline"]].head(5)

Unnamed: 0,cleaned_headline,Headline
0,head u patent granted se delaware may titled c...,Head Line: US Patent granted to BASF SE (Delaw...
1,societe generale launch nextgeneration card in...,Societe Generale Launches a Next-Generation Ca...
2,plc form communication,BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...
3,4q earnings snapshot,ASML: 4Q Earnings Snapshot
4,form investment manager group plc,Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...


#### Define the number of unique words in our dataset

In [13]:
# Concatenate all the rows in the 'cleaned_headline' column
combined_text = ' '.join(df_train['cleaned_headline'])
#combined_text

In [14]:
# Tokenize the combined text into words
words = word_tokenize(combined_text)

# Calculate the count of unique words
unique_word_count = len(Counter(words))
print(f' Number of unique words: {unique_word_count}')

 Number of unique words: 31978


Check how many of these words have only one occurence: 19 094

In [15]:
# Define the count per word
word_counts = Counter(words)

In [16]:
# Convert word_counts dictionary to a DataFrame
df_word_counts = pd.DataFrame.from_dict(word_counts, orient='index', columns=['Count'])
df_word_counts = df_word_counts.sort_values(by='Count', ascending=False)
df_word_counts['Count'].value_counts().head(5)

1    19094
2     4144
3     1889
4     1081
5      726
Name: Count, dtype: int64

## 1.2 Remove empty strings

In the previous steps I have cleaned the data. However, it is possible that some headlines were so short that some cleaned headlines ended up being empty after the cleaning process. Therefore, I check if there are any empty strings present and removes these observations from my data.

In [17]:
# first, check the shape of the data before any operations
df_train.shape

(43254, 4)

In [18]:
# Check for empty strings after cleaning the data
missing_data_train = df_train['cleaned_headline'].apply(lambda x: x == '')
rows_with_empty_strings = df_train[missing_data_train]
rows_with_empty_strings

Unnamed: 0,id,Headline,category,cleaned_headline
10479,1115844,Sanofi,,
13982,882908,Novartis,,
18141,73986,/C O R R E C T I O N -- Allianz/,,
23796,603479,GlaxoSmithKline,,
28534,1396642,Vodafone - 2017,,
29702,1135568,Stada Arzneimittel,,
37343,693912,Inchcape - 2017,,
42065,872085,Nokia Corporation Nokia Corporation Financial -3-,,


In [19]:
# Removes these empty strings from the dataset
df_train = df_train[~missing_data_train]
df_train.shape

(43246, 4)

In [20]:
# apply the same process to the test set (4 test observations were removed)
missing_data_test = df_test['cleaned_headline'].apply(lambda x: x == '')
df_test = df_test[~missing_data_test]

## 1.3 Store the cleaned data 

The previous steps can be seen as preprocessing steps to clean the data. These steps will need to be performed each time we want to convert the textual data into numerical representations. Therefore, I write the data away as the gold data so I can easily retrieve the cleaned data in the other notebooks.

In this notebook, we will keep using the column 'Headline' as independent variable as the text will get cleaned by the vectorizers.

In [21]:
# Save the adjusted train and test sets to CSV files
df_train.to_csv('./data/gold_data/train.csv', index=False, encoding = 'utf-8')
df_test.to_csv('./data/gold_data/test.csv', index=False)

# 2. Define functions and parameters

Before we continue, we first define some useful functions, dataframes and parameters that we will use throughout this notebook:

1. get_classification_metrics: Create a function that return the classification metrics for each model. The precision, recall and f1 score are all determined using the average value of all classes, without adjusting weights to these classes.

2. Define a dataframe to store the results of the different models. Moreover, also define a dictionary that stores the best parameters for each model.

3. Define the number of splits, the stratified cross validator to ensure class frequencies are considered, and the scoring metric based on the average F1 score. We use an F1 score as scoring metric as accuracy is not a good evaluation metric in our case.

4. Define a function that trains the defined model, the input data, the classifier and its parameter grid. Besides, it will also take 4 parameters as input that give more information about the model that is being trained. This is usefull for the storage of the performance of the different algorithms.

In [18]:
# 1. Function that returns classication metrics
def get_classification_metrics(y_true, y_pred):
    
    # Calculate Model Performance Metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    return accuracy, precision, recall, f1


In [19]:
# 2. Create an empty dataframe to store the results of all the models
results_all_df = pd.DataFrame()

# Add columns for the metrics
columns = ['vectorizer', 'FS', 'classifier', 'resampling','accuracy', 'precision', 'recall', 'f1']
for col in columns:
    results_all_df[col] = 0

# create an empty dictionary to store the optimal parameters
best_params_dict = {}

In [20]:
# 3. Define different parameters
# Define the number of folds for cross-validation
n_splits = 5

# Initialize the stratified k-fold object
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) # ensures class balances are kept

# Define the scoring metric
scoring = make_scorer(f1_score, average= 'macro')

In [21]:
# 4. Define a function to train and evaluate the different models
def perform_grid_search(name, model, param_grid, X_train, X_test, y_train, y_test,
                       vectorizer, FS, classifier, resampling):
    
    # Define a seed value
    random.seed(7)
        
    # Perform the grid search using cross-validation
    grid_search = GridSearchCV(model, param_grid, cv=skf, scoring=scoring)
    grid_search.fit(X_train, y_train)

    # Get the best model and its hyperparameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Store the best parameters for the current category in the dictionary
    best_params_dict[name] = best_params
    print(f'best parameters: {best_params}')

    # Retrain the best model with the whole training set
    best_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = best_model.predict(X_test)
    
    # Calculate the probabilities (not for SVM as this is not possible)
    if classifier != 'SVM':
        y_pred_proba = best_model.predict_proba(X_test)
        
        # Find the highest probability for each observation
        highest_prob = np.amax(y_pred_proba, axis = 1)
    
        # Create a DataFrame with test observations, highest probabilities, and predicted classes
        predictions_df = pd.DataFrame({'Observation_nr': y_test.index, 'Probability': highest_prob, 'Prediction': y_pred})
        
    else:
        # Create a DataFrame with test observations and predicted classes
        predictions_df = pd.DataFrame({'Observation_nr': y_test.index, 'Prediction': y_pred})
        
    # Store the final predictions with its probability for the test set
    predictions_df.to_csv(f'./Output/predictions/{name}.csv', index = False, header = True)
    #predictions_df.to_excel(f'./Output/predictions/{name}.xlsx', index = False, header = True)

    # Calculate the classification metrics
    accuracy, precision, recall, f1 = get_classification_metrics(y_test, y_pred)
    
    # print the results
    print(f'Results for {name}:')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1: {f1}')
    
    # add the results to the dataframe with all the results
    results_all_df.loc[name] = [vectorizer, FS, classifier, resampling, accuracy, precision, recall, f1]

In [22]:
# define the independent and dependent variables
X_train = df_train['Headline']
X_test = df_test['Headline']

y_train = df_train['category']
y_test = df_test['category']

# 3. Vectorizers

In this notebook, I will test the performance of two vectorizers to transform the textual data into numerical representations. For each vectorizer, I will train and evaluate the 5 following classifiers through cross validation:

1. Logistic Regression (normal + Ridge and Lasso variations)
2. Decision Tree
3. Support Vector Machine
4. Random Forest
5. Adaboost Classifier

Each model is initialized the first time they are used (section 3.1). Moreover, I hypertune the parameters of each model trough cross validation with GridSearchCV. These parameters are defined when a new model is initialized. More information about these parameters can be found in the comments or in their sklearn documentation.

Further, I tried to adress the class imbalance issue in the dataset with two resampling techniques:

1. Random undersampling
2. Oversampling through SMOTE

For each base classifier, I will train the 5 base classifiers once without resampling technique, once with random undersampling and once with oversampling through SMOTE. When I have defined the right parameters for each model, I train and evaluate each model with the previously defined function perform_grid_search.

## 3.1 Bag of words

The first vectorizer that we will use to transform our textual data into numerical presentations is the bag of words procedure or BOW. In this approach, each headline is treated as a collection of individual words. Then, the vectorizer just indicates the frequency of each feature (=word) in the document. This is a simple and inexpensive approach to represent textual data into a numerical form. However, note that this will result in a sparse matrix due to the high number of words that we use.

Define the parameters of the vectorizer:

1. Max_df = Maximum Document Frequency: Maximum share of documents where a word can occur to be considered as a feature. This value is set as 0.9 as to frequent occuring words lose their predictive value.

2. Max_features = Maximum number of features we would want to consider, ranked by most frequently occuring. This value is set at 10.000. This value was found to work best with the different models after try and error. Considering our dataset only consists out of 31.798 words, of which only 12.884 appear more than once, we consider this a good cut-off point. Moreover, we only have 43.254 training observations which is relatively litte for this number of features. However, a lower number of features led to lower results.

3. Ngram_range = Number of Word Pairs. In our case, we include unigrams and bigrams. 

4. Tokenizer = We use our customer defined tokenizer.

In [25]:
# Maximum Document Frequency -- Maximum share of documents where a word needs to occur to be considered #
MAXDF = 0.9

# Maximum number of features we would want to consider -- ranked by most frequently occuring #
MF= 10000

# NGrams -- Number of Word Pairs. Takes the form (Min, Max). E.g. (1, 2) means single words and word pairs # 
NGrams = (1,2)

Define the vectorizer itself

In [24]:
# define the vectorizer
vec_bow = CountVectorizer(max_features= MF,
                          min_df = MINDF,
                          max_df = MAXDF,
                          ngram_range= NGrams,
                          tokenizer=textblob_tokenizer)

Transform the textual data into numerical representations with the BOW vectorizer

In [21]:
# apply the BOW vectorizer (train on training data and transform test set after)
X_train_bow = vec_bow.fit_transform(X_train)
X_test_bow = vec_bow.transform(X_test)


In [22]:
# check the shape (= observations and features)
X_train_bow.shape

(43254, 10000)

In [23]:
# Define with what vectorizer we build the models for storage:
vectorizer = 'BOW'
FS = 'None' # Feature selection

### 2.1.1 Without resampling

In [24]:
# Define the resampling technique for storage
resampling = 'None'

#### A. Logistic regression

In [25]:
# define the model characteristics
model_name = 'bow_log_w'
classifier = 'logR'

# Initialize the classifier
logreg = LogisticRegression(random_state = 7)

# Define the parameter grid
param_grid_log = {
    'penalty': ['None','l1', 'l2'], # normal, lasso or ridge regression
    'C': [0.1, 1, 10]               # The inverse penalization term (smaller is higher penalization)
}

In [26]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, logreg, param_grid_log, X_train_bow, X_test_bow, y_train, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 10, 'penalty': 'l2'}
Results for bow_log_w:
Accuracy: 0.9250046236360274
Precision: 0.5754734113499761
Recall: 0.4509983700061958
F1: 0.5004020041019801


#### B. Decision Tree

In [27]:
# define the model characteristics
model_name = 'bow_DT_w'
classifier = 'DT'

# Initialize the classifier
tree = DecisionTreeClassifier(random_state = 7)

# Define the parameter grid
param_grid_DT = {
    'criterion': ['gini'],          # Define the splitting criteria: Gini index for node impurity
    'min_samples_leaf': [1, 2],     # Define the minimum number of samples required to be at leaf node
    'max_features': [None, 'sqrt']  # Define the number of features to consider when looking for the best split
}

In [28]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, tree, param_grid_DT, X_train_bow, X_test_bow, y_train, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 1}
Results for bow_DT_w:
Accuracy: 0.9028111707046421
Precision: 0.439957542538443
Recall: 0.38526227710768074
F1: 0.40142069121901386


#### C. Support Vector Machine

In [29]:
# define the model characteristics
model_name = 'bow_svm_w'
classifier = 'SVM'

# Initialize the classifier
svm = SVC(random_state = 7)

# Define the parameter grid
param_grid_svm = {
    'C': [0.1, 1, 10, 100], # inverse regularization parameter
    'kernel': ['linear', 'poly', 'rbf'], # what type of kernel need to be used (rbf = radial kernel)
}

In [30]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, svm, param_grid_svm, X_train_bow, X_test_bow, y_train, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 10, 'kernel': 'rbf'}
Results for bow_svm_w:
Accuracy: 0.9279637506935454
Precision: 0.645338613690608
Recall: 0.4175187919853814
F1: 0.49216364262856055


#### D. Random Forest Classifier

In [31]:
# define the model characteristics
model_name = 'bow_rf_w'
classifier = 'RF'

# Initialize the classifier
rfc = RandomForestClassifier(random_state = 7, n_jobs = -1)

# Define the parameter grid
param_grid_rf = {
    'criterion': ['gini'],          # Define the splitting criteria: Gini index for node impurity
    'n_estimators': [100, 500],     # the number of trees to use when building the model
    'min_samples_leaf': [1, 2],     # Define the minimum number of samples required to be at leaf node
    'max_features': [None, 'sqrt']  # Define the number of features to consider when looking for the best split
}

In [32]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, rfc, param_grid_rf, X_train_bow, X_test_bow, y_train, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 500}
Results for bow_rf_w:
Accuracy: 0.9138154244497874
Precision: 0.5344243514571876
Recall: 0.38604668518371527
F1: 0.4274280615651013


#### E. Adaboost classifier

In [33]:
# define the model characteristics
model_name = 'bow_ada_w'
classifier = 'ADA'

# Initialize decision tree base estimator for AdaBoost
base_estimator = DecisionTreeClassifier(random_state = 7)

# Initialize AdaBoost classifier
ada = AdaBoostClassifier(base_estimator = base_estimator, random_state = 7)

# Define parameter grid for AdaBoost
param_grid_ada = {
    'n_estimators': [50, 100, 250],   # the maximum number of estimators before boosting is terminated
    'learning_rate': [0.1, 0.5, 1.0], # weight applied to each classifier at boosting iteration
                                      # A higher learning rate increases the contribution of each classifier. 
}

In [35]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, ada, param_grid_ada, X_train_bow, X_test_bow, y_train, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'learning_rate': 0.1, 'n_estimators': 50}
Results for bow_ada_w:
Accuracy: 0.9073423340114666
Precision: 0.46030451726116195
Recall: 0.3582422517873078
F1: 0.39445416090086666


### 2.1.2 With Random undersampling

In [26]:
# Define the resampling technique
resampling = 'Und'

In [27]:
# Define the categories and the maximum number of samples
categories = df_train["category"].unique()

#### Random undersampling

To undersample the data, I decided to use the random undersampling technique. This technique just randomly eliminates observations from the majority class until the majority and minority class have the same number of observations. However, in the multiclass setting with multiple minority classes, it eliminates observations of all classes until they reach the same number of observations as the smallest minority class. In this scenario, a lot of the information in the dataset is thrown away. Therefore, I decided to follow another approach. I downsampled all categories to 4 times the number of observations of the smallest class. This way, the class imbalance is heavily reduced until a 4:1 imbalance ratio, but much more information is retained in the dataset. Besides, the second approach gave better results when evaluating my models.

#### Second strategy

In [38]:
# Calculate the number of samples in the smallest minority category
rus_n = df_train['category'].value_counts().sort_values(ascending=False)[1]

# Dictionary to store the actual maximum imbalance per class
max_imbalance_u = {}

# Calculate the actual maximum imbalance for each class
for category in categories:
    if category == 'None':
        max_imbalance_u[category] = rus_n
    else:
        # Set the actual maximum to the number of available samples
        max_imbalance_u[category] = y_train.value_counts()[category]

In [39]:
# Create the random undersampler with maximum imbalance
undersampler = RandomUnderSampler(sampling_strategy = max_imbalance_u, random_state = 7)

# Undersample the data
X_train_bow_und, y_train_bow_und = undersampler.fit_resample(X_train_bow, y_train)
y_train_bow_und.value_counts()

None                                                           797
Strategic alliance                                             797
Corporate \ngovernance                                         597
New product introduction/\nservice offering                    518
Merger & \nacquisitions                                        460
Financing                                                      262
Product/\nservice improvement                                  254
Venturing                                                      250
Expansion in existing market (product/service/geographical)    249
Production-related actions                                     212
Marketing                                                      205
Divestiture                                                    199
Human resources                                                161
R&D-related actions                                            138
Market entry                                                  

#### A. Logistic Regression

In [40]:
# define the model characteristics
model_name = 'bow_log_u'
classifier = 'logR'

In [41]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, logreg, param_grid_log, X_train_bow_und, X_test_bow, y_train_bow_und, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 1, 'penalty': 'l2'}
Results for bow_log_u:
Accuracy: 0.8240244127982245
Precision: 0.35076021791522727
Recall: 0.6572033669036993
F1: 0.43505098444332013


#### B. Decision Tree

In [42]:
# define the model characteristics
model_name = 'bow_DT_u'
classifier = 'DT'

In [43]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, tree, param_grid_DT, X_train_bow_und, X_test_bow, y_train_bow_und, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 2}
Results for bow_DT_u:
Accuracy: 0.7658590715738857
Precision: 0.2596338426130034
Recall: 0.5777130538199041
F1: 0.3364276176926325


#### C. Support Vector Machine

In [44]:
# define the model characteristics
model_name = 'bow_svm_u'
classifier = 'SVM'

In [45]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, svm, param_grid_svm, X_train_bow_und, X_test_bow, y_train_bow_und, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 1, 'kernel': 'linear'}
Results for bow_svm_u:
Accuracy: 0.7852783428888478
Precision: 0.2876350950340012
Recall: 0.6385497060350971
F1: 0.377265994719291


#### D. Random Forest Classifier

In [46]:
# define the model characteristics
model_name = 'bow_rf_u'
classifier = 'RF'

In [47]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, rfc, param_grid_rf, X_train_bow_und, X_test_bow, y_train_bow_und, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 2, 'n_estimators': 100}
Results for bow_rf_u:
Accuracy: 0.8017384871462918
Precision: 0.28664982532638317
Recall: 0.59171962553755
F1: 0.36732559232507


#### E. Adaboost classifier

In [48]:
# define the model characteristics
model_name = 'bow_ada_u'
classifier = 'ADA'

In [49]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, ada, param_grid_ada, X_train_bow_und, X_test_bow, y_train_bow_und, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'learning_rate': 0.1, 'n_estimators': 50}
Results for bow_ada_u:
Accuracy: 0.7069539485851674
Precision: 0.23804587757702966
Recall: 0.5491769284022234
F1: 0.3019865445836417


### 2.1.3 With oversampling

In [50]:
# Define the resampling technique
resampling = 'Ove'

In [51]:
# Calculate the number of samples in the majority class
ove_n = df_train['category'].value_counts().sort_values(ascending=False)[0]

# Oversample until the number of observations equals a fourth of the majority class
max_samples = int(ove_n/4)

# Dictionary to store the actual maximum imbalance per class
max_imbalance = {}

# Calculate the actual maximum imbalance for each class
for category in categories:
    if category == 'None':
        max_imbalance[category] = y_train.value_counts()[category]
    else:
        # Set the actual maximum to the number of available samples
        max_imbalance[category] = max_samples

In [52]:
# Create the SMOTE oversampler
oversampler = SMOTE(sampling_strategy=max_imbalance, random_state=7)

# Undersample the data
X_train_bow_ove, y_train_bow_ove = oversampler.fit_resample(X_train_bow, y_train)
y_train_bow_ove.value_counts()

None                                                           38876
R&D-related actions                                             9719
Human resources                                                 9719
Venturing                                                       9719
Market entry                                                    9719
Divestiture                                                     9719
Product/\nservice improvement                                   9719
Merger & \nacquisitions                                         9719
Financing                                                       9719
Marketing                                                       9719
Expansion in existing market (product/service/geographical)     9719
Corporate \ngovernance                                          9719
Production-related actions                                      9719
New product introduction/\nservice offering                     9719
Strategic alliance                

#### A. Logistic Regression

In [53]:
# define the model characteristics
model_name = 'bow_log_o'
classifier = 'logR'


In [54]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, logreg, param_grid_log, X_train_bow_ove, X_test_bow, y_train_bow_ove, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 10, 'penalty': 'l2'}
Results for bow_log_o:
Accuracy: 0.8957832439430368
Precision: 0.41627749268059494
Recall: 0.4966884459241199
F1: 0.4492320023275047


#### B. Decision Tree

In [55]:
# define the model characteristics
model_name = 'bow_DT_o'
classifier = 'DT'

In [56]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, tree, param_grid_DT, X_train_bow_ove, X_test_bow, y_train_bow_ove, y_test,
                   vectorizer, FS, classifier, resampling)


best parameters: {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 1}
Results for bow_DT_o:
Accuracy: 0.833179212132421
Precision: 0.2879363227925523
Recall: 0.5086591393544257
F1: 0.35134370930114506


#### C. Support Vector Machine

In [57]:
# define the model characteristics
model_name = 'bow_svm_o'
classifier = 'SVM'

In [58]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, svm, param_grid_svm, X_train_bow_ove, X_test_bow, y_train_bow_ove, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 10, 'kernel': 'rbf'}
Results for bow_svm_o:
Accuracy: 0.9032735343073793
Precision: 0.4533061079062824
Recall: 0.2613801709573333
F1: 0.31790519961634706


#### D. Random Forest Classifier

In [59]:
# define the model characteristics
model_name = 'bow_rf_o'
classifier = 'RF'

In [60]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, rfc, param_grid_rf, X_train_bow_ove, X_test_bow, y_train_bow_ove, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'criterion': 'gini', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100}
Results for bow_rf_o:
Accuracy: 0.8595339374884409
Precision: 0.32853107368024237
Recall: 0.5627928725918646
F1: 0.40031240229571474


#### E. Adaboost classifier

In [61]:
# define the model characteristics
model_name = 'bow_ada_o'
classifier = 'ADA'

In [62]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, ada, param_grid_ada, X_train_bow_ove, X_test_bow, y_train_bow_ove, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'learning_rate': 0.1, 'n_estimators': 50}
Results for bow_ada_o:
Accuracy: 0.8404845570556686
Precision: 0.2947507499988248
Recall: 0.5070041802197532
F1: 0.35765280109674674


In [63]:
# write away results
results_all_df.to_csv('./Output/Model performance/results_BOW.csv', index = False, header = True)

In [65]:
# Write the dictionary with the best parameters away
with open('./Output/parameters/BOW.json', 'w') as file:
    json.dump(best_params_dict, file)