# Embeddings

# 0. Data loading

In [2]:
# General Packages #
import os
import pandas as pd
import numpy as np
import string
import re
from scipy.stats import randint
import random


# Load TQDM to Show Progress Bars #
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# Sklearn Packages #
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# NLTK Packages #
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob, Word

# Import necessary libraries for handling imbalanced data
from imblearn.metrics import geometric_mean_score
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler



In [3]:
# Turn off warnings, just to avoid pesky messages that might cause confusion here
# Remove when testing your own code #
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Change to Working Directory with Training Data # 
os.chdir("/Users/Artur/Desktop/thesis_HIR_versie5/coding")
#os.chdir("/Users/juarel/Desktop/studies artur/thesis_HIR/coding")


# Load Training Data #
df_train = pd.read_csv("./data/train_adjusted.csv", header = 0)
df_test = pd.read_csv("./data/test_adjusted.csv", header = 0)

# inspect the data
df_train.head(5)

Unnamed: 0,id,Headline,category
0,194578,Head Line: US Patent granted to BASF SE (Delaw...,
1,564295,Societe Generale Launches a Next-Generation Ca...,
2,504138,BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...,
3,91379,ASML: 4Q Earnings Snapshot,
4,265750,Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...,


In [5]:
# read in the data about the different companies
companies = pd.read_excel("./data/companies.xlsx", header = 0)
companies.head(5)

Unnamed: 0,NameCompany
0,Andritz AG
1,ams AG
2,voestalpine AG
3,OMV AG
4,Wienerberger AG


# 1. Data cleaning

The first step when building our model is to clean the data. To perform this step, we need to define a custom tokenizer that will serve as input in our vectorizers. This tokenizer needs to fulfill the following criteria:

1. We convert the strings to lowercase
2. We remove currency symbols
3. We remove punctuation from the text
4. We remove English stopwords from the text as these do not provide any information to our model
5. We remove words that with information linked to a specific company
6. We remove words that consist out of a single character
7. We lemmatize the words to reduce the words to their base form

#### Create a list with words linked to a specific company

First, we create a set of all the unique words that could be linked to a company. These are the full company names (including legal suffixes) of which part were used to retrieve the newswires and press releases from the NexisUni database. 

Then, we want to determine the set of words that does not contain any information about the company, such as legal prefixes. Therefore, we inspect which words are most frequently used in the names of the companies. When we inspected the results, we saw 1021 of the 1113 words were only used once in all the company names. These words can be seen as to company specific and will be removed from our headlines.

In [6]:
# define a list with the companies names to use in the tokenizer
# Do not use this information in your model
company_info = set()

for name in companies['NameCompany']:
    name = name.lower()
    words = name.split()
    company_info.update(words)

len(company_info) 

1137

Determine a set of words that does not contain any information about the company:

1. First, I convert the company names into lowercase and split the words. These results are stored in the column 'cleaned_name'. 
2. Then, I retrieve all these words and store them in the array 'company_names_array'. Note that these are not unique words, but just all the cleaned words from each company joined into one array.
3. Next, I store the frequency of each word in a dataframe. 
4. Finally, I can determine which words are to company specific to be included in the data.

In [7]:
# Define a list with only the companies names
def company_name_tokenizer(name):
    # Remove special characters and digits
    name = re.sub(r"[^\w\s]", "", name)
    
    # Convert the name to lowercase
    name = name.lower()
    
    # Split the name into individual words
    words = name.split()
    
    return words

companies['cleaned_name'] = companies['NameCompany'].apply(company_name_tokenizer)
companies.head(5)

Unnamed: 0,NameCompany,cleaned_name
0,Andritz AG,"[andritz, ag]"
1,ams AG,"[ams, ag]"
2,voestalpine AG,"[voestalpine, ag]"
3,OMV AG,"[omv, ag]"
4,Wienerberger AG,"[wienerberger, ag]"


In [8]:
# Concatenate all the values from the cleaned company names and store them in an array
company_names_array = np.concatenate(companies['cleaned_name'].values)

# Count the frequency of each word in the array
frequent_company_info = np.unique(company_names_array, return_counts=True)

# Store the results in da dataframe
word_frequencies = pd.DataFrame({'Word': frequent_company_info[0], 
                                'Count': frequent_company_info[1]})

# Sort the dataframe in descending order by the 'Count' column
word_frequencies = word_frequencies.sort_values('Count', ascending=False)
#word_frequencies['Count'].value_counts()

Create a set of the words that are not linked to a specific company. In other terms, words that were used more than once in a company description.

In [9]:
# Filter the dataframe based on the count threshold
general_voc = word_frequencies[word_frequencies['Count'] >= 2]['Word'].tolist()

# Create a set from the filtered values
general_voc = set(general_voc)

Remove these words from the initial set of words with all the company information

In [10]:
# remove these words from the set company info
company_info = company_info.difference(general_voc)

Next, define the customer tokenizer that will be used as input in our vectorizers. Therefore, we use the information in company_info that we have just defined.

In [11]:
def textblob_tokenizer(str_input):
    
    # Convert list to string
    input_str = str_input
    if isinstance(input_str, list):
        input_str = ' '.join(input_str)
        
    # Remove currency symbols
    str_input = re.sub(r'\$|£|€', '', str_input)
    
    # Remove punctuation
    str_input = str_input.translate(str.maketrans('', '', string.punctuation))
    
    # Define stop words
    stop_words = set(stopwords.words('english'))

    # Tokenize text
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    
    # Remove numbers, stop words, company information and words with one character
    words = [word for word in tokens if not re.match('^\d+$', word) and word not in stop_words
                                        and word not in company_info and len(word) > 1]

    
    # Lemmatize words
    words = [Word(word).lemmatize() for word in words]
    
    return ' '.join(words)

In [12]:
# inspect the cleaned data
df_train['cleaned_headline'] = df_train['Headline'].apply(textblob_tokenizer)

# check the data
df_train[["cleaned_headline", "Headline"]]

Unnamed: 0,cleaned_headline,Headline
0,head u patent granted se delaware may titled c...,Head Line: US Patent granted to BASF SE (Delaw...
1,societe generale launch nextgeneration card in...,Societe Generale Launches a Next-Generation Ca...
2,plc form communication,BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...
3,4q earnings snapshot,ASML: 4Q Earnings Snapshot
4,form investment manager group plc,Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...
...,...,...
43249,system asa tom purchase share,Tomra Systems ASA: TOM: Purchase of own shares
43250,swiss federal institute intellectual granted p...,Swiss Federal Institute of Intellectual Proper...
43251,icon pfizer join addplan df consortiumnew memb...,ICON: Pfizer and Roche Join ADDPLAN DF Consort...
43252,plc transaction share,Rio Tinto PLC Transaction in Own Shares -3-


#### define functions needed in this notebook

Before we continue, we first define some useful functions and parameters that we will need later in this notebook:

1. get_classification_metrics: Create a function that return the classification metrics for each model. The precision, recall and f1 score are all determined using the average value of all classes, without adjusting weights to these classes.

2. create_results_df(): Create a function that creates a dataframe where the main classification metrics can be stored for each model.

3. Define the number of splits, the stratified cross validator to ensure class frequencies are considered, and the scoring metric.

4. Define a function that trains the defined model, depending on the vectorizer, the input data, the classifier and its parameter grid.


In [13]:
# 1. Function that returns classication metrics
def get_classification_metrics(y_true, y_pred):
    # Calculate Model Performance Metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')


    return accuracy, precision, recall, f1


#2. Function that stores classifcation results 
def create_results_df(model_name):
    
    # Create an empty DataFrame with the model name as the index
    results_df = pd.DataFrame(index=[model_name])

    # Add columns for the metrics
    columns = ['accuracy', 'precision', 'recall', 'f1']
    for col in columns:
        results_df[col] = 0

    return results_df


In [14]:
# Create an empty dataframe to store the results of all the models
results_all_df = pd.DataFrame()

# Add columns for the metrics
columns = ['vectorizer', 'FS', 'classifier', 'resampling','accuracy', 'precision', 'recall', 'f1']
for col in columns:
    results_all_df[col] = 0


In [15]:
# Define the number of folds for cross-validation
n_splits = 5

# Initialize the stratified k-fold object
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) # ensures class balances are kept

# Define the scoring metric
scoring = make_scorer(f1_score, average= 'macro')


In order to create embeddings, I need to an array of words as in input for my model.

In [16]:
# clean the headlines from the test set
df_test['cleaned_headline'] = df_test['Headline'].apply(textblob_tokenizer)

In [17]:
# define the independent and dependent variables
X_train = df_train['cleaned_headline']
X_test = df_test['cleaned_headline']

y_train = df_train['category']
y_test = df_test['category']

In [18]:
# create an empty dictionary to store the optimal parameters
best_params_dict = {}

In [19]:
def perform_grid_search(name, model, param_grid, X_train, X_test, y_train, y_test,
                       vectorizer, FS, classifier, resampling):
    
    # Define a seed value
    random.seed(7)
        
    # Perform the grid search using cross-validation
    grid_search = GridSearchCV(model, param_grid, cv=skf, scoring=scoring)
    grid_search.fit(X_train, y_train)

    # Get the best model and its hyperparameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Store the best parameters for the current category in the dictionary
    best_params_dict[name] = best_params
    print(f'best parameters: {best_params}')

    # Retrain the best model with the whole training set
    best_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = best_model.predict(X_test)
    
    # Calculate the probabilities (not for SVM as this is not possible)
    if classifier != 'SVM':
        y_pred_proba = best_model.predict_proba(X_test)
        
        # Find the highest probability for each observation
        highest_prob = np.amax(y_pred_proba, axis = 1)
    
        # Create a DataFrame with test observations, highest probabilities, and predicted classes
        predictions_df = pd.DataFrame({'Observation_nr': y_test.index, 'Probability': highest_prob, 'Prediction': y_pred})
        
    else:
        # Create a DataFrame with test observations and predicted classes
        predictions_df = pd.DataFrame({'Observation_nr': y_test.index, 'Prediction': y_pred})
        
    # Store the final predictions with its probability for the test set
    predictions_df.to_csv(f'./Output/predictions/{name}.csv', index = False, header = True)
    #predictions_df.to_excel(f'./Output/predictions/{name}.xlsx', index = False, header = True)

    # Calculate the classification metrics
    accuracy, precision, recall, f1 = get_classification_metrics(y_test, y_pred)
    
    # print the results
    print(f'Results for {name}:')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1: {f1}')
    
    # add the results to the dataframe with all the results
    results_all_df.loc[name] = [vectorizer, FS, classifier, resampling, accuracy, precision, recall, f1]

# 2. Word2Vec

In [20]:
import sys
import re
import gensim
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.models import KeyedVectors
import gensim.downloader
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## 2.1 Create word embeddings

In [21]:
headlines = [sentence.split() for sentence in X_train]


In [22]:
cbow_model = Word2Vec(headlines, 
                 min_count = 2,          # Ignore words that appear less than this
                 vector_size = 100,      # Dimensionality of word embeddings
                 workers = 8,            # Number of processors (parallelisation)
                 window = 5,             # Context window for words during training
                 epochs = 20,            # Number of epochs training over corpus
                 sg = 0)                 # 0 for CBOW and 1 for skipgram

In [23]:
def vectorize_cbow(headline):
    
    words = headline.split()
    words_vecs = [cbow_model.wv[word] for word in words if word in cbow_model.wv]
    
    if len(words_vecs) == 0:
        return np.zeros(100)
    
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [24]:
X_train_cbow = np.array([vectorize_cbow(headline) for headline in X_train])
X_test_cbow = np.array([vectorize_cbow(headline) for headline in X_test])

In [63]:
# Define with what vectorizer we build the models:
vectorizer = 'Word2Vec'
FS = 'cbow' # Feature selection

### 2.1.1 Without resampling

In [62]:
# Define the resampling technique
resampling = 'None'

#### A. Logistic regression

In [27]:
# define the model characteristics
model_name = 'cbow_log_w'
classifier = 'logR'

# Initialize the classifier
logreg = LogisticRegression(random_state = 7)

# Define the parameter grid
param_grid_log = {
    'penalty': ['None','l1', 'l2'], # normal, lasso or ridge
    'C': [0.1, 1, 10]               # The inverse penalization term (smaller is higher penalization)
}


In [28]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, logreg, param_grid_log, X_train_cbow, X_test_cbow, y_train, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 10, 'penalty': 'l2'}
Results for cbow_log_w:
Accuracy: 0.9118734973182911
Precision: 0.5102294625518764
Recall: 0.2890478595383949
F1: 0.34817734213745877


#### B. Decision Tree

In [29]:
# define the model characteristics
model_name = 'cbow_DT_w'
classifier = 'DT'

# Initialize the classifier
tree = DecisionTreeClassifier(random_state = 7)

# Define the parameter grid
param_grid_DT = {
    'criterion': ['gini'],          # Define the splitting criteria: Gini index for node impurity
    'min_samples_leaf': [1, 2],     # Define the minimum number of samples required to be at leaf node
    'max_features': [None, 'sqrt']  # Define the number of features to consider when looking for the best split
}

In [31]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, tree, param_grid_DT, X_train_cbow, X_test_cbow, y_train, y_test,
                   vectorizer, FS, classifier, resampling)


best parameters: {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 1}
Results for bow_DT_w:
Accuracy: 0.858701683003514
Precision: 0.2045113932272664
Recall: 0.211491191832594
F1: 0.20662451402036236


#### C. Support Vector Machine

In [32]:
# define the model characteristics
model_name = 'cbow_svm_w'
classifier = 'SVM'

# Initialize the classifier
svm = SVC(random_state = 7)

# Define the parameter grid
param_grid_svm = {
    'C': [0.1, 1, 10, 100], # inverse regularization parameter
    'kernel': ['linear', 'poly', 'rbf'], # what type of kernel need to be used (rbf = radial kernel)
}

In [33]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, svm, param_grid_svm, X_train_cbow, X_test_cbow, y_train, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 100, 'kernel': 'rbf'}
Results for cbow_svm_w:
Accuracy: 0.9098390974662475
Precision: 0.44819245214385195
Recall: 0.34387471318844837
F1: 0.38272525112293937


#### D. Random Forest Classifier

In [34]:
# define the model characteristics
model_name = 'bow_rf_w'
classifier = 'RF'

# Initialize the classifier
rfc = RandomForestClassifier(random_state = 7, n_jobs = -1)

# Define the parameter grid
param_grid_rf = {
    'criterion': ['gini'],          # Define the splitting criteria: Gini index for node impurity
    'n_estimators': [100, 500],     # the number of trees to use when building the model
    'min_samples_leaf': [1, 2],     # Define the minimum number of samples required to be at leaf node
    'max_features': [None, 'sqrt']  # Define the number of features to consider when looking for the best split
}

In [35]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, rfc, param_grid_rf, X_train_cbow, X_test_cbow, y_train, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 1, 'n_estimators': 500}
Results for bow_rf_w:
Accuracy: 0.9137229517292399
Precision: 0.6453322791499141
Recall: 0.2088847365256517
F1: 0.27457331994533407


#### E. Adaboost classifier

In [36]:
# define the model characteristics
model_name = 'bow_ada_w'
classifier = 'ADA'

# Initialize decision tree base estimator for AdaBoost
base_estimator = DecisionTreeClassifier(random_state = 7)

# Initialize AdaBoost classifier
ada = AdaBoostClassifier(base_estimator = base_estimator, random_state = 7)

# Define parameter grid for AdaBoost
param_grid_ada = {
    'n_estimators': [50, 100, 250],   # the maximum number of estimators before boosting is terminated
    'learning_rate': [0.1, 0.5, 1.0], # weight applied to each classifier at boosting iteration
                                      # A higher learning rate increases the contribution of each classifier. 
}

In [37]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, ada, param_grid_ada, X_train_cbow, X_test_cbow, y_train, y_test,
                   vectorizer, FS, classifier, resampling)

NameError: name 'X_train_bow' is not defined

### 2.1.2 With undersampling

In [36]:
# Define the resampling technique
resampling = 'Und'

In [37]:
# Define the categories and the maximum number of samples
categories = df_train["category"].unique()

#### First strategy

#Define the minimum number of samples for a class
min_samples = min(y_train.value_counts())

#Define the maximum number of samples per class to keep the ratio 1 to 4 for each class
max_samples = min_samples*4

#Create a dictionary to store the actual maximum imbalance per class
max_imbalance = {}

#Calculate the actual maximum imbalance for each class
for category in categories:
    
    # Check if the number of samples for the category is lower than the desired maximum
    if (y_train.value_counts()[category]) < max_samples:
        
        # Set the actual maximum to the number of available samples
        max_imbalance[category] = y_train.value_counts()[category]
        
    else:
        # Set the actual maximum to the desired maximum
        max_imbalance[category] = max_samples

#### Second strategy

In [38]:
# Calculate the number of samples in the biggest minority category
rus_n = df_train['category'].value_counts().sort_values(ascending=False)[1]

# Dictionary to store the actual maximum imbalance per class
max_imbalance = {}

# Calculate the actual maximum imbalance for each class
for category in categories:
    if category == 'None':
        max_imbalance[category] = rus_n
    else:
        # Set the actual maximum to the number of available samples
        max_imbalance[category] = y_train.value_counts()[category]

In [39]:
# Create the random undersampler with maximum imbalance
undersampler = RandomUnderSampler(sampling_strategy = max_imbalance, random_state = 7)

# Undersample the data
X_train_bow_und, y_train_bow_und = undersampler.fit_resample(X_train_bow, y_train)
y_train_bow_und.value_counts()

None                                                           797
Strategic alliance                                             797
Corporate \ngovernance                                         597
New product introduction/\nservice offering                    518
Merger & \nacquisitions                                        460
Financing                                                      262
Product/\nservice improvement                                  254
Venturing                                                      250
Expansion in existing market (product/service/geographical)    249
Production-related actions                                     212
Marketing                                                      205
Divestiture                                                    199
Human resources                                                161
R&D-related actions                                            138
Market entry                                                  

#### A. Logistic Regression

In [40]:
# define the model characteristics
model_name = 'bow_log_u'
classifier = 'logR'


In [41]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, logreg, param_grid_log, X_train_bow_und, X_test_bow, y_train_bow_und, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 1, 'penalty': 'l2'}
Results for bow_log_u:
Accuracy: 0.8240244127982245
Precision: 0.35076021791522727
Recall: 0.6572033669036993
F1: 0.43505098444332013


#### B. Decision Tree

In [42]:
# define the model characteristics
model_name = 'bow_DT_u'
classifier = 'DT'


In [43]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, tree, param_grid_DT, X_train_bow_und, X_test_bow, y_train_bow_und, y_test,
                   vectorizer, FS, classifier, resampling)


best parameters: {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 2}
Results for bow_DT_u:
Accuracy: 0.7658590715738857
Precision: 0.2596338426130034
Recall: 0.5777130538199041
F1: 0.3364276176926325


#### C. Support Vector Machine

In [44]:
# define the model characteristics
model_name = 'bow_svm_u'
classifier = 'SVM'


In [45]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, svm, param_grid_svm, X_train_bow_und, X_test_bow, y_train_bow_und, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 1, 'kernel': 'linear'}
Results for bow_svm_u:
Accuracy: 0.7852783428888478
Precision: 0.2876350950340012
Recall: 0.6385497060350971
F1: 0.377265994719291


#### D. Random Forest Classifier

In [46]:
# define the model characteristics
model_name = 'bow_rf_u'
classifier = 'RF'


In [47]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, rfc, param_grid_rf, X_train_bow_und, X_test_bow, y_train_bow_und, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 2, 'n_estimators': 100}
Results for bow_rf_u:
Accuracy: 0.8017384871462918
Precision: 0.28664982532638317
Recall: 0.59171962553755
F1: 0.36732559232507


#### E. Adaboost classifier

In [48]:
# define the model characteristics
model_name = 'bow_ada_u'
classifier = 'ADA'


In [49]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, ada, param_grid_ada, X_train_bow_und, X_test_bow, y_train_bow_und, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'learning_rate': 0.1, 'n_estimators': 50}
Results for bow_ada_u:
Accuracy: 0.7069539485851674
Precision: 0.23804587757702966
Recall: 0.5491769284022234
F1: 0.3019865445836417


### 2.1.3 With oversampling

In [50]:
# Define the resampling technique
resampling = 'Ove'

In [51]:
# Calculate the number of samples in the majority class
ove_n = df_train['category'].value_counts().sort_values(ascending=False)[0]

# Oversample until the number of observations equals a fourth of the majority class
max_samples = int(ove_n/4)

# Dictionary to store the actual maximum imbalance per class
max_imbalance = {}

# Calculate the actual maximum imbalance for each class
for category in categories:
    if category == 'None':
        max_imbalance[category] = y_train.value_counts()[category]
    else:
        # Set the actual maximum to the number of available samples
        max_imbalance[category] = max_samples

In [52]:
# Create the SMOTE oversampler
oversampler = SMOTE(sampling_strategy=max_imbalance, random_state=7)

# Undersample the data
X_train_bow_ove, y_train_bow_ove = oversampler.fit_resample(X_train_bow, y_train)
y_train_bow_ove.value_counts()

None                                                           38876
R&D-related actions                                             9719
Human resources                                                 9719
Venturing                                                       9719
Market entry                                                    9719
Divestiture                                                     9719
Product/\nservice improvement                                   9719
Merger & \nacquisitions                                         9719
Financing                                                       9719
Marketing                                                       9719
Expansion in existing market (product/service/geographical)     9719
Corporate \ngovernance                                          9719
Production-related actions                                      9719
New product introduction/\nservice offering                     9719
Strategic alliance                

#### A. Logistic Regression

In [53]:
# define the model characteristics
model_name = 'bow_log_o'
classifier = 'logR'


In [54]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, logreg, param_grid_log, X_train_bow_ove, X_test_bow, y_train_bow_ove, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 10, 'penalty': 'l2'}
Results for bow_log_o:
Accuracy: 0.8957832439430368
Precision: 0.41627749268059494
Recall: 0.4966884459241199
F1: 0.4492320023275047


#### B. Decision Tree

In [55]:
# define the model characteristics
model_name = 'bow_DT_o'
classifier = 'DT'

In [56]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, tree, param_grid_DT, X_train_bow_ove, X_test_bow, y_train_bow_ove, y_test,
                   vectorizer, FS, classifier, resampling)


best parameters: {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 1}
Results for bow_DT_o:
Accuracy: 0.833179212132421
Precision: 0.2879363227925523
Recall: 0.5086591393544257
F1: 0.35134370930114506


#### C. Support Vector Machine

In [57]:
# define the model characteristics
model_name = 'bow_svm_o'
classifier = 'SVM'

In [None]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, svm, param_grid_svm, X_train_bow_ove, X_test_bow, y_train_bow_ove, y_test,
                   vectorizer, FS, classifier, resampling)

#### D. Random Forest Classifier

In [None]:
# define the model characteristics
model_name = 'bow_rf_o'
classifier = 'RF'

In [None]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, rfc, param_grid_rf, X_train_bow_ove, X_test_bow, y_train_bow_ove, y_test,
                   vectorizer, FS, classifier, resampling)

#### E. Adaboost classifier

In [None]:
# define the model characteristics
model_name = 'bow_ada_o'
classifier = 'ADA'

In [None]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, ada, param_grid_ada, X_train_bow_ove, X_test_bow, y_train_bow_ove, y_test,
                   vectorizer, FS, classifier, resampling)

In [None]:
# write away results
results_all_df.to_csv('./Output/Model performance/results_BOW.csv', index = False, header = True)

In [None]:
# Write the dictionary with the best parameters away
with open('./Output/parameters/BOW.json', 'w') as file:
    json.dump(best_params_dict, file)

## 3. Pretrained embeddings

### 3.1 Google news embeddings

Uses continious skipgram: http://vectors.nlpl.eu/repository/?ref=blog.paperspace.com

https://www.analyticsvidhya.com/blog/2020/03/pretrained-word-embeddings-nlp/


In [26]:
# import pretrained model
google_news = KeyedVectors.load_word2vec_format('./data/pretrained/GoogleNews-vectors-negative300.bin', binary=True)

In [27]:
# Download the "word2vec-google-news-300" embeddings
#w2v_google_news = gensim.downloader.load('word2vec-google-news-300')

In [71]:
def vectorize_pretrained(headline, pre_trained_model, size):
    
    words = headline.split()
    words_vecs = [pre_trained_model[word] for word in words if word in pre_trained_model]
    
    if len(words_vecs) == 0:
        return np.zeros(size)
    
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [67]:
X_train_google_news = np.vstack([vectorize_pretrained(headline, google_news, 300) for headline in X_train])
X_test_google_news = np.vstack([vectorize_pretrained(headline, google_news, 300) for headline in X_test])

In [48]:
len(X_train_google_news)

43254

In [49]:
len(X_train_cbow)

43254

In [60]:
# define the model characteristics
model_name = 'cbow_log_w'
classifier = 'logR'

# Initialize the classifier
logreg = LogisticRegression(random_state = 7)

# Define the parameter grid
param_grid_log = {
    'penalty': ['None','l1', 'l2'], # normal, lasso or ridge
    'C': [0.1, 1, 10]               # The inverse penalization term (smaller is higher penalization)
}


In [64]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, logreg, param_grid_log, X_train_google_news, X_test_google_news, y_train, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 10, 'penalty': 'l2'}
Results for cbow_log_w:
Accuracy: 0.9141853153319771
Precision: 0.48931567788333724
Recall: 0.3452856247391152
F1: 0.39778611534634684


In [74]:
# Download the "glove-twitter-25" embeddings
glove_twitter = gensim.downloader.load('glove-twitter-100')



In [75]:
X_train_glove = np.vstack([vectorize_pretrained(headline, glove_twitter, 50) for headline in X_train])
X_test_glove = np.vstack([vectorize_pretrained(headline, glove_twitter, 50) for headline in X_test])

In [76]:
# perform a grid search for the logistic regression model
# the results are automatically stored in results_all_df and best_params_dict
perform_grid_search(model_name, logreg, param_grid_log, X_train_glove, X_test_glove, y_train, y_test,
                   vectorizer, FS, classifier, resampling)

best parameters: {'C': 10, 'penalty': 'l2'}
Results for cbow_log_w:
Accuracy: 0.9040133160717588
Precision: 0.404330300307501
Recall: 0.18227815053730115
F1: 0.2288104955249781


## 4. Glove embeddings

In [68]:
!wget --no-check-certificate http://nlp.stanford.edu/data/glove.6B.zip-O /tmp/glove.6B.zip

zsh:1: command not found: wget


In [67]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('../input/glove6b/glove.6B.300d.txt')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

### 5. CNN model