# Vectorization

# 0. Data loading

In [1]:
# General Packages #
import os
import pandas as pd
import numpy as np
import string
import re
from scipy.stats import randint
import random


# Load TQDM to Show Progress Bars #
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# Sklearn Packages #
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# NLTK Packages #
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob, Word

# Import necessary libraries for handling imbalanced data
from imblearn.metrics import geometric_mean_score
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler



In [2]:
# Turn off warnings, just to avoid pesky messages that might cause confusion here
# Remove when testing your own code #
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Change to Working Directory with Training Data # 
os.chdir("/Users/Artur/Desktop/thesis_HIR_versie5/coding")
#os.chdir("/Users/juarel/Desktop/studies artur/thesis_HIR/coding")

# Load Training Data #
df_train = pd.read_csv("./data/train_adjusted.csv", header = 0)
df_test = pd.read_csv("./data/test_adjusted.csv", header = 0)

# inspect the data
df_train.head(5)

Unnamed: 0,id,Headline,category
0,194578,Head Line: US Patent granted to BASF SE (Delaw...,
1,564295,Societe Generale Launches a Next-Generation Ca...,
2,504138,BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...,
3,91379,ASML: 4Q Earnings Snapshot,
4,265750,Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...,


In [4]:
# read in the data about the different companies
companies = pd.read_excel("./data/companies.xlsx", header = 0)
companies.head(5)

Unnamed: 0,NameCompany
0,Andritz AG
1,ams AG
2,voestalpine AG
3,OMV AG
4,Wienerberger AG


# 1. Data cleaning

The first step when building our model is to clean the data. To perform this step, we need to define a custom tokenizer that will serve as input in our vectorizers. This tokenizer needs to fulfill the following criteria:

1. We convert the strings to lowercase
2. We remove currency symbols
3. We remove punctuation from the text
4. We remove English stopwords from the text as these do not provide any information to our model
5. We remove words that with information linked to a specific company
6. We remove words that consist out of a single character
7. We lemmatize the words to reduce the words to their base form

#### Create a list with words linked to a specific company

First, we create a set of all the unique words that could be linked to a company. These are the full company names (including legal suffixes) of which part were used to retrieve the newswires and press releases from the NexisUni database. 

Then, we want to determine the set of words that does not contain any information about the company, such as legal prefixes. Therefore, we inspect which words are most frequently used in the names of the companies. When we inspected the results, we saw 1021 of the 1113 words were only used once in all the company names. These words can be seen as to company specific and will be removed from our headlines.

In [5]:
# define a list with the companies names to use in the tokenizer
# Do not use this information in your model
company_info = set()

for name in companies['NameCompany']:
    name = name.lower()
    words = name.split()
    company_info.update(words)

len(company_info) 

1137

Determine a set of words that does not contain any information about the company:

1. First, I convert the company names into lowercase and split the words. These results are stored in the column 'cleaned_name'. 
2. Then, I retrieve all these words and store them in the array 'company_names_array'. Note that these are not unique words, but just all the cleaned words from each company joined into one array.
3. Next, I store the frequency of each word in a dataframe. 
4. Finally, I can determine which words are to company specific to be included in the data.

In [6]:
# Define a list with only the companies names
def company_name_tokenizer(name):
    # Remove special characters and digits
    name = re.sub(r"[^\w\s]", "", name)
    
    # Convert the name to lowercase
    name = name.lower()
    
    # Split the name into individual words
    words = name.split()
    
    return words

companies['cleaned_name'] = companies['NameCompany'].apply(company_name_tokenizer)
companies.head(5)

Unnamed: 0,NameCompany,cleaned_name
0,Andritz AG,"[andritz, ag]"
1,ams AG,"[ams, ag]"
2,voestalpine AG,"[voestalpine, ag]"
3,OMV AG,"[omv, ag]"
4,Wienerberger AG,"[wienerberger, ag]"


In [7]:
# Concatenate all the values from the cleaned company names and store them in an array
company_names_array = np.concatenate(companies['cleaned_name'].values)

# Count the frequency of each word in the array
frequent_company_info = np.unique(company_names_array, return_counts=True)

# Store the results in da dataframe
word_frequencies = pd.DataFrame({'Word': frequent_company_info[0], 
                                'Count': frequent_company_info[1]})

# Sort the dataframe in descending order by the 'Count' column
word_frequencies = word_frequencies.sort_values('Count', ascending=False)
#word_frequencies['Count'].value_counts()

Create a set of the words that are not linked to a specific company. In other terms, words that were used more than once in a company description.

In [8]:
# Filter the dataframe based on the count threshold
general_voc = word_frequencies[word_frequencies['Count'] >= 2]['Word'].tolist()

# Create a set from the filtered values
general_voc = set(general_voc)

Remove these words from the initial set of words with all the company information

In [9]:
# remove these words from the set company info
company_info = company_info.difference(general_voc)

Next, define the customer tokenizer that will be used as input in our vectorizers. Therefore, we use the information in company_info that we have just defined.

In [10]:
def textblob_tokenizer(str_input):
    
    # Convert list to string
    input_str = str_input
    if isinstance(input_str, list):
        input_str = ' '.join(input_str)
        
    # Remove currency symbols
    str_input = re.sub(r'\$|£|€', '', str_input)
    
    # Remove punctuation
    str_input = str_input.translate(str.maketrans('', '', string.punctuation))
    
    # Define stop words
    stop_words = set(stopwords.words('english'))

    # Tokenize text
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    
    # Remove numbers, stop words, company information and words with one character
    words = [word for word in tokens if not re.match('^\d+$', word) and word not in stop_words
                                        and word not in company_info and len(word) > 1]

    
    # Lemmatize words
    words = [Word(word).lemmatize() for word in words]
    
    return words

In [11]:
# inspect the cleaned data
df_train['cleaned_headline'] = df_train['Headline'].apply(textblob_tokenizer)

# check the data
df_train[["cleaned_headline", "Headline"]]

Unnamed: 0,cleaned_headline,Headline
0,"[head, u, patent, granted, se, delaware, may, ...",Head Line: US Patent granted to BASF SE (Delaw...
1,"[societe, generale, launch, nextgeneration, ca...",Societe Generale Launches a Next-Generation Ca...
2,"[plc, form, communication]",BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...
3,"[4q, earnings, snapshot]",ASML: 4Q Earnings Snapshot
4,"[form, investment, manager, group, plc]",Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...
...,...,...
43249,"[system, asa, tom, purchase, share]",Tomra Systems ASA: TOM: Purchase of own shares
43250,"[swiss, federal, institute, intellectual, gran...",Swiss Federal Institute of Intellectual Proper...
43251,"[icon, pfizer, join, addplan, df, consortiumne...",ICON: Pfizer and Roche Join ADDPLAN DF Consort...
43252,"[plc, transaction, share]",Rio Tinto PLC Transaction in Own Shares -3-


#### define functions needed in this notebook

Before we continue, we first define some useful functions and parameters that we will need later in this notebook:

1. get_classification_metrics: Create a function that return the classification metrics for each model. The precision, recall and f1 score are all determined using the average value of all classes, without adjusting weights to these classes.

2. create_results_df(): Create a function that creates a dataframe where the main classification metrics can be stored for each model.

3. Define the number of splits, the stratified cross validator to ensure class frequencies are considered, and the scoring metric.

4. Define a function that trains the defined model, depending on the vectorizer, the input data, the classifier and its parameter grid.


In [12]:
# 1. Function that returns classication metrics
def get_classification_metrics(y_true, y_pred):
    # Calculate Model Performance Metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')


    return accuracy, precision, recall, f1


#2. Function that stores classifcation results 
def create_results_df(model_name):
    
    # Create an empty DataFrame with the model name as the index
    results_df = pd.DataFrame(index=[model_name])

    # Add columns for the metrics
    columns = ['accuracy', 'precision', 'recall', 'f1']
    for col in columns:
        results_df[col] = 0

    return results_df


In [13]:
# Create an empty dataframe to store the results of all the models
results_all_df = pd.DataFrame()

# Add columns for the metrics
columns = ['vectorizer', 'FS', 'classifier', 'resampling','accuracy', 'precision', 'recall', 'f1']
for col in columns:
    results_all_df[col] = 0


In [14]:
# Create a dataframe to store the results of different max features in vectorizer
results_vec_df = pd.DataFrame()

# Add columns for the metrics
columns = ['vectorizer', 'FS', 'classifier', 'resampling', 'max_features', 'accuracy', 'precision', 'recall', 'f1']
for col in columns:
    results_vec_df[col] = 0

In [15]:
# Define the number of folds for cross-validation
n_splits = 5

# Initialize the stratified k-fold object
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42) # ensures class balances are kept

# Define the scoring metric
scoring = make_scorer(f1_score, average= 'macro')


In [16]:
# define the independent and dependent variables
X_train = df_train['Headline']
X_test = df_test['Headline']

y_train = df_train['category']
y_test = df_test['category']

In [17]:
# create an empty dictionary to store the optimal parameters
best_params_dict = {}

In [18]:
def perform_grid_search(name, model, param_grid, X_train, X_test, y_train, y_test,
                       vectorizer, FS, classifier, resampling, max_features):
    
    # Define a seed value
    random.seed(7)
        
    # Perform the grid search using cross-validation
    grid_search = GridSearchCV(model, param_grid, cv=skf, scoring=scoring)
    grid_search.fit(X_train, y_train)

    # Get the best model and its hyperparameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Store the best parameters for the current category in the dictionary
    best_params_dict[name] = best_params
    #print(f'best parameters: {best_params}')

    # Retrain the best model with the whole training set
    best_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = best_model.predict(X_test)
    
    # Calculate the probabilities (not for SVM as this is not possible)
    if classifier != 'SVM':
        y_pred_proba = best_model.predict_proba(X_test)
        
        # Find the highest probability for each observation
        highest_prob = np.amax(y_pred_proba, axis = 1)
    
        # Create a DataFrame with test observations, highest probabilities, and predicted classes
        predictions_df = pd.DataFrame({'Observation_nr': y_test.index, 'Probability': highest_prob, 'Prediction': y_pred})
        
    else:
        # Create a DataFrame with test observations and predicted classes
        predictions_df = pd.DataFrame({'Observation_nr': y_test.index, 'Prediction': y_pred})
        
    # Store the final predictions with its probability for the test set
    predictions_df.to_csv(f'./Output/predictions/{name}.csv', index = False, header = True)
    #predictions_df.to_excel(f'./Output/predictions/{name}.xlsx', index = False, header = True)

    # Calculate the classification metrics
    accuracy, precision, recall, f1 = get_classification_metrics(y_test, y_pred)
    
    # print the results
    #print(f'Results for {name}:')
    #print(f'Accuracy: {accuracy}')
    #print(f'Precision: {precision}')
    #print(f'Recall: {recall}')
    print(f'F1: {f1}')
    
    # add the results to the dataframe with the results for different max_features
    results_vec_df.loc[name] = [vectorizer, FS, classifier, resampling, max_features, accuracy, precision, recall, f1]


# 2. Vectorizers

In this notebook, I will test the performance of two vectorizers to transform the textual data into numerical representations. For each vectorizer, I will test 5 base classifiers. Further, will try to adress the class imbalance issue in the dataset with two resampling techniques. For each base classifier, I will train the model once without resampling technique, once with random undersampling and once with oversampling through SMOTE. 

## 2.1 Bag of words

The first vectorizer that we will use to transform our textual data into numerical presentations is the bag of words procedure or BOW. In this approach, each headline is treated as a collection of individual words. For each feature of the model, each document has a zero or one score indicating if the feature (=the word) is present in the headline. This is a simple and inexpensive approach to represent textual data into a numerical form. 

Define the parameters of the vectorizer

In [19]:
# Minimum Document Frequency -- Minimum number of times a word needs to occur to be considered #
MINDF = 2

# Maximum Document Frequency -- Maximum share of documents where a word needs to occur to be considered #
MAXDF = 0.9

# Maximum number of features we would want to consider -- ranked by most frequently occuring #
MF= 10000

# NGrams -- Number of Word Pairs. Takes the form (Min, Max). E.g. (1, 2) means single words and word pairs # 
NGrams = (1,2)

Define the vectorizer itself

In [20]:
# define the vectorizer
vec_bow = CountVectorizer(max_features= MF,
                          min_df = MINDF,
                          max_df = MAXDF,
                          ngram_range=(1, 2),
                          tokenizer=textblob_tokenizer)

In [21]:
# Define the resampling technique
vectorizer = 'BOW'
resampling = 'None'
FS = 'None' # Feature selection

Transform the textual data into numerical representations with the BOW vectorizer

In [22]:
# define the model characteristics
model_name = 'test_max_features'
classifier = 'logR'

# Initialize the classifier
logreg = LogisticRegression(random_state = 7)

# Define the parameter grid
param_grid_log = {
    'penalty': ['None','l1', 'l2'], # normal, lasso or ridge
    'C': [0.1, 1, 10]               # The inverse penalization term (smaller is higher penalization)
}


In [23]:
for i in range(2000, 20001, 1000):
    
    # define the vectorizer
    vec_bow = CountVectorizer(max_features= i,
                          min_df = MINDF,
                          max_df = MAXDF,
                          ngram_range=(1, 2),
                          tokenizer=textblob_tokenizer)
        
    X_train_bow = vec_bow.fit_transform(X_train)
    X_test_bow = vec_bow.transform(X_test)
    perform_grid_search(model_name, logreg, param_grid_log, X_train_bow, X_test_bow, y_train, y_test,
                   vectorizer, FS, classifier, resampling, i)
        

best parameters: {'C': 10, 'penalty': 'l2'}
Results for bow_log_w:
Accuracy: 0.9183465877566118
Precision: 0.5293491393225163
Recall: 0.4198549950027333
F1: 0.46548206280088783
best parameters: {'C': 10, 'penalty': 'l2'}
Results for bow_log_w:
Accuracy: 0.9179766968744221
Precision: 0.526456775712849
Recall: 0.4138661059394787
F1: 0.4597201378938184


KeyboardInterrupt: 

## 2.1.1 Test performance nr of features

## 2.1.2 Evaluate SVD performance

In [24]:
from sklearn.datasets import make_classification
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from scipy.sparse import csr_matrix

In [25]:
# Minimum Document Frequency -- Minimum number of times a word needs to occur to be considered #
MINDF = 2

# Maximum Document Frequency -- Maximum share of documents where a word needs to occur to be considered #
MAXDF = 0.9

# Maximum number of features we would want to consider -- ranked by most frequently occuring #
MF= 10000

# NGrams -- Number of Word Pairs. Takes the form (Min, Max). E.g. (1, 2) means single words and word pairs # 
NGrams = (1,2)

Define the vectorizer itself

In [26]:
# define the vectorizer
vec_bow = CountVectorizer(max_features= MF,
                          min_df = MINDF,
                          max_df = MAXDF,
                          ngram_range=(1, 2),
                          tokenizer=textblob_tokenizer)

In [27]:
X_train_bow = vec_bow.fit_transform(X_train)
X_test_bow = vec_bow.transform(X_test)

In [28]:
# Convert the sparse matrix to a dense array
X_train_bow_arr = X_train_bow.toarray()

In [29]:
y_train_arr = y_train.values
y_train_arr

array(['None', 'None', 'None', ..., 'Strategic alliance', 'None', 'None'],
      dtype=object)

In [30]:
# get a list of models
def get_models_svd():
    models = dict()
    for i in range(1000, 9001, 1000):
        steps = [('svd', TruncatedSVD(n_components=i)), ('m', LogisticRegression())]
        models[str(i)] = Pipeline(steps=steps)
    return models

In [31]:
# evaluate a give model using cross-validation
def evaluate_model(model, X, y):
    cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(model, X, y, scoring= scoring, cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [32]:
# get the models to evaluate
models = get_models_svd()

# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X_train_bow_arr, y_train_arr)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

KeyboardInterrupt: 

In [None]:
# plot model performance for comparison
import matplotlib.pyplot as plt

plt.boxplot(results, labels=names, showmeans=True)
plt.xticks(rotation=45)
plt.show() 

In [62]:
# load predictions best model
#predictions = pd.read_csv("./Output/predictions/bow_log_w.csv", header = 0)
#predictions

Unnamed: 0,Observation_nr,Probability,Prediction
0,0,0.928801,
1,1,0.999932,
2,2,0.999938,
3,3,0.999660,
4,4,0.999934,
...,...,...,...
10809,10809,0.999587,
10810,10810,0.999996,
10811,10811,0.999998,
10812,10812,0.999971,


In [63]:
# Reset index of df_test and create a new column 'Observation_nr'
#df_test = df_test.reset_index().rename(columns={'index': 'Observation_nr'})

# Merge the DataFrames based on 'Observation_nr'
#merged_df = df_test.merge(predictions, on='Observation_nr', how='inner')

In [64]:
# Print the merged DataFrame
#merged_df = merged_df.drop(['id', 'Observation_nr'], axis=1)

In [67]:
#merged_df.to_excel('Bettina.xlsx', index=False)

Exception ignored in: <function SeekableUnicodeStreamReader.__del__ at 0x7ff518f170d0>
Traceback (most recent call last):
  File "/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/nltk/data.py", line 1159, in __del__
    if not self.closed:
  File "/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/nltk/data.py", line 1173, in closed
    return self.stream.closed
AttributeError: 'SeekableUnicodeStreamReader' object has no attribute 'stream'
