# Vectorization

# 0. Data loading

In [1]:
# General Packages #
import os
import pandas as pd
import numpy as np
import string
import re
from scipy.stats import randint
import random


# Load TQDM to Show Progress Bars #
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# Sklearn Packages #
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, make_scorer
from sklearn.linear_model import LogisticRegression

# NLTK Packages #
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob, Word

# for SVD
from sklearn.datasets import make_classification
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from scipy.sparse import csr_matrix


In [2]:
# Turn off warnings, just to avoid pesky messages that might cause confusion here
# Remove when testing your own code #
# NOTE: no category or module is passed, so this "ignore" filter applies to every
# warning raised by later cells (deprecation notices included).
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Change to Working Directory with Training Data #
# The project root can be overridden with the THESIS_PROJECT_DIR environment
# variable, so the notebook is not tied to a single machine. When the variable
# is unset, the original hard-coded location is used, exactly as before.
#os.chdir("/Users/Artur/Desktop/thesis_HIR_versie5/coding")
os.chdir(os.environ.get("THESIS_PROJECT_DIR",
                        "/Users/juarel/Desktop/studies artur/thesis_HIR/coding"))

# Load Training Data #
# Paths are relative to the working directory set above.
# NOTE(review): the label column contains the literal string 'None' (see the
# y_train_arr output further down); depending on the pandas version, read_csv's
# default NA handling may parse that string as missing — confirm the labels
# survive loading.
df_train = pd.read_csv("./data/gold_data/train.csv", header = 0)
df_test = pd.read_csv("./data/gold_data/test.csv", header = 0)

# inspect the data
df_train.head(5)

Unnamed: 0,id,Headline,category,cleaned_headline
0,194578,Head Line: US Patent granted to BASF SE (Delaw...,,head u patent granted se delaware may titled c...
1,564295,Societe Generale Launches a Next-Generation Ca...,,societe generale launch nextgeneration card in...
2,504138,BARCLAYS PLC Form 8.3 - EUTELSAT COMMUNICATION...,,plc form communication
3,91379,ASML: 4Q Earnings Snapshot,,4q earnings snapshot
4,265750,Form 8.3 - AXA INVESTMENT MANAGERS : Booker Gr...,,form investment manager group plc


# 1. Define functions and parameters

Before we continue, we first define some useful functions and parameters that we will need later in this notebook:

1. get_classification_metrics: a function that returns the classification metrics (accuracy, precision, recall and F1 score) for a model. Precision, recall and F1 are macro-averaged: each is computed per class and then averaged over all classes with equal weight, regardless of how often each class occurs.

2. Define a dataframe to store the results of training the optimal model with a different number of features.

3. Define a dataframe that stores the results of the models trained with singular value decomposition.

4. Define the number of splits, the stratified cross validator to ensure class frequencies are considered, and the scoring metric.

5. Define a function that trains the defined model, depending on the vectorizer, the input data, the classifier and its parameter grid.


In [4]:
# 1. Function that returns classification metrics
def get_classification_metrics(y_true, y_pred):
    """Score predictions against the true labels.

    Parameters
    ----------
    y_true : true class labels.
    y_pred : predicted class labels, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``; precision, recall and F1 are
        computed with ``average='macro'``.
    """
    accuracy = accuracy_score(y_true, y_pred)

    # The three class-wise metrics share the same macro-averaging setting
    macro = {'average': 'macro'}
    precision = precision_score(y_true, y_pred, **macro)
    recall = recall_score(y_true, y_pred, **macro)
    f1 = f1_score(y_true, y_pred, **macro)

    return accuracy, precision, recall, f1


In [6]:
# Names of the columns the results frame starts out with
columns = ['features', 'performance']

# Start from an empty frame and attach every column in a single step; each
# column is created by assigning the scalar 0 to the (row-less) frame
results_log_df = pd.DataFrame().assign(**dict.fromkeys(columns, 0))


In [7]:
# Columns of the per-model results table: descriptive labels followed by the metrics
columns = ['vectorizer', 'FS', 'classifier', 'resampling', 'max_features', 'accuracy', 'precision', 'recall', 'f1']

# Start from an empty frame and attach every column in a single step; each
# column is created by assigning the scalar 0 to the (row-less) frame
results_vec_df = pd.DataFrame().assign(**dict.fromkeys(columns, 0))

In [8]:
# Number of cross-validation folds
n_splits = 5

# Stratified k-fold splitter: shuffled with a fixed seed so the folds are
# repeatable, and stratified so class balances are kept in every fold
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

# Scorer used for model selection: F1 with average='macro'
scoring = make_scorer(f1_score, average='macro')


In [9]:
# Split each frame into the independent variable (the cleaned headline text)
# and the dependent variable (the category label)
X_train, y_train = df_train['cleaned_headline'], df_train['category']
X_test, y_test = df_test['cleaned_headline'], df_test['category']

In [10]:
# Registry for the optimal hyper-parameters; perform_grid_search fills it,
# using the model name as the key
best_params_dict = dict()

In [11]:
def perform_grid_search(name, model, param_grid, X_train, X_test, y_train, y_test,
                       vectorizer, FS, classifier, resampling, max_features):
    """Tune ``model`` with a cross-validated grid search and evaluate it on the test set.

    The search uses the module-level splitter ``skf`` and scorer ``scoring``.

    Side effects
    ------------
    * stores the best hyper-parameters in ``best_params_dict[name]``;
    * writes the test-set predictions to ``./Output/predictions/{name}.csv``
      (the directory is created when missing);
    * prints the macro F1 score on the test set;
    * writes a row labelled ``name`` into ``results_vec_df``.

    Parameters
    ----------
    name : str
        Identifier of the run; used as dictionary key, file name and row label.
    model : estimator
        Estimator (or pipeline) handed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels; ``y_test`` must expose ``.index``.
    vectorizer, FS, classifier, resampling, max_features :
        Descriptive values copied verbatim into the results row. ``classifier``
        additionally controls the probability output: no probabilities are
        computed when it equals ``'SVM'``.

    Returns
    -------
    None
    """
    # Define a seed value
    # NOTE: this seeds only Python's built-in `random` module.
    random.seed(7)

    # Perform the grid search using cross-validation
    grid_search = GridSearchCV(model, param_grid, cv=skf, scoring=scoring)
    grid_search.fit(X_train, y_train)

    # GridSearchCV is used with its default refit=True, so best_estimator_ has
    # already been refit on the whole training set with the best parameters;
    # fitting it a second time on the same data would only repeat that work.
    best_model = grid_search.best_estimator_

    # Store the best parameters for the current run in the dictionary
    best_params_dict[name] = grid_search.best_params_

    # Make predictions on the test set
    y_pred = best_model.predict(X_test)

    # Assemble the prediction table column by column (insertion order = column order)
    predictions = {'Observation_nr': y_test.index}

    # Probabilities are skipped for SVM runs (as before) and, additionally, for
    # any estimator that does not expose predict_proba, instead of crashing.
    if classifier != 'SVM' and hasattr(best_model, 'predict_proba'):
        y_pred_proba = best_model.predict_proba(X_test)
        # Keep only the highest class probability of each observation
        predictions['Probability'] = np.amax(y_pred_proba, axis = 1)

    predictions['Prediction'] = y_pred
    predictions_df = pd.DataFrame(predictions)

    # Store the final predictions (with probability, when available) for the test set;
    # make sure the target directory exists so to_csv cannot fail on a fresh checkout
    os.makedirs('./Output/predictions', exist_ok=True)
    predictions_df.to_csv(f'./Output/predictions/{name}.csv', index = False, header = True)

    # Calculate the classification metrics
    accuracy, precision, recall, f1 = get_classification_metrics(y_test, y_pred)

    # print the headline metric
    print(f'F1: {f1}')

    # add the results to the dataframe with the results for different max_features
    results_vec_df.loc[name] = [vectorizer, FS, classifier, resampling, max_features, accuracy, precision, recall, f1]


# 2. Vectorizers

## 2.1 Bag of words

Define the parameters of the vectorizer

In [12]:
# Maximum document frequency -- passed to the vectorizer as max_df: terms that occur
# in more than this share of the documents are left out of the vocabulary #
MAXDF = 0.9

# Maximum number of features (vocabulary size) -- passed to the vectorizer as max_features #
MF = 10_000

# N-gram range in the form (min, max) -- (1, 2) means single words and word pairs #
NGrams = (1, 2)

Define the vectorizer itself

In [13]:
# Bag-of-words vectorizer configured from the parameter constants
vec_bow = CountVectorizer(
    ngram_range=NGrams,
    max_df=MAXDF,
    max_features=MF,
)

Transform the textual data into numerical representations with the BOW vectorizer

In [14]:
# Logistic-regression classifier: L2 penalty, C = 10, fixed random state
logreg = LogisticRegression(
    C=10,
    penalty='l2',
    random_state=7,
)


In [17]:
# Sweep over the vocabulary size 'i' (max_features of the vectorizer) and record
# the macro F1 score of the logistic regression on the test set for each value.
# The per-iteration results are collected in a plain list and added to the
# results frame once at the end: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row copies it on every iteration.
feature_sweep_records = []

# Iterate over each value of 'i'
for i in range(2000, 20001, 500):

    # The same classifier instance is refit from scratch in every iteration
    model = logreg

    # define the vectorizer
    vec_bow = CountVectorizer(max_features= i,
                          max_df = MAXDF,
                          ngram_range=NGrams)

    # Define X_train and X_test: the vocabulary is learned on the training
    # data only and then applied unchanged to the test data
    X_train_bow = vec_bow.fit_transform(X_train)
    X_test_bow = vec_bow.transform(X_test)

    # Train the model with the whole training set
    model.fit(X_train_bow, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test_bow)

    # Calculate the macro-averaged F1 score
    f1 = f1_score(y_test, y_pred, average='macro')

    # Store the results in a dictionary
    results = {
        'i': i,
        'f1': f1
    }
    feature_sweep_records.append(results)

# Add all records to the results frame in one step.
# NOTE(review): the records use the keys 'i' and 'f1', while results_log_df was
# created with the columns 'features' and 'performance'; those two columns stay
# empty. The keys are kept as they were in case later cells read 'i'/'f1' —
# confirm which naming is intended.
results_log_df = pd.concat([results_log_df, pd.DataFrame(feature_sweep_records)],
                           ignore_index=True)

## 2.1.1 Test performance nr of features

## 2.1.2 Evaluate SVD performance

In [19]:
# Maximum document frequency -- passed to the vectorizer as max_df: terms that occur
# in more than this share of the documents are left out of the vocabulary #
MAXDF = 0.9

# Maximum number of features (vocabulary size) -- passed to the vectorizer as max_features #
MF = 10_000

# N-gram range in the form (min, max) -- (1, 2) means single words and word pairs #
NGrams = (1, 2)

Define the vectorizer itself

In [20]:
# Bag-of-words vectorizer configured from the parameter constants
vec_bow = CountVectorizer(
    ngram_range=NGrams,
    max_df=MAXDF,
    max_features=MF,
)

In [21]:
# Fit the vectorizer on the training headlines and encode them in one step
X_train_bow = vec_bow.fit_transform(X_train)
# Encode the test headlines with the already-fitted vectorizer (no refitting)
X_test_bow = vec_bow.transform(X_test)

In [22]:
# Convert the sparse matrix to a dense array
# NOTE(review): a dense copy stores every cell explicitly and can become very
# large for a vocabulary of up to MF terms — confirm whether the sparse matrix
# could be passed to the SVD pipelines instead.
X_train_bow_arr = X_train_bow.toarray()

In [23]:
# Extract the training labels from the pandas Series as a plain array
y_train_arr = y_train.values
# Show the array as the cell output
y_train_arr

array(['None', 'None', 'None', ..., 'Strategic alliance', 'None', 'None'],
      dtype=object)

In [24]:
# get a dictionary of models
def get_models_svd(n_components_values=range(3000, 9001, 1000), random_state=None):
    """Build one SVD + logistic-regression pipeline per candidate component count.

    Parameters
    ----------
    n_components_values : iterable of int, optional
        Numbers of SVD components to build a pipeline for. Defaults to
        3000, 4000, ..., 9000 (the previously hard-coded range).
    random_state : int or None, optional
        Passed to ``TruncatedSVD`` so the decomposition can be made
        reproducible. ``None`` (the default) keeps the previous behaviour.

    Returns
    -------
    dict
        Maps ``str(n_components)`` to a ``Pipeline`` with the steps
        ``'svd'`` (``TruncatedSVD``) and ``'m'`` (``LogisticRegression``).
    """
    return {
        str(n_components): Pipeline(steps=[
            ('svd', TruncatedSVD(n_components=n_components, random_state=random_state)),
            ('m', LogisticRegression()),
        ])
        for n_components in n_components_values
    }

In [25]:
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Cross-validate ``model`` on ``X``/``y`` and return the per-fold scores.

    Uses a 5-fold ``StratifiedKFold`` splitter and the module-level ``scoring``
    scorer, runs with ``n_jobs=2``, and re-raises any error that occurs while
    fitting or scoring (``error_score='raise'``).
    """
    return cross_val_score(
        model,
        X,
        y,
        scoring=scoring,
        cv=StratifiedKFold(n_splits=5),
        n_jobs=2,
        error_score='raise',
    )

In [26]:
# Build the candidate SVD pipelines
models = get_models_svd()

# Cross-validate every pipeline; fold scores and pipeline names are kept in
# two parallel lists (consumed by the box plot)
results = []
names = []
for name, model in models.items():
    scores = evaluate_model(model, X_train_bow_arr, y_train_arr)
    results += [scores]
    names += [name]
    # Report mean and standard deviation of the fold scores
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))

KeyboardInterrupt: 

In [None]:
# Box plot of the cross-validation scores, one box per evaluated model
import matplotlib.pyplot as plt

# Draw on an explicitly created axes instead of the implicit current one
fig, ax = plt.subplots()
ax.boxplot(results, labels=names, showmeans=True)
plt.xticks(rotation=45)
plt.show() 

In [62]:
# load predictions best model
#predictions = pd.read_csv("./Output/predictions/bow_log_w.csv", header = 0)
#predictions

Unnamed: 0,Observation_nr,Probability,Prediction
0,0,0.928801,
1,1,0.999932,
2,2,0.999938,
3,3,0.999660,
4,4,0.999934,
...,...,...,...
10809,10809,0.999587,
10810,10810,0.999996,
10811,10811,0.999998,
10812,10812,0.999971,


In [33]:
# Reset index of df_test and create a new column 'Observation_nr'
#df_test = df_test.reset_index().rename(columns={'index': 'Observation_nr'})

# Merge the DataFrames based on 'Observation_nr'
#merged_df = df_test.merge(predictions, on='Observation_nr', how='inner')

In [34]:
# Print the merged DataFrame
#merged_df = merged_df.drop(['id', 'Observation_nr'], axis=1)

In [35]:
#merged_df.to_excel('Bettina.xlsx', index=False)