# 1. Bag of words

## A. Data loading

In [1]:
# General Packages #
import os
import pandas as pd
import numpy as np

# Load TQDM to Show Progress Bars #
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

# Sklearn Packages #
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix

# Import SkLearn Classifiers #
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import Lasso, LassoCV, SGDClassifier, LogisticRegression, RidgeCV, RidgeClassifierCV, HuberRegressor, LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import re
from textblob import TextBlob, Word
from nltk.corpus import stopwords
import string

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Artur/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Turn off warnings, just to avoid pesky messages that might cause confusion here
# Remove when testing your own code #
import warnings
warnings.filterwarnings("ignore")

In [47]:
# Change to Working Directory with Training Data # 
os.chdir("/Users/Artur/Desktop/thesis_HIR_versie5/coding")

# Load Training Data #
df_data = pd.read_excel("./data/STOXX EU 600.xlsx", header = 0)

# drop all unnecessary columns
df_data = df_data.drop(df_data.columns[1:13], axis = 1)
df_data = df_data.drop(df_data.columns[31], axis = 1)


In [5]:
# inspect the data
df_data.head(5)


Unnamed: 0,id,Headline,None,New product introduction/\nservice offering,Product/\nservice improvement,Product/\nservice deletion,New geographical market entry,New product/service\n market entry,Expansion in existing market (product/service/geographical),Reducing market presence or\n exiting a market (product/service/geographical),...,Financing,Human resources,R&D-related actions,Licensing,Production-related actions,Supplier/\noutsourcing,Corporate \ngovernance,Legal,Signaling,Symbolic
0,34,Multiutility A2A appoints Andrea Crenna CFO,,,,,,,,,...,,,,,,,1.0,,,
1,39,Italian A2A Reti Elettriche gives application ...,1.0,,,,,,,,...,,,,,,,,,,
2,61,"Italian M&A Deals: Telecom Italia, A2A, Fiat C...",1.0,,,,,,,,...,,,,,,,,,,
3,139,Press release - ORDINARY MEETING OF THE A2A S....,1.0,,,,,,,,...,,,,,,,,,,
4,145,"--Alcoa (AA) Q4 Earnings of $0.04 Ex Items, Re...",1.0,,,,,,,,...,,,,,,,,,,


In [10]:

# Define Model Parameters # 

# Minimum Document Frequency -- Minimum number of times a word needs to occur to be considered #
MINDF = 10

# Maximum Document Frequency -- Maximum share of documents where a word needs to occur to be considered #
MAXDF = 0.8

# Maximum number of features we would want to consider -- ranked by most frequently occurring #
MF=1200

# NGrams -- Number of Word Pairs. Takes the form (Min, Max). E.g. (1, 2) means single words and word pairs # 
NGrams = (1,2)


# Define Tokenizer -- This is a Custom Function that we use #


def textblob_tokenizer(str_input):
    
    # Convert list to string
    input_str = str_input
    
    if isinstance(input_str, list):
        input_str = ' '.join(input_str)
    
    # Remove punctuation
    str_input = str_input.translate(str.maketrans('', '', string.punctuation))
    
    # Define stop words
    stop_words = set(stopwords.words('english'))

    # Tokenize and lemmatize text
    blob = TextBlob(str_input.lower())
    tokens = [Word(token).lemmatize() for token in blob.words]

    # Remove numbers and stop words
    words = [token for token in tokens if not re.match('^\d+$', token) and token not in stop_words]

    return words



# Define Vectorizer #
vec = TfidfVectorizer(max_features= MF,
                      max_df = MAXDF,
                      ngram_range=(1, 2),
                      tokenizer=textblob_tokenizer)



In [11]:
# inspect the cleaned data
df_data['cleaned_headline'] = df_data['Headline'].apply(textblob_tokenizer)

# check the data
df_data[["cleaned_headline", "Headline"]]

Unnamed: 0,cleaned_headline,Headline
0,"[multiutility, a2a, appoints, andrea, crenna, ...",Multiutility A2A appoints Andrea Crenna CFO
1,"[italian, a2a, reti, elettriche, give, applica...",Italian A2A Reti Elettriche gives application ...
2,"[italian, deal, telecom, italia, a2a, fiat, ch...","Italian M&A Deals: Telecom Italia, A2A, Fiat C..."
3,"[press, release, ordinary, meeting, a2a, spa, ...",Press release - ORDINARY MEETING OF THE A2A S....
4,"[alcoa, aa, q4, earnings, ex, item, result, ne...","--Alcoa (AA) Q4 Earnings of $0.04 Ex Items, Re..."
...,...,...
54053,"[zurich, insurance, earns, outstanding, achiev...",Zurich Insurance earns Outstanding Achievement...
54054,"[moody, assigns, p1, commercial, paper, rating...",Moody's assigns P-1 commercial paper ratings t...
54055,"[uk, contract, award, zurich, insurance, uk, w...",UK Contract Awards: Zurich Insurance UK Wins 2...
54056,"[zurich, insurance, back, project, restoration...",Zurich Insurance to Back Project on Restoratio...


In [50]:
# Melt category columns into a single column
category_columns = ['None', 'New product introduction/\nservice offering',
                    'Product/\nservice improvement', 'Product/\nservice deletion',
                    'New geographical market entry', 'New product/service\n market entry',
                    'Expansion in existing market (product/service/geographical)',
                    'Reducing market presence or\n exiting a market (product/service/geographical)',
                    'Advertising actions', 'Pricing actions', 'Customer service/relations',
                    'Merger & \nacquisitions', 'Joint venture', 'Strategic alliance', 
                    'External venturing', 'De-venturing', 'Divestiture', 'Reorganisation/\nrestructuring',
                    'Spin-off', 'Financing', 'Human resources', 'R&D-related actions', 'Licensing', 
                    'Production-related actions', 'Supplier/\noutsourcing', 'Corporate \ngovernance',
                    'Legal', 'Signaling', 'Symbolic']
df = pd.melt(df_data, id_vars=['id', 'Headline'], value_vars=category_columns, var_name='category', value_name='category_value')
df = df[df.category_value == 1]
df = df.drop('category_value', axis=1)

# Store Data in Lists for Text Classification #
IDs = np.array(df['id'].values.tolist())
headlines = np.array(df['Headline'].values.tolist())
Classes = df['category'].values.tolist()

# Perform vectorization and extract feature names #
Abstract_Vectors = vec.fit_transform(headlines)
FEATURENAMES = vec.get_feature_names_out()

In [13]:
# Perform vectorization and extract feature names #
Abstract_Vectors = vec.fit_transform(headlines)
FEATURENAMES = vec.get_feature_names_out()

In [14]:
print(FEATURENAMES)

['5g' 'ab' 'abb' ... 'york' '£' '£ ucits']


In [15]:
# define a function to calculate classification metrics

def get_classification_metrics(y_true, y_pred, y_pred_proba):
    # Calculate Model Performance Metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    # Calculate the extended G-mean
    g_mean = geometric_mean_score(y_true, y_pred, average='weighted')

    # Calculate the MAUC score
    mauc = np.round(roc_auc_score(y_true, y_pred_proba, multi_class='ovo', average='macro'), 3)

    return accuracy, precision, recall, f1, g_mean, mauc



In [45]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from imblearn.combine import SMOTETomek
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import smote
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

# Define the Set of Classifiers and their parameters
CLASSIFIERS = [
               ["RUSBoost", RUSBoostClassifier(random_state=1)],
               ["SMOTEBoost (AdaBoost)", make_pipeline(SMOTE(random_state=1), AdaBoostClassifier(random_state=1))],
               ["SMOTEBoost (Gradient Boosting)", make_pipeline(SMOTE(random_state=1), GradientBoostingClassifier(random_state=1))],
               ["Random Forest", RandomForestClassifier(n_estimators=100)],
              ]

# Number of Folds (Splits) for Cross Validation #
NUM_OF_SPLITS = 5

# Define whether you want to manually reweight the sample by oversampling the smaller class 
Reweight = False

# Define arrays in which to store classification outputs # 
RESULTS = []
Classified_Values =[]
Classified_Values_p =[]

# Loop Through Different Classifiers #
for CL in tqdm_notebook(CLASSIFIERS, desc = "Evaluating Classifiers"):

    # Extract Classifier Names & Model #
    name  = CL[0]
    Model = CL[1]

    # Define Arrays to store Actual, Predicted and Ids variables (Because we are shuffling them in next step) # 
    y_actual = []
    y_predicted = []
    y_predicted_proba = []

    id_s = []

    # Loop through K Folds and Repeat Cross Validation #
    KFoldSplitter = StratifiedKFold(n_splits=NUM_OF_SPLITS, shuffle=True, random_state=1)
    
    for train_i, test_i in tqdm_notebook(KFoldSplitter.split(Abstract_Vectors, Classes), 
                                         desc = 'Cross-Validating',
                                         leave = False,
                                         total = NUM_OF_SPLITS):

        # Select Rows in Data Based on Indexes [train_i, test_i]
        Y = np.asarray(Classes)

        train_X, test_X = Abstract_Vectors[train_i], Abstract_Vectors[test_i]
        train_y, test_y = Y[train_i], Y[test_i]
        Train_IDs, Test_IDs = IDs[train_i], IDs[test_i]

        # solving class imbalance issues      
        temp_y = list(train_y)
        temp_X = train_X.todense().tolist()

        if Reweight == True:
            
            # Use SMOTE and Tomek links to handle class imbalance
            smt = SMOTETomek(random_state=1)
            X_train_balanced, y_train_balanced = smt.fit_resample(train_X, train_y)
            
        else:
            X_train_balanced, y_train_balanced = train_X, train_y


        # Train Model #
        Results = Model.fit(X_train_balanced, y_train_balanced)
        
        # Perform Prediction on Holdout Sample # 
        y_pred = Model.predict(test_X.toarray())
        y_pred_proba = Model.predict_proba(test_X.toarray())

        # Add to List with Final Results # 
        y_actual = y_actual + list(test_y)
        y_predicted = y_predicted + list(y_pred)
        y_predicted_proba = y_predicted_proba + list(y_pred_proba)
        id_s = id_s + list(Test_IDs)

    # ---------------------------------------------------------- #
    # This runs only after all of the folds have been classified # 
    # ---------------------------------------------------------- #

    # Calculate classification metrics
    Accuracy, Precision, Recall, F1, G_mean, MAUC = get_classification_metrics(y_actual, y_predicted, y_predicted_proba)


    # Add Classification Performance Metrics to List #
    RESULTS.append([name, np.round(Accuracy, 3),
                          np.round(Precision, 3),
                          np.round(Recall, 3),
                          np.round(F1, 3),
                          G_mean,
                          MAUC])

    # Add Classification Results to List # 
    Classified_Values.append(list(zip(len(id_s)*[name],id_s, y_actual, y_predicted, 
                                       len(id_s)*[G_mean], len(id_s)*[MAUC])))


Evaluating Classifiers:   0%|          | 0/4 [00:00<?, ?it/s]

Cross-Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Cross-Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Cross-Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Cross-Validating:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
/Users/Artur/opt/anaconda3/lib/python3.8/site-packages/imblearn/ensemble

In [46]:
print(RESULTS)

[['RUSBoost', 0.99, 0.4, 0.01, 0.6, 0.076, 0.1, 0.146, 0.06, 0.2732410795797511, 0.624], ['SMOTEBoost (AdaBoost)', nan, 0.169, nan, 0.831, 0.02, 0.1, 0.117, 0.053, 0.1403117599673827, 0.598], ['SMOTEBoost (Gradient Boosting)', 0.994, 0.719, 0.006, 0.281, 0.729, 0.224, 0.492, 0.282, 0.8463869574938179, 0.878], ['Random Forest', 0.992, 1.0, 0.008, 0.0, 0.904, 0.453, 0.232, 0.287, 0.7114593810370702, 0.813]]


In [49]:
# Convert List of Model Performance Metrics to Dataframe #
RESULTS_TABLE = pd.DataFrame(RESULTS, columns = ["Name", "True-Positives", 
                                                 "False-Negatives", "False-Positives", 
                                                 "True-Negatives","Accuracy", 
                                                 "Precision", "Recall", "F1", "G_mean", "MAUC"] )
RESULTS_TABLE["Type"] = "Bag of Words"
RESULTS_TABLE = RESULTS_TABLE[["Name", "True-Positives", 
                                                 "False-Negatives", "False-Positives", 
                                                 "True-Negatives","Accuracy", 
                                                 "Precision", "Recall", "F1", "G_mean", "MAUC"]]

# Output Results #
RESULTS_TABLE.sort_values("Accuracy", ascending = False ).to_csv("./Output/Model Performance/BOW Model Classification Performance.csv")

# Display Results -- Out of Sample (Holdout) prediction -- Sorted by Accuracy #
RESULTS_TABLE.sort_values("Accuracy", ascending = False )

Unnamed: 0,Name,True-Positives,False-Negatives,False-Positives,True-Negatives,Accuracy,Precision,Recall,F1,G_mean,MAUC
3,Random Forest,0.992,1.0,0.008,0.0,0.904,0.453,0.232,0.287,0.711459,0.813
2,SMOTEBoost (Gradient Boosting),0.994,0.719,0.006,0.281,0.729,0.224,0.492,0.282,0.846387,0.878
0,RUSBoost,0.99,0.4,0.01,0.6,0.076,0.1,0.146,0.06,0.273241,0.624
1,SMOTEBoost (AdaBoost),,0.169,,0.831,0.02,0.1,0.117,0.053,0.140312,0.598
