In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, \
    confusion_matrix, roc_auc_score

In [None]:
# Location of the raw RFQ export, relative to the notebook's working directory
path = '../RFQ_Data_Challenge_HEC.csv'

# Read the whole CSV into memory, then preview its first rows
data = pd.read_csv(filepath_or_buffer=path)
data.head()

In [None]:
def preprocess_dataframe(df):
    """
    Clean the raw RFQ DataFrame and derive date features.

    Steps, in order:
    1. Drop rows with a null in any of the mandatory columns.
    2. Parse 'Maturity' and 'Deal_Date' as datetimes; parse
       'AssumedMaturity' and 'YTWDate' with unparseable values set to NaT.
    3. Strip the 'M' from 'Frequency', then cast 'B_Price',
       'Total_Requested_Volume' and 'Frequency' to int (fractional parts
       are truncated). Rows whose value is not numeric are dropped.
    4. Keep only rows with B_Price >= 20 and a maturity year >= 2021.
    5. Add year / month / day columns for the deal and maturity dates and
       'Days_to_Maturity' (Maturity - Deal_Date, in days).
    6. Fill missing 'AssumedMaturity' with 'Maturity' and missing 'Tier'
       with 'UNKNOWN'.
    7. Keep only 'NATIXIS BUY' / 'NATIXIS SELL' rows and encode 'B_Side'
       as 1 (buy) / 0 (sell).
    8. Lower-case 'Sales_Name' and 'company_short_name'.
    9. Drop the unused 'maturity' and 'Cusip' columns.

    Parameters:
    - df (DataFrame): Input DataFrame. It is not modified.

    Returns:
    - DataFrame: Processed DataFrame.
    """

    # Rows missing any of these columns are unusable downstream
    required_columns = [
        'MidYTM', 'Coupon', 'Ccy', 'cusip',
        'cdcissuerShortName', 'Frequency', 'MidPrice', 'cdcissuer',
        'company_short_name', 'BloomIndustrySubGroup', 'B_Price',
        'Total_Traded_Volume_Natixis', 'B_Side',
        'Total_Traded_Volume_Away', 'Total_Requested_Volume',
        'Total_Traded_Volume', 'Type', 'Maturity', 'ISIN', 'Deal_Date']
    # .copy() after every row filter so later column assignments write to an
    # independent frame (never to the caller's data, never to a view)
    df = df.dropna(subset=required_columns).copy()

    df['Maturity'] = pd.to_datetime(df['Maturity'])

    # Convert 'B_Price', 'Total_Requested_Volume', 'Frequency' to integers.
    # astype(str) keeps this working if 'Frequency' was parsed as numeric.
    df['Frequency'] = (df['Frequency'].astype(str)
                       .str.replace('M', '', regex=False))
    numerical_columns = ['B_Price', 'Total_Requested_Volume', 'Frequency']
    for column in numerical_columns:
        df[column] = pd.to_numeric(df[column], errors='coerce')
    # Coercion turns non-numeric text into NaN, so the null rows must be
    # removed *after* it; otherwise the int cast below raises.
    df = df.dropna(subset=numerical_columns).copy()
    df = df.astype({column: int for column in numerical_columns})

    # Fix the error in the B_Price column
    # NOTE(review): 20 is an unexplained cut-off for erroneous prices —
    # confirm with the data owner.
    df = df[df['B_Price'] >= 20].copy()

    # 'maturity' (lower-case) is dropped at the end of this function, so it
    # is deliberately not cleaned or converted here.
    df['Deal_Date'] = pd.to_datetime(df['Deal_Date'])
    df['AssumedMaturity'] = pd.to_datetime(df['AssumedMaturity'],
                                           errors='coerce')
    df['YTWDate'] = pd.to_datetime(df['YTWDate'], errors='coerce')

    # Add year, month, day for clustering
    df['Year_dealdate'] = df['Deal_Date'].dt.year
    df['Month_dealdate'] = df['Deal_Date'].dt.month
    df['Day_dealdate'] = df['Deal_Date'].dt.day
    df['Year_maturity'] = df['Maturity'].dt.year
    df['Month_maturity'] = df['Maturity'].dt.month
    df['Day_maturity'] = df['Maturity'].dt.day

    # Delete maturities smaller than 2021 (as deal dates start in 2021)
    df = df[df['Maturity'].dt.year >= 2021].copy()

    # Compute the number of days between maturity and deal date
    df['Days_to_Maturity'] = (df['Maturity'] - df['Deal_Date']).dt.days

    # Replace null values in 'AssumedMaturity' with values from 'Maturity'
    df['AssumedMaturity'] = df['AssumedMaturity'].fillna(df['Maturity'])

    # Keep the two known sides only, then encode BUY as 1 and SELL as 0
    df = df[df['B_Side'].isin(['NATIXIS SELL', 'NATIXIS BUY'])].copy()
    df['B_Side'] = (df['B_Side'] == 'NATIXIS BUY').astype(int)

    # Assign the filled column back: an in-place fillna on df['Tier'] acts on
    # a temporary object under pandas Copy-on-Write and would be lost.
    df['Tier'] = df['Tier'].fillna('UNKNOWN')

    # Lower string names
    df['Sales_Name'] = df['Sales_Name'].str.lower()
    df['company_short_name'] = df['company_short_name'].str.lower()

    # Drop unused columns
    df = df.drop(columns=['maturity', 'Cusip'])

    return df

In [None]:
# Clean the raw RFQ frame loaded above and keep the result as `df`
df = preprocess_dataframe(df=data)

In [None]:
def column_encoding(df):

    """
    Perform column encoding and data transformations on the input DataFrame.

    The input frame is left untouched; every step works on a new frame.

    Steps:
    1. Drop identifier / leakage columns that are not used as features.
    2. Replace 'Ccy' and 'Type' by the 0/1 flags 'is_euro' and 'is_fixed'.
    3. Map the Fitch, S&P and Moody's ratings to an ordinal scale
       (unmapped values become NaN) and add 'Rating', the row-wise mean of
       the available encoded ratings.
    4. Collapse 'Country' to five named countries plus 'Other' and one-hot
       encode 'Classification' and 'Country'.
    5. Cast boolean columns to 0/1 integers.

    Parameters:
    - df (pd.DataFrame): Input DataFrame containing financial data.

    Returns:
    pd.DataFrame: Transformed DataFrame with column encoding.
    """

    # List of columns to delete
    columns_to_del = ['cusip', 'Instrument', 'Sales_Name', 'Sales_Initial',
                      'Total_Traded_Volume_Natixis',
                      'Total_Traded_Volume_Away',
                      'Total_Traded_Volume', 'cdcissuer',
                      'Tier']

    # Non in-place drop: returns a new frame, so the caller's DataFrame is
    # neither stripped of these columns nor given the flag columns below.
    df = df.drop(columns=columns_to_del)

    # Transform 'Ccy' to 'is_euro' boolean column
    df['is_euro'] = (df['Ccy'] == 'EUR').astype(int)

    # Transform 'Type' to 'is_fixed' boolean column
    df['is_fixed'] = (df['Type'] == 'Fixed').astype(int)

    # Drop the original 'Ccy' and 'Type' columns
    df = df.drop(columns=['Ccy', 'Type'])

    # Ordinal encoding shared by 'Rating_Fitch' and 'Rating_SP'
    # NOTE(review): 'WD' is ranked 1, between 'C' and 'D' — confirm this
    # ordering is intended.
    rating_mapping = {
        'AAA': 22,
        'AA+': 21,
        'AA': 20,
        'AA-': 19,
        'A+': 18,
        'A': 17,
        'A-': 16,
        'BBB+': 15,
        'BBB': 14,
        'BBB-': 13,
        'BB+': 12,
        'BB': 11,
        'BB-': 10,
        'B+': 9,
        'B': 8,
        'B-': 7,
        'CCC+': 6,
        'CCC': 5,
        'CCC-': 4,
        'CC': 3,
        'C': 2,
        'WD': 1,
        'D': 0,
        'NR': np.nan
    }

    # Moody's scale aligned on the same 0-22 range; '(P)' variants share the
    # value of their plain counterpart
    rating_mapping_moodys = {
        'Aaa': 22,
        'Aa1': 21,
        'Aa2': 20,
        '(P)Aa2': 20,
        'Aa3': 19,
        '(P)Aa3': 19,
        'A1': 18,
        '(P)A1': 18,
        'A2': 17,
        '(P)A2': 17,
        'A3': 16,
        '(P)A3': 16,
        'Baa1': 15,
        '(P)Baa1': 15,
        'Baa2': 14,
        '(P)Baa2': 14,
        'Baa3': 13,
        'Ba1': 12,
        'Ba2': 11,
        'Ba3': 10,
        'B1': 9,
        'B2': 8,
        'B3': 7,
        'Caa1': 6,
        'Caa2': 5,
        'Caa3': 4,
        'Ca': 2.5,
        'C': 0
    }

    # .map leaves NaN for any rating string absent from the mapping
    df['Rating_Fitch_encoded'] = df['Rating_Fitch'].map(rating_mapping)
    df['Rating_SP_encoded'] = df['Rating_SP'].map(rating_mapping)
    df['Rating_Moodys_encoded'] = df['Rating_Moodys'].map(
        rating_mapping_moodys
        )

    # Create a unique Rating that averages the 3 Ratings (NaNs are skipped,
    # so the mean uses whichever agencies rated the bond)
    df['Rating'] = df[['Rating_Fitch_encoded', 'Rating_SP_encoded',
                       'Rating_Moodys_encoded']].mean(axis=1)
    df = df.drop(columns=['Rating_Fitch', 'Rating_SP', 'Rating_Moodys'])

    # List of countries to encode
    encode_countries = ['ITALY', 'FRANCE', 'GERMANY', 'NETHERLANDS', 'BELGIUM']

    # Every country outside the list (including missing values) becomes
    # 'Other'
    df['Country'] = df['Country'].where(
        df['Country'].isin(encode_countries), 'Other')

    # NOTE(review): both columns share the 'Class' prefix, so a
    # Classification value equal to a country name or 'Other' would yield
    # duplicate column names. Kept as-is to preserve the output columns.
    df = pd.get_dummies(df,
                        columns=['Classification', 'Country'], prefix='Class')

    # Convert booleans to numeric in df (True -> 1, False -> 0)
    bool_columns = df.select_dtypes(include='bool').columns
    df = df.astype({column: int for column in bool_columns})

    return df

In [None]:
# Encode the cleaned frame; `df` is rebound to the encoded result
df = column_encoding(df=df)

In [None]:
# Order the rows chronologically by deal date, then preview the result
df = df.sort_values(by=['Deal_Date'])
df.head()

## Modelling

In [None]:
# Encode categorical target variable.
# `label_encoder` is reserved for the target so that predicted class ids can
# be mapped back to company names with label_encoder.inverse_transform.
label_encoder = LabelEncoder()
df['y_encoded'] = label_encoder.fit_transform(df['company_short_name'])

# Features (X): everything except the target and the raw datetime columns
X = df.drop(['company_short_name', 'y_encoded',
             'Deal_Date', 'Maturity', 'YTWDate', 'AssumedMaturity'], axis=1)

# Integer-encode the remaining text columns with one dedicated encoder per
# column. Re-fitting `label_encoder` here would overwrite the target's
# classes_ with those of the last feature column.
feature_encoders = {}
for col in X.select_dtypes(include=['object']).columns:
    feature_encoders[col] = LabelEncoder()
    X[col] = feature_encoders[col].fit_transform(X[col])

# Split features and target by side (B_Side: 1 = buy, 0 = sell)
y_buy = df.loc[df['B_Side'] == 1, 'y_encoded']
y_sell = df.loc[df['B_Side'] == 0, 'y_encoded']

X_buy = X[X['B_Side'] == 1]
X_sell = X[X['B_Side'] == 0]

In [None]:
# Hold out 20% of each side for testing, using the same seed for both splits
X_train_buy, X_test_buy, y_train_buy, y_test_buy = train_test_split(
    X_buy, y_buy, test_size=0.2, random_state=42
)
X_train_sell, X_test_sell, y_train_sell, y_test_sell = train_test_split(
    X_sell, y_sell, test_size=0.2, random_state=42
)

## XGBoost

#### For Buy Side

In [None]:
# Multi-class XGBoost model; the class count is the number of distinct
# company names in the full frame
xgb_classifier = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(df['company_short_name'].unique()),
    eval_metric='mlogloss',
)

# Fit on the buy-side training fold
xgb_classifier.fit(X_train_buy, y_train_buy)

# Predicted class ids for the held-out buy-side fold
y_pred = xgb_classifier.predict(X_test_buy)

# Accuracy and per-class precision / recall / F1
accuracy = accuracy_score(y_test_buy, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n',
      classification_report(y_test_buy, y_pred))

# Per-class probabilities feed the one-vs-rest AUC
y_probs = xgb_classifier.predict_proba(X_test_buy)

auc_score = roc_auc_score(y_test_buy, y_probs, multi_class='ovr')
print(f'AUC Score: {auc_score:.2f}')

#### For Sell Side

In [None]:
# Refit the xgb_classifier instance defined in the buy-side cell, this time
# on the sell-side training fold
xgb_classifier.fit(X_train_sell, y_train_sell)

# Predicted class ids for the held-out sell-side fold
y_pred = xgb_classifier.predict(X_test_sell)

# Accuracy and per-class precision / recall / F1
accuracy = accuracy_score(y_test_sell, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n',
      classification_report(y_test_sell, y_pred))

# Per-class probabilities feed the one-vs-rest AUC
y_probs = xgb_classifier.predict_proba(X_test_sell)

auc_score = roc_auc_score(y_test_sell, y_probs, multi_class='ovr')
print(f'AUC Score: {auc_score:.2f}')

## CatBoost


#### For Buy Side

In [None]:
# Multi-class CatBoost model limited to 100 boosting iterations, tracking
# accuracy during training
catboost_classifier = CatBoostClassifier(
    iterations=100,
    loss_function='MultiClass',
    eval_metric='Accuracy',
)

# Fit on the buy-side training fold
catboost_classifier.fit(X_train_buy, y_train_buy)

# Predicted class ids for the held-out buy-side fold
y_pred = catboost_classifier.predict(X_test_buy)

# Accuracy and per-class precision / recall / F1
accuracy = accuracy_score(y_test_buy, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n',
      classification_report(y_test_buy, y_pred))

# Per-class probabilities feed the one-vs-rest AUC
y_probs = catboost_classifier.predict_proba(X_test_buy)

auc_score = roc_auc_score(y_test_buy, y_probs, multi_class='ovr')
print(f'AUC Score: {auc_score:.2f}')

#### For Sell Side

In [None]:
# Refit the catboost_classifier instance defined in the buy-side cell, this
# time on the sell-side training fold
catboost_classifier.fit(X_train_sell, y_train_sell)

# Predicted class ids for the held-out sell-side fold
y_pred = catboost_classifier.predict(X_test_sell)

# Accuracy and per-class precision / recall / F1
accuracy = accuracy_score(y_test_sell, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n',
      classification_report(y_test_sell, y_pred))

# Per-class probabilities feed the one-vs-rest AUC
y_probs = catboost_classifier.predict_proba(X_test_sell)

auc_score = roc_auc_score(y_test_sell, y_probs, multi_class='ovr')
print(f'AUC Score: {auc_score:.2f}')

## LightGBM

#### For Buy Side

In [None]:
# Multi-class LightGBM model; the class count is the number of distinct
# company names in the full frame
lgb_classifier = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=len(df['company_short_name'].unique()),
    metric='multi_logloss',
)

# Fit on the buy-side training fold
lgb_classifier.fit(X_train_buy, y_train_buy)

# Predicted class ids for the held-out buy-side fold
y_pred = lgb_classifier.predict(X_test_buy)

# Accuracy and per-class precision / recall / F1
accuracy = accuracy_score(y_test_buy, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n',
      classification_report(y_test_buy, y_pred))

# Per-class probabilities feed the one-vs-rest AUC
y_probs = lgb_classifier.predict_proba(X_test_buy)

auc_score = roc_auc_score(y_test_buy, y_probs, multi_class='ovr')
print(f'AUC Score: {auc_score:.2f}')

#### For Sell Side

In [None]:
# Refit the lgb_classifier instance defined in the buy-side cell, this time
# on the sell-side training fold
lgb_classifier.fit(X_train_sell, y_train_sell)

# Predicted class ids for the held-out sell-side fold
y_pred = lgb_classifier.predict(X_test_sell)

# Accuracy and per-class precision / recall / F1
accuracy = accuracy_score(y_test_sell, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n',
      classification_report(y_test_sell, y_pred))

# Per-class probabilities feed the one-vs-rest AUC
y_probs = lgb_classifier.predict_proba(X_test_sell)

auc_score = roc_auc_score(y_test_sell, y_probs, multi_class='ovr')
print(f'AUC Score: {auc_score:.2f}')