In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, \
    confusion_matrix, roc_auc_score

In [20]:
path = '/Users/vincenzoalbano/Desktop/natixis/RFQ_Data_Challenge_HEC.csv'
data = pd.read_csv(path)
data.head()

  data = pd.read_csv(path)


Unnamed: 0,Deal_Date,ISIN,cusip,Instrument,Sales_Name,Sales_Initial,company_short_name,B_Price,B_Side,Total_Requested_Volume,...,MidYTM,YTWDate,SpreadvsBenchmarkMid,MidASWSpread,MidZSpread,GSpreadMid,MidModifiedDuration,MidConvexity,MidEffectiveDuration,MidEffectiveConvexity
0,2022-09-15,DE000A14J587,D8397TBT2,TKAGR 25/2/2025 2.500,Blanca Tailpied,BLA,Bnp Reunion,0.0,NATIXIS SELL,77551,...,5.57551,,417.014771,286.230682,295.597916,430.414978,2.23491,8.41587,2.11347,6.82874
1,2022-09-15,DE000A14J587,D8397TBT2,TKAGR 25/2/2025 2.500,Blanca Tailpied,BLA,Cegi Ett,0.0,NATIXIS SELL,8163,...,5.57551,,417.014771,286.230682,295.597916,430.414978,2.23491,8.41587,2.11347,6.82874
2,2022-09-15,DE000A14J587,D8397TBT2,TKAGR 25/2/2025 2.500,Blanca Tailpied,BLA,Qbe,0.0,NATIXIS SELL,20408,...,5.57551,,417.014771,286.230682,295.597916,430.414978,2.23491,8.41587,2.11347,6.82874
3,2022-09-15,DE000A14J587,D8397TBT2,TKAGR 2.5% 25 FEB 2025,Blanca Tailpied,BLA,Cegi Ett,0.0,NATIXIS SELL,16327,...,5.57551,,417.014771,286.230682,295.597916,430.414978,2.23491,8.41587,2.11347,6.82874
4,2022-09-15,DE000A14J587,D8397TBT2,TKAGR 2.5% 25 FEB 2025,Constantin Durie,CON,Scp Laureau-Jeannerot,0.0,NATIXIS SELL,244898,...,5.57551,,417.014771,286.230682,295.597916,430.414978,2.23491,8.41587,2.11347,6.82874


In [116]:
def preprocess_dataframe(df):
    """
    Preprocesses the input DataFrame with the following steps:
    1. Converts 'Deal_Date', 'maturity', columns to datetime.
    2. Converts 'B_Side' column to boolean.
    3. Converts 'B_Price' and 'Total_Requested_Volume' integers.
    4. Fills null values in 'Tier', 'AssumedMaturity'.
    5. Converts 'Frequency' feature values into integers.
    6. Drops the unused 'Cusip' column.
    Parameters:
    - df (DataFrame): Input DataFrame.

    Returns:
    - DataFrame: Processed DataFrame.
    """

    df = df.copy()

    # Drop null values only for columns below the threshold
    columns_to_delete_null_values = [
        'MidYTM', 'Coupon', 'Ccy', 'cusip',
        'maturity', 'cdcissuerShortName', 'Frequency', 'MidPrice', 'cdcissuer',
        'company_short_name', 'BloomIndustrySubGroup', 'B_Price',
        'Total_Traded_Volume_Natixis', 'B_Side',
        'Total_Traded_Volume_Away', 'Total_Requested_Volume',
        'Total_Traded_Volume', 'Type', 'Maturity', 'ISIN', 'Deal_Date']
    df = df.dropna(subset=columns_to_delete_null_values)

    # Convert 'B_Price', 'Total_Requested_Volume', 'Frequency' to integers
    df['Frequency'] = df['Frequency'].str.replace('M', '')
    numerical_columns = ['B_Price', 'Total_Requested_Volume', 'Frequency']
    df.dropna(subset=numerical_columns, inplace=True)
    for column in numerical_columns:
        df[column] = pd.to_numeric(df[column], errors='coerce').astype(int)

    # Fix the error in the B_Price column
    df = df[df['B_Price'] >= 20]

    # Replace NaT with null values in the 'Maturity' column
    df['maturity'].replace({pd.NaT: np.nan}, inplace=True)

    # Convert 'Deal_Date', 'maturity', 'AssumedMaturity', 'YTWDate' to datetime
    df['Deal_Date'] = pd.to_datetime(df['Deal_Date'])
    df['maturity'] = pd.to_datetime(df['maturity'], errors='coerce',                
                                    format='%Y-%m-%d %H:%M:%S.%f')
    df['AssumedMaturity'] = pd.to_datetime(df['AssumedMaturity'],
                                           errors='coerce')
    df['YTWDate'] = pd.to_datetime(df['YTWDate'], errors='coerce')

    # Add year, month, day for clustering
    df['Year_dealdate'] = df['Deal_Date'].dt.year
    df['Month_dealdate'] = df['Deal_Date'].dt.month
    df['Day_dealdate'] = df['Deal_Date'].dt.day
    df['Year_maturity'] = df['maturity'].dt.year
    df['Month_maturity'] = df['maturity'].dt.month
    df['Day_maturity'] = df['maturity'].dt.day

    # Delete maturities smaller than 2021 (as deal dates start in 2021)
    df = df[df['maturity'].dt.year >= 2021]

    # Compute the number of days between maturity and deal date
    df['Days_to_Maturity'] = (df['maturity'] - df['Deal_Date']).dt.days

    # Replace null values in 'AssumedMaturity' with values from 'Maturity'
    df['AssumedMaturity'] = df['AssumedMaturity'].fillna(df['Maturity'])

    # Convert 'B_Side' column to boolean (1 for ' BUY', 0 for ' SELL')
    df = df[df['B_Side'].isin(['NATIXIS SELL', 'NATIXIS BUY'])]
    df['B_Side'] = df['B_Side'].replace({'NATIXIS BUY': 1, 'NATIXIS SELL': 0})

    # Convert null values of 'Tier'
    df['Tier'].fillna('UNKNOWN', inplace=True)

    # Lower string names
    df['Sales_Name'] = df['Sales_Name'].str.lower()
    df['company_short_name'] = df['company_short_name'].str.lower()

    # Drop unused columns
    columns_to_drop = ['Cusip', 'Maturity']
    df.drop(columns=columns_to_drop, inplace=True)

    return df

44:69: W291 trailing whitespace


In [70]:
# Apply preprocessing function

df = preprocess_dataframe(data)

In [93]:
def column_encoding(df):

    """
    Perform column encoding and data transformations on the input DataFrame.

    Parameters:
    - df (pd.DataFrame): Input DataFrame containing financial data.

    Returns:
    pd.DataFrame: Transformed DataFrame with column encoding.
    """

    # List of columns to delete
    columns_to_del = ['cusip', 'Instrument', 'Sales_Name', 'Sales_Initial',
                      'Total_Traded_Volume_Natixis',
                      'Total_Traded_Volume_Away',
                      'Total_Traded_Volume', 'cdcissuer',
                      'Tier']

    df.drop(columns=columns_to_del, axis=1, inplace=True)

    # Transform 'Ccy' to 'is_euro' boolean column
    df['is_euro'] = (df['Ccy'] == 'EUR').astype(int)

    # Transform 'Type' to 'is_fixed' boolean column
    df['is_fixed'] = (df['Type'] == 'Fixed').astype(int)

    # Drop the original 'Ccy' and 'Type' columns
    df = df.drop(['Ccy', 'Type'], axis=1, errors='ignore')

    # Ordinal encoding for 'Rating_Fitch'
    rating_mapping = {
        'AAA': 22,
        'AA+': 21,
        'AA': 20,
        'AA-': 19,
        'A+': 18,
        'A': 17,
        'A-': 16,
        'BBB+': 15,
        'BBB': 14,
        'BBB-': 13,
        'BB+': 12,
        'BB': 11,
        'BB-': 10,
        'B+': 9,
        'B': 8,
        'B-': 7,
        'CCC+': 6,
        'CCC': 5,
        'CCC-': 4,
        'CC': 3,
        'C': 2,
        'WD': 1,
        'D': 0,
        'NR': np.nan
    }

    rating_mapping_moodys = {
        'Aaa': 22,
        'Aa1': 21,
        'Aa2': 20,
        '(P)Aa2': 20,
        'Aa3': 19,
        '(P)Aa3': 19,
        'A1': 18,
        '(P)A1': 18,
        'A2': 17,
        '(P)A2': 17,
        'A3': 16,
        '(P)A3': 16,
        'Baa1': 15,
        '(P)Baa1': 15,
        'Baa2': 14,
        '(P)Baa2': 14,
        'Baa3': 13,
        'Ba1': 12,
        'Ba2': 11,
        'Ba3': 10,
        'B1': 9,
        'B2': 8,
        'B3': 7,
        'Caa1': 6,
        'Caa2': 5,
        'Caa3': 4,
        'Ca': 2.5,
        'C': 0
    }

    df['Rating_Fitch_encoded'] = df['Rating_Fitch'].map(rating_mapping)
    df['Rating_SP_encoded'] = df['Rating_SP'].map(rating_mapping)
    df['Rating_Moodys_encoded'] = df['Rating_Moodys'].map(
        rating_mapping_moodys
        )

    # Create a unique Rating that averages the 3 Ratings
    df['Rating'] = df[['Rating_Fitch_encoded', 'Rating_SP_encoded',
                       'Rating_Moodys_encoded']].mean(axis=1)
    df.drop(columns=['Rating_Fitch', 'Rating_SP',
                     'Rating_Moodys'], axis=1, inplace=True)

    # List of countries to encode
    encode_countries = ['ITALY', 'FRANCE', 'GERMANY', 'NETHERLANDS', 'BELGIUM']

    # Use the apply function with a lambda function to update the 'country'
    df['Country'] = df['Country'].apply(
        lambda x: x if x in encode_countries else 'Other')

    df = pd.get_dummies(df,
                        columns=['Classification', 'Country'], prefix='Class')

    # Convert booleans to numeric in df
    def convert_boolean_to_binary(df):
        for column in df.columns:
            # Check if the column contains boolean values
            if df[column].dtype == bool:
                # Convert boolean to binary integer (True -> 1, False -> 0)
                df[column] = df[column].astype(int)
        return df

    df = convert_boolean_to_binary(df)

    return df

In [95]:
# Apply encoding function
df = column_encoding(df)

KeyError: "['cusip', 'Instrument', 'Sales_Name', 'Sales_Initial', 'Total_Traded_Volume_Natixis', 'Total_Traded_Volume_Away', 'Total_Traded_Volume', 'cdcissuer', 'Tier'] not found in axis"

In [96]:
df.sort_values(by=['Deal_Date'], inplace=True)
df.head()

Unnamed: 0,Deal_Date,ISIN,company_short_name,B_Price,B_Side,Total_Requested_Volume,BloomIndustrySector,BloomIndustryGroup,BloomIndustrySubGroup,cdcissuerShortName,...,Class_Mortgage Finance,Class_Technology,Class_Telecommunications Services,Class_Utilities,Class_BELGIUM,Class_FRANCE,Class_GERMANY,Class_ITALY,Class_NETHERLANDS,Class_Other
566145,2021-09-16,XS2379486884,groupe bpce,99,0,81633,Financial,Diversified Finan Serv,Finance-Other Services,INVENTIVGL,...,0,0,0,0,0,0,0,0,0,1
341068,2021-09-17,XS1751178499,groupe bpce,100,1,81633,Financial,Real Estate,Real Estate Oper/Develop,CNTYGDHD,...,0,0,0,0,0,0,0,0,0,1
478979,2021-09-29,XS1165659514,bnp paribas,98,0,122449,Financial,Investment Companies,Investment Companies,HUARONGFIN,...,0,0,0,0,0,0,0,0,0,1
369735,2021-10-07,US056752AU22,cooperative financiere - cmgm,96,0,408163,Communications,Internet,Web Portals/ISP,BAIDUINC,...,0,1,0,0,0,0,0,0,0,1
626296,2021-10-12,US91282CCJ80,societe generale meung sur loire,99,0,16326531,Government,Sovereign,Sovereign,REPUSA,...,0,0,0,0,0,0,0,0,0,1


## Modelling

In [103]:
# Encode categorical target variable
label_encoder = LabelEncoder()
df['y_encoded'] = label_encoder.fit_transform(df['company_short_name'])

# Split the data into features (X) and target (y) for Buy and Sell side
X = df.drop(['company_short_name', 'y_encoded',
             'Deal_Date', 'maturity', 'YTWDate', 'AssumedMaturity'], axis=1)

y_buy = df.loc[df['B_Side'] == 1, 'y_encoded']
y_sell = df.loc[df['B_Side'] == 0, 'y_encoded']

for col in X.select_dtypes(include=['object']).columns:
    X[col] = label_encoder.fit_transform(X[col])

X_buy = X[X['B_Side'] == 1]
X_sell = X[X['B_Side'] == 0]

In [106]:
# Split the data into training and testing sets
X_train_buy, X_test_buy, y_train_buy, y_test_buy = \
    train_test_split(X_buy, y_buy, test_size=0.2, random_state=42)
X_train_sell, X_test_sell, y_train_sell, y_test_sell = \
    train_test_split(X_sell, y_sell, test_size=0.2, random_state=42)

## XGBoost

#### For Buy Side

In [108]:
# Define the XGBoost classifier with handling categorical variables
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',
                                   num_class=len(df['company_short_name']
                                                .unique()),
                                   eval_metric='mlogloss')

# Train the classifier
xgb_classifier.fit(X_train_buy, y_train_buy)

# Make predictions on the testing set
y_pred = xgb_classifier.predict(X_test_buy)

# Evaluate the model
accuracy = accuracy_score(y_test_buy, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', classification_report(y_test_buy, y_pred))

# Get class probabilities for calculating AUC
y_probs = xgb_classifier.predict_proba(X_test_buy)

# Calculate AUC score
auc_score = roc_auc_score(y_test_buy, y_probs, multi_class='ovr')
print(f'AUC Score: {auc_score:.2f}')

Accuracy: 0.38
Classification Report:
               precision    recall  f1-score   support

           0       0.04      0.02      0.02       123
           1       0.04      0.02      0.03        54
           2       0.31      0.29      0.30       143
           3       0.19      0.08      0.12        60
           4       0.11      0.06      0.08        34
           5       0.45      0.46      0.45       304
           6       0.28      0.13      0.17       126
           7       0.00      0.00      0.00        10
           8       0.22      0.16      0.19       155
           9       0.00      0.00      0.00        19
          10       0.51      0.59      0.55       305
          11       0.29      0.13      0.18        45
          12       0.53      0.53      0.53       388
          13       0.12      0.06      0.08        99
          14       0.00      0.00      0.00        19
          15       0.15      0.08      0.11       144
          16       0.15      0.10      0.1

4:49: E128 continuation line under-indented for visual indent


AUC Score: 0.86


#### For Sell Side

In [110]:
# Train the classifier
xgb_classifier.fit(X_train_sell, y_train_sell)

# Make predictions on the testing set
y_pred = xgb_classifier.predict(X_test_sell)

# Evaluate the model
accuracy = accuracy_score(y_test_sell, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', classification_report(y_test_sell,
                                                        y_pred))

# Get class probabilities for calculating AUC
y_probs = xgb_classifier.predict_proba(X_test_sell)

# Calculate AUC score
auc_score = roc_auc_score(y_test_sell, y_probs, multi_class='ovr')
print(f'AUC Score: {auc_score:.2f}')

Accuracy: 0.40
Classification Report:
               precision    recall  f1-score   support

           0       0.22      0.07      0.11       108
           1       0.10      0.04      0.05       109
           2       0.29      0.35      0.31       167
           3       0.19      0.11      0.14        89
           4       0.25      0.13      0.17        99
           5       0.32      0.34      0.33       319
           6       0.14      0.06      0.08       181
           7       0.33      0.21      0.26        19
           8       0.20      0.13      0.16       129
           9       0.46      0.49      0.48        77
          10       0.49      0.49      0.49       378
          11       0.34      0.25      0.29       156
          12       0.39      0.32      0.35       344
          13       0.00      0.00      0.00        76
          14       0.14      0.15      0.14        20
          15       0.19      0.12      0.14       233
          16       0.23      0.27      0.2

## CatBoost


#### For Buy Side

In [112]:
# Define the CatBoost classifier with handling categorical variables
catboost_classifier = CatBoostClassifier(iterations=100,
                                         loss_function='MultiClass',
                                         eval_metric='Accuracy')

# Train the classifier
catboost_classifier.fit(X_train_buy, y_train_buy)

# Make predictions on the testing set
y_pred = catboost_classifier.predict(X_test_buy)

# Evaluate the model
accuracy = accuracy_score(y_test_buy, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', classification_report(y_test_buy,
                                                        y_pred))

# Get class probabilities for calculating AUC
y_probs = catboost_classifier.predict_proba(X_test_buy)

auc_score = roc_auc_score(y_test_buy, y_probs, multi_class='ovr')
print(f'AUC Score: {auc_score:.2f}')

Learning rate set to 0.5
0:	learn: 0.2480919	total: 1.08s	remaining: 1m 46s
1:	learn: 0.1048532	total: 2.02s	remaining: 1m 39s
2:	learn: 0.2092503	total: 2.9s	remaining: 1m 33s
3:	learn: 0.1947751	total: 3.84s	remaining: 1m 32s
4:	learn: 0.1656801	total: 4.68s	remaining: 1m 28s
5:	learn: 0.1500812	total: 5.53s	remaining: 1m 26s
6:	learn: 0.2179176	total: 6.6s	remaining: 1m 27s
7:	learn: 0.1729789	total: 7.73s	remaining: 1m 28s
8:	learn: 0.1777298	total: 8.75s	remaining: 1m 28s
9:	learn: 0.1931285	total: 9.67s	remaining: 1m 27s
10:	learn: 0.2067692	total: 10.6s	remaining: 1m 25s
11:	learn: 0.1893567	total: 11.5s	remaining: 1m 24s
12:	learn: 0.1528850	total: 12.5s	remaining: 1m 23s
13:	learn: 0.2178397	total: 13.5s	remaining: 1m 22s
14:	learn: 0.1690291	total: 14.5s	remaining: 1m 22s
15:	learn: 0.2203209	total: 15.5s	remaining: 1m 21s
16:	learn: 0.1968001	total: 16.8s	remaining: 1m 22s
17:	learn: 0.1936069	total: 18.3s	remaining: 1m 23s
18:	learn: 0.1813236	total: 19.6s	remaining: 1m 23s

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       123
           1       0.00      0.00      0.00        54
           2       0.00      0.00      0.00       143
           3       0.00      0.00      0.00        60
           4       0.00      0.00      0.00        34
           5       0.04      0.01      0.02       304
           6       0.00      0.00      0.00       126
           7       0.00      0.00      0.00        10
           8       0.00      0.00      0.00       155
           9       0.00      0.00      0.00        19
          10       0.16      0.35      0.22       305
          11       0.25      0.07      0.11        45
          12       0.18      0.36      0.24       388
          13       0.00      0.00      0.00        99
          14       0.00      0.00      0.00        19
          15       0.00      0.00      0.00       144
          16       0.07      0.01      0.02       500
   

#### For Sell Side

In [13]:
# Train the classifier
catboost_classifier.fit(X_train_sell, y_train_sell)

# Make predictions on the testing set 
y_pred = catboost_classifier.predict(X_test_sell)

# Evaluate the model
accuracy = accuracy_score(y_test_sell, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', classification_report(y_test_sell, y_pred))

# Get class probabilities for calculating AUC
y_probs = catboost_classifier.predict_proba(X_test_sell)

auc_score = roc_auc_score(y_test_sell, y_probs, multi_class='ovr')  
print(f'AUC Score: {auc_score:.2f}')

Learning rate set to 0.5
0:	learn: 0.3340848	total: 1.74s	remaining: 2m 51s
1:	learn: 0.1145303	total: 3.3s	remaining: 2m 41s
2:	learn: 0.2968264	total: 4.92s	remaining: 2m 39s
3:	learn: 0.1940381	total: 6.51s	remaining: 2m 36s
4:	learn: 0.2181339	total: 8.25s	remaining: 2m 36s
5:	learn: 0.2109379	total: 10.1s	remaining: 2m 37s
6:	learn: 0.2653494	total: 11.7s	remaining: 2m 35s
7:	learn: 0.2162870	total: 13.3s	remaining: 2m 33s
8:	learn: 0.2099928	total: 14.9s	remaining: 2m 30s
9:	learn: 0.2731754	total: 16.5s	remaining: 2m 28s
10:	learn: 0.2207652	total: 18.3s	remaining: 2m 27s
11:	learn: 0.2533602	total: 20s	remaining: 2m 27s
12:	learn: 0.1787691	total: 21.7s	remaining: 2m 25s
13:	learn: 0.2521248	total: 23.4s	remaining: 2m 23s
14:	learn: 0.2454230	total: 25.1s	remaining: 2m 22s
15:	learn: 0.2252557	total: 26.9s	remaining: 2m 21s
16:	learn: 0.2366643	total: 28.5s	remaining: 2m 19s
17:	learn: 0.2661215	total: 30.1s	remaining: 2m 17s
18:	learn: 0.1988313	total: 31.8s	remaining: 2m 15s


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       108
           1       0.00      0.00      0.00       109
           2       0.00      0.00      0.00       167
           3       0.01      0.03      0.02        89
           4       0.00      0.00      0.00        99
           5       0.00      0.00      0.00       319
           6       0.00      0.00      0.00       181
           7       0.00      0.00      0.00        19
           8       0.00      0.00      0.00       129
           9       0.50      0.01      0.03        77
          10       0.03      0.15      0.05       378
          11       0.11      0.01      0.02       156
          12       0.01      0.01      0.01       344
          13       0.00      0.00      0.00        76
          14       0.00      0.00      0.00        20
          15       0.03      0.00      0.01       233
          16       0.10      0.09      0.09      1583
   

## LightGBM

#### For Buy Side

In [114]:
# Define the LightGBM classifier with handling categorical variables
lgb_classifier = lgb.LGBMClassifier(objective='multiclass',
                                    num_class=len(df['company_short_name']
                                                  .unique()),
                                    metric='multi_logloss')

# Train the classifier
lgb_classifier.fit(X_train_buy, y_train_buy)

# Make predictions on the testing set
y_pred = lgb_classifier.predict(X_test_buy)

# Evaluate the model
accuracy = accuracy_score(y_test_buy, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', classification_report(y_test_buy, y_pred))

# Get class probabilities for calculating AUC
y_probs = lgb_classifier.predict_proba(X_test_buy)

auc_score = roc_auc_score(y_test_buy, y_probs, multi_class='ovr')
print(f'AUC Score: {auc_score:.2f}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005762 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4240
[LightGBM] [Info] Number of data points in the train set: 89878, number of used features: 50
[LightGBM] [Info] Start training from score -5.181650
[LightGBM] [Info] Start training from score -5.876779
[LightGBM] [Info] Start training from score -5.163985
[LightGBM] [Info] Start training from score -5.868874
[LightGBM] [Info] Start training from score -6.602187
[LightGBM] [Info] Start training from score -4.363049
[LightGBM] [Info] Start training from score -5.301415
[LightGBM] [Info] Start training from score -7.088720
[LightGBM] [Info] Start training from score -4.857989
[LightGBM] [Info] Start training from score -7.049500
[LightGBM] [Info] Start training from score -4.223856
[LightGBM] [Info] Start training from score -6.264545

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AUC Score: 0.50


#### For Sell Side

In [115]:
# Train the classifier
lgb_classifier.fit(X_train_sell, y_train_sell)

# Make predictions on the testing set
y_pred = lgb_classifier.predict(X_test_sell)

# Evaluate the model
accuracy = accuracy_score(y_test_sell, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', classification_report(y_test_sell, y_pred))

# Get class probabilities for calculating AUC
y_probs = lgb_classifier.predict_proba(X_test_sell)

auc_score = roc_auc_score(y_test_sell, y_probs, multi_class='ovr')
print(f'AUC Score: {auc_score:.2f}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4203
[LightGBM] [Info] Number of data points in the train set: 161896, number of used features: 49
[LightGBM] [Info] Start training from score -5.835614
[LightGBM] [Info] Start training from score -5.961623
[LightGBM] [Info] Start training from score -5.318886
[LightGBM] [Info] Start training from score -6.100307
[LightGBM] [Info] Start training from score -5.942620
[LightGBM] [Info] Start training from score -4.842441
[LightGBM] [Info] Start training from score -5.388059
[LightGBM] [Info] Start training from score -7.704250
[LightGBM] [Info] Start training from score -5.748603
[LightGBM] [Info] Start training from score -6.183568
[LightGBM] [Info] Start training from score -4.677497
[LightGBM] [Info] Start training from score -5.51466

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       108
           1       0.00      0.00      0.00       109
           2       0.00      0.00      0.00       167
           3       0.00      0.00      0.00        89
           4       0.00      0.00      0.00        99
           5       0.00      0.00      0.00       319
           6       0.00      0.00      0.00       181
           7       0.00      0.00      0.00        19
           8       0.00      0.00      0.00       129
           9       0.00      0.00      0.00        77
          10       0.05      0.01      0.01       378
          11       0.02      0.01      0.01       156
          12       0.18      0.04      0.07       344
          13       0.00      0.00      0.00        76
          14       0.00      0.00      0.00        20
          15       0.00      0.00      0.00       233
          16       0.10      0.01      0.01      1583
   

15:67: W291 trailing whitespace


AUC Score: 0.50
