### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

### 1. Preprocessing the data

In [2]:
def preprocess_dataframe(df):
    """
    Clean a raw trades DataFrame and derive date-based features.

    The input frame is never modified; all work is done on a copy.

    Steps:
    1. Drops rows that have a null in any of the required columns.
    2. Removes 'M' from 'Frequency' and converts 'B_Price',
       'Total_Requested_Volume' and 'Frequency' to integers. Rows whose
       value cannot be parsed as a number are dropped (instead of making
       the integer cast fail). Fractional values are truncated by the cast.
    3. Keeps only rows whose 'B_Price' is at least 20.
    4. Converts 'Deal_Date', 'maturity', 'AssumedMaturity' and 'YTWDate' to
       datetime. Parsing of 'Deal_Date' is strict; the other three turn
       unparseable values into NaT.
    5. Adds 'Year_/Month_/Day_dealdate' and 'Year_/Month_/Day_maturity'.
    6. Keeps only rows whose 'maturity' year is 2021 or later (this also
       removes rows whose 'maturity' could not be parsed) and adds
       'Days_to_Maturity' = maturity - Deal_Date, in days.
    7. Fills null 'AssumedMaturity' values from the 'Maturity' column.
    8. Keeps only 'NATIXIS BUY' / 'NATIXIS SELL' rows and encodes 'B_Side'
       as 1 for 'NATIXIS BUY' and 0 for 'NATIXIS SELL'.
    9. Fills null 'Tier' values with 'UNKNOWN'.
    10. Lower-cases 'Sales_Name' and 'company_short_name'.
    11. Drops the unused 'Cusip' and 'Maturity' columns.

    Parameters:
    - df (DataFrame): Input DataFrame.

    Returns:
    - DataFrame: Processed DataFrame (original index labels are kept).

    Raises:
    - KeyError: if one of the columns used above is missing from `df`.
    """

    df = df.copy()

    # Rows with a null in any of these columns are discarded
    required_columns = [
        "MidYTM",
        "Coupon",
        "Ccy",
        "cusip",
        "maturity",
        "cdcissuerShortName",
        "Frequency",
        "MidPrice",
        "cdcissuer",
        "company_short_name",
        "BloomIndustrySubGroup",
        "B_Price",
        "Total_Traded_Volume_Natixis",
        "B_Side",
        "Total_Traded_Volume_Away",
        "Total_Requested_Volume",
        "Total_Traded_Volume",
        "Type",
        "Maturity",
        "ISIN",
        "Deal_Date",
    ]
    df = df.dropna(subset=required_columns)

    # Convert 'B_Price', 'Total_Requested_Volume', 'Frequency' to integers.
    # Coerce first, then drop the NaN produced by unparseable values, and only
    # then cast: casting a column that still holds NaN to int raises.
    df["Frequency"] = df["Frequency"].str.replace("M", "", regex=False)
    numerical_columns = ["B_Price", "Total_Requested_Volume", "Frequency"]
    for column in numerical_columns:
        df[column] = pd.to_numeric(df[column], errors="coerce")
    df = df.dropna(subset=numerical_columns).astype(
        {column: int for column in numerical_columns}
    )

    # Prices below 20 are treated as data errors and removed.
    # .copy() so the column assignments below act on an independent frame.
    df = df[df["B_Price"] >= 20].copy()

    # Convert 'Deal_Date', 'maturity', 'AssumedMaturity', 'YTWDate' to datetime
    df["Deal_Date"] = pd.to_datetime(df["Deal_Date"])
    df["maturity"] = pd.to_datetime(
        df["maturity"], errors="coerce", format="%Y-%m-%d %H:%M:%S.%f"
    )
    df["AssumedMaturity"] = pd.to_datetime(df["AssumedMaturity"], errors="coerce")
    df["YTWDate"] = pd.to_datetime(df["YTWDate"], errors="coerce")

    # Add year, month, day for clustering
    df["Year_dealdate"] = df["Deal_Date"].dt.year
    df["Month_dealdate"] = df["Deal_Date"].dt.month
    df["Day_dealdate"] = df["Deal_Date"].dt.day
    df["Year_maturity"] = df["maturity"].dt.year
    df["Month_maturity"] = df["maturity"].dt.month
    df["Day_maturity"] = df["maturity"].dt.day

    # Delete maturities earlier than 2021 (as deal dates start in 2021).
    # NaT maturities compare False here, so they are removed as well.
    df = df[df["maturity"].dt.year >= 2021].copy()

    # Compute number of days between maturity and deal date
    df["Days_to_Maturity"] = (df["maturity"] - df["Deal_Date"]).dt.days

    # Replace null values in 'AssumedMaturity' with values from 'Maturity'
    # NOTE(review): 'Maturity' is used as loaded (it is never parsed here);
    # confirm it holds datetime-compatible values.
    df["AssumedMaturity"] = df["AssumedMaturity"].fillna(df["Maturity"])

    # Convert 'B_Side' to 1 for 'NATIXIS BUY' and 0 for 'NATIXIS SELL'.
    # After the isin() filter every value has a mapping, so the int cast is safe.
    side_codes = {"NATIXIS BUY": 1, "NATIXIS SELL": 0}
    df = df[df["B_Side"].isin(list(side_codes))].copy()
    df["B_Side"] = df["B_Side"].map(side_codes).astype(int)

    # Convert null values of 'Tier' (assigned back rather than filled in place,
    # because an in-place fill on a column selection may not reach the frame)
    df["Tier"] = df["Tier"].fillna("UNKNOWN")

    # Lower string names
    df["Sales_Name"] = df["Sales_Name"].str.lower()
    df["company_short_name"] = df["company_short_name"].str.lower()

    # Drop unused columns
    columns_to_drop = ["Cusip", "Maturity"]
    df = df.drop(columns=columns_to_drop)

    return df

In [3]:
# Load the raw export and clean it with preprocess_dataframe
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# looks like a pandas warning raised while reading the file — confirm, and
# consider passing explicit dtypes if it is a mixed-type warning.
df = pd.read_csv("../data/data.csv")
df_preprocessed = preprocess_dataframe(df)

# Identifier column and label column that stay in the frame as plain text
input_column = ["ISIN"]
output_column = ["company_short_name"]

# Columns that are removed before modelling
columns_to_drop = [
    "Deal_Date", "cusip", "Sales_Name", "Sales_Initial",
    "cdcissuer", "maturity", "AssumedMaturity",
]
# Columns whose usefulness is undecided; removed for now as well
columns_unsure = [
    "Instrument", "YTWDate",
    "Month_dealdate", "Day_dealdate",
    "Month_maturity", "Day_maturity",
]
df_final = df_preprocessed.drop(columns=columns_to_drop + columns_unsure)

# Keep only fully populated rows
df_final = df_final.dropna()

# Bring every numerical feature to zero mean / unit variance
numerical_columns = [
    "B_Price", "Total_Requested_Volume",
    "Total_Traded_Volume_Natixis", "Total_Traded_Volume_Away", "Total_Traded_Volume",
    "Coupon", "MidPrice", "MidYTM",
    "SpreadvsBenchmarkMid", "MidASWSpread", "MidZSpread", "GSpreadMid",
    "MidModifiedDuration", "MidConvexity",
    "MidEffectiveDuration", "MidEffectiveConvexity",
    "Year_dealdate", "Year_maturity", "Days_to_Maturity",
]
scaler = StandardScaler()
df_final[numerical_columns] = scaler.fit_transform(df_final[numerical_columns])

# One-hot encode the categorical features (first level of each is dropped)
categorical_columns = [
    "B_Side",
    "BloomIndustrySector", "BloomIndustryGroup", "BloomIndustrySubGroup",
    "cdcissuerShortName", "Country", "lb_Platform_2",
    "Rating_Fitch", "Rating_Moodys", "Rating_SP",
    "Ccy", "Classification", "Tier", "Frequency", "Type",
]
df_final = pd.get_dummies(
    df_final,
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)

df_final

  df = pd.read_csv("../data/data.csv")


Unnamed: 0,ISIN,company_short_name,B_Price,Total_Requested_Volume,Total_Traded_Volume_Natixis,Total_Traded_Volume_Away,Total_Traded_Volume,Coupon,MidPrice,MidYTM,...,Classification_Technology,Classification_Telecommunications Services,Classification_Utilities,Tier_SECDOM,Tier_SNRFOR,Tier_SUBLT2,Tier_UNKNOWN,Frequency_6,Frequency_12,Type_Stepup
12,IT0005530032,societe generale mer,0.065155,-0.246467,-0.078542,-0.197600,-0.210594,1.549703,-0.116114,2.026412,...,0,0,0,0,0,0,1,1,0,0
13,IT0005530032,societe generale mer,0.065155,-0.247578,-0.078542,-0.199460,-0.212183,1.549703,-0.116114,2.026412,...,0,0,0,0,0,0,1,1,0,0
14,IT0005530032,societe generale mer,-0.015470,-0.246467,-0.078542,-0.197600,-0.210594,1.549703,-0.116114,2.026412,...,0,0,0,0,0,0,1,1,0,0
15,IT0005530032,societe generale mer,-0.015470,-0.247023,-0.078542,-0.198344,-0.211230,1.549703,-0.116114,2.026412,...,0,0,0,0,0,0,1,1,0,0
16,IT0005530032,societe generale mer,-0.015470,-0.245079,-0.078542,-0.195739,-0.209004,1.549703,-0.116114,2.026412,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635408,ES0000012K20,atradius credit insurance nv,-0.902348,-0.220087,-0.078542,-0.162254,-0.180392,-0.717426,-0.894547,-0.565436,...,0,0,0,0,0,0,1,0,1,0
635414,ES0413860802,bpifrance financement,-0.176721,-0.136780,-0.078542,-0.050634,-0.085017,-0.082629,-0.117670,0.082053,...,0,0,0,0,0,0,1,0,1,0
635415,IT0005439275,cegi ett,0.468281,-0.170103,-0.078542,-0.095281,-0.123167,-1.140623,0.445709,-0.145333,...,0,0,0,0,0,0,1,0,1,0
635421,XS1496770626,caisse d'epargne loire centre,0.468281,-0.220087,-0.078542,-0.162254,-0.180392,-1.140623,0.502002,-0.853967,...,0,0,0,0,0,0,1,0,1,0


In [5]:
# Summary statistics of the raw frame `df` as loaded from the CSV
# (preprocess_dataframe works on a copy, so `df` itself is still unprocessed)
df.describe()

Unnamed: 0,Total_Traded_Volume_Natixis,Total_Traded_Volume_Away,Total_Traded_Volume,Coupon,MidPrice,MidYTM,SpreadvsBenchmarkMid,MidASWSpread,MidZSpread,GSpreadMid,MidModifiedDuration,MidConvexity,MidEffectiveDuration,MidEffectiveConvexity
count,635453.0,635453.0,635453.0,626910.0,635371.0,622410.0,362144.0,567171.0,592718.0,301488.0,598190.0,584073.0,560337.0,560302.0
mean,68004.02,275647.8,343648.0,1.905299,92.77137,3.646613,147.673661,43.297103,43.061551,145.496902,5.780046,88.801467,4.933199,50298.51
std,1402310.0,1791809.0,2268750.0,1.71995,11.784319,4.737644,610.60169,293.562139,271.683859,650.289233,6.163062,221.165959,4.602549,2535840.0
min,0.0,0.0,0.0,0.0,4.875,-31.753,-3476.973633,-4529.927246,-3548.072021,-3467.842773,0.0,-0.21689,0.0,-3306.458
25%,0.0,0.0,0.0,0.375,89.829777,2.89528,54.344083,-31.361255,-36.074764,49.175598,1.65878,2.96784,1.52996,3.635608
50%,0.0,0.0,816.0,1.6,96.345001,3.496505,102.482319,16.031019,11.856192,95.713036,3.893305,16.70112,3.57883,16.82372
75%,0.0,31837.0,40816.0,3.125,99.158272,4.11823,180.736652,96.483887,95.730194,180.089951,7.864615,69.817802,7.04119,64.56211
max,408163300.0,146938800.0,408163300.0,12.0,161.298996,802.714783,79908.1875,170222.0625,79499.347656,79903.953125,55.889252,4440.126953,49.485229,275905300.0


In [6]:
# Write the scaled, one-hot-encoded frame to disk; index=False keeps the
# row index out of the CSV
df_final.to_csv("../data/data_preprocessed.csv", index=False)