# Credit score classification using logistic regression 

We are using a credit-related dataset from Kaggle (source: https://www.kaggle.com/datasets/parisrohan/credit-score-classification/data). The dataset consists of 27 input features and one target label, the credit score.

Goal: predict the credit score of new, unseen data using a logistic regression model.

In [15]:
import numpy as np
np.random.seed(42)
import pandas as pd
import tqdm
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.impute import KNNImputer
from numba import njit

np.seterr(all='raise')

{'divide': 'raise', 'over': 'raise', 'under': 'raise', 'invalid': 'raise'}

## Load and clean the data

In [2]:
def load_data(filename="../train_data/cs_train.csv"):
    """Read the credit-score CSV and separate the target from the features.

    Parameters
    ----------
    filename : str or path-like, optional
        Location of the CSV file. Defaults to the training file used by
        this notebook, so existing ``load_data()`` calls keep working.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except ``'Credit_Score'``.
    y : pandas.Series
        The ``'Credit_Score'`` column.

    Raises
    ------
    KeyError
        If the file has no ``'Credit_Score'`` column.
    """
    df = pd.read_csv(filename)

    ## Separate target from features
    X = df.drop(columns=['Credit_Score'])
    y = df['Credit_Score']

    return X, y

In [None]:
def split_random_sample(X, y, num_samples, compare_to_r_ref, random_state=1332):
    """Draw a random subset of rows and split it 50/50 into train and test sets.

    Random-row counterpart of ``slice_split``: instead of keeping the first
    ``num_samples`` rows, ``num_samples`` rows are drawn uniformly at random
    without replacement.

    Parameters
    ----------
    X, y : pandas objects exposing ``to_numpy()``
        Features and labels with the same number of rows.
    num_samples : int
        Total number of rows to keep (train + test). Values larger than the
        number of available rows are clamped to it.
    compare_to_r_ref : bool
        Accepted for signature parity with ``slice_split``; currently unused.
    random_state : int, optional
        Seed used both for drawing the rows and for the train/test split.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
    """
    X = X.to_numpy()
    y = y.to_numpy()

    ## Draw row indices uniformly at random, without replacement
    num_samples = min(num_samples, len(X))
    rng = np.random.default_rng(random_state)
    idx = rng.choice(len(X), size=num_samples, replace=False)

    ## Split into training and test set
    X_train, X_test, y_train, y_test = train_test_split(
        X[idx], y[idx], test_size=0.5, random_state=random_state
    )

    print(f"Train X shape is: {X_train.shape}")
    print(f"Train Y shape is: {y_train.shape}")
    print(f"Test X shape is: {X_test.shape}")
    print(f"Test Y shape is: {y_test.shape}")
    return X_train, X_test, y_train, y_test

In [26]:
def slice_split(X, y, num_samples, compare_to_r_ref):
    """Keep the first ``num_samples`` rows and split them evenly into train/test.

    ``X`` and ``y`` are converted with ``to_numpy()`` before slicing, so the
    four returned partitions are NumPy arrays. ``compare_to_r_ref`` is
    accepted but not used by this function.
    """
    features = X.to_numpy()[:num_samples]
    labels = y.to_numpy()[:num_samples]

    # A fixed random_state keeps the 50/50 partition reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.5,
        random_state=1332,
    )

    # Report the shape of every partition.
    print(f"Train X shape is: {X_train.shape}")
    print(f"Train Y shape is: {y_train.shape}")
    print(f"Test X shape is: {X_test.shape}")
    print(f"Test Y shape is: {y_test.shape}")

    return X_train, X_test, y_train, y_test

In [4]:
X, y = load_data()
X.head()

  df = pd.read_csv(filename)


Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,4.0,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,4.0,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,4.0,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,4.0,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,4.0,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177


We'll then perform data cleaning to filter out irrelevant features for our prediction. A little bit of feature engineering after might be necessary.

First of all, we need to look more closely at the 'Monthly_Balance' column, which holds more than one data type and therefore triggers a `DtypeWarning` when the CSV is loaded.

In [5]:
print(X['Monthly_Balance'].unique())

['312.49408867943663' '284.62916249607184' '331.2098628537912' ...
 516.8090832742814 319.1649785257098 393.6736955618808]


In [6]:
X['Monthly_Balance'] = pd.to_numeric(X['Monthly_Balance'], errors='coerce')
X.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,4.0,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.494089
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,4.0,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.629162
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,4.0,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.209863
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,4.0,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45131
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,4.0,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.489231


In [7]:
X.shape

(100000, 27)

'ID', 'Customer_ID', 'Name', 'SSN', 'Occupation', and 'Type_of_Loan' are, for now, **irrelevant** to the prediction.

In [8]:
## Drop irrelevant features
X = X.drop(columns=['ID', 'Customer_ID', 'Name', 'SSN', 'Occupation', 'Type_of_Loan'])
X.shape

(100000, 21)

Furthermore, there are features that hold string values, which obviously cannot be processed by the logistic regression model. We need to convert them into one-hot encoding vectors.

In [9]:
## One-hot encoding
X = pd.get_dummies(X, columns=['Month', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour'])
y = pd.get_dummies(y)
print(f"New shape of X: {X.shape}")
print(f"New shape of y: {y.shape}")

New shape of X: (100000, 39)
New shape of y: (100000, 3)


In [10]:
## Map 'Credit_History_Age' into decimals and NaN values
import re

# Replace 'NA' with NaN.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `X['Credit_History_Age']`: that form acts on
# an intermediate object, raises a pandas FutureWarning, and will no longer
# update `X` at all from pandas 3.0 onwards.
X['Credit_History_Age'] = X['Credit_History_Age'].replace('NA', pd.NA)

# Convert an "<N> Years and <M> Months" string into fractional years
def extract_years_months(value):
    """Parse a credit-history age string into a number of years.

    Missing values (anything ``pd.isna`` accepts) are returned unchanged.
    A string starting with ``"<N> Year(s) and <M> Month(s)"`` yields
    ``N + M / 12``; any other string yields ``None``.
    """
    if pd.isna(value):
        return value

    parsed = re.match(r'(\d+)\s+Years?\s+and\s+(\d+)\s+Months?', value)
    if parsed is None:
        return None

    years, months = (int(part) for part in parsed.groups())
    return years + months / 12

# Apply the function to the column
X['Credit_History_Age'] = X['Credit_History_Age'].apply(extract_years_months)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['Credit_History_Age'].replace('NA', pd.NA, inplace=True)


In [11]:
X['Credit_History_Age'].head()

0    22.083333
1          NaN
2    22.250000
3    22.333333
4    22.416667
Name: Credit_History_Age, dtype: float64

Now, we need to address NaN values as part of the data cleaning process.

In [12]:
## Make sure all values are numerical
X_num = X.apply(pd.to_numeric, errors='coerce')

In [13]:
X_num.isna().sum()

Age                                                    4939
Annual_Income                                          6980
Monthly_Inhand_Salary                                 15002
Num_Bank_Accounts                                         0
Num_Credit_Card                                           0
Interest_Rate                                             0
Num_of_Loan                                            4785
Delay_from_due_date                                       0
Num_of_Delayed_Payment                                 9746
Changed_Credit_Limit                                   2091
Num_Credit_Inquiries                                   1965
Outstanding_Debt                                       1009
Credit_Utilization_Ratio                                  0
Credit_History_Age                                     9030
Total_EMI_per_month                                       0
Amount_invested_monthly                                8784
Monthly_Balance                         

In [16]:
## Fill the missing values in each column
## (simpler alternative kept for reference: per-column mean imputation)
# X_num = X_num.fillna(X_num.mean())

# Initialize the KNN imputer (NaN marks a missing entry; 5 neighbours are used per imputation)
imputer = KNNImputer(missing_values=np.nan, n_neighbors=5)

# Fit and transform the data; the wall-clock time is measured because this
# step is slow on the full frame (see the timing printed below)
start_time = time.time()
X_imputed = imputer.fit_transform(X_num)
end_time = time.time()

print(f"KNNImputer fitting time: {end_time - start_time} seconds")

# Convert back to DataFrame, re-attaching the original column names
X_num = pd.DataFrame(X_imputed, columns=X_num.columns)
# Drop the intermediate array now that X_num holds the imputed values
del X_imputed

KNNImputer fitting time: 719.2777955532074 seconds


In [17]:
X_num.isna().sum()

Age                                                   0
Annual_Income                                         0
Monthly_Inhand_Salary                                 0
Num_Bank_Accounts                                     0
Num_Credit_Card                                       0
Interest_Rate                                         0
Num_of_Loan                                           0
Delay_from_due_date                                   0
Num_of_Delayed_Payment                                0
Changed_Credit_Limit                                  0
Num_Credit_Inquiries                                  0
Outstanding_Debt                                      0
Credit_Utilization_Ratio                              0
Credit_History_Age                                    0
Total_EMI_per_month                                   0
Amount_invested_monthly                               0
Monthly_Balance                                       0
Month_April                                     

## Feature engineering

Feature engineering lets us create new features by combining existing ones. These new features help the model to see the relationship between features and can improve the model's performance.

In [18]:
## Interaction features
X_num['Debt_to_Income_Ratio'] = X_num['Outstanding_Debt'] / X_num['Annual_Income']
X_num['Loan_to_Income_Ratio'] = X_num['Num_of_Loan'] / X_num['Annual_Income']
X_num['Investment_Ratio'] = X_num['Amount_invested_monthly'] / X_num['Monthly_Inhand_Salary']
X_num['Credit_Card_Utilization'] = X_num['Credit_Utilization_Ratio'] * X_num['Num_Credit_Card']

## Aggregated features
X_num['Total_Debt'] = X_num['Outstanding_Debt'] + X_num['Total_EMI_per_month']

## One-hot encoding for seasons
## --> Winter
X_num['Winter'] = X_num['Month_January'] + X_num['Month_February']
## --> Spring
X_num['Spring'] = X_num['Month_March'] + X_num['Month_April'] + X_num['Month_May']
## --> Summer
X_num['Summer'] = X_num['Month_June'] + X_num['Month_July'] + X_num['Month_August']

We also need to normalize some features so that all of them fall within a comparable range of values (feature scaling). We will use z-score normalization, which subtracts the mean from each feature and divides the result by the feature's standard deviation.

First, we will reduce the impact of **outliers** (i.e., skewed, and possibly incorrect,
values in columns that might distort z-score normalization results) by applying a natural log transformation (`log1p`, i.e. $\ln(1 + x)$).

In [19]:
columns_log = [
    'Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
    'Num_Credit_Card',
    'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
    'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt',
    'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
    'Amount_invested_monthly', 'Monthly_Balance',
    'Debt_to_Income_Ratio', 'Loan_to_Income_Ratio', 'Investment_Ratio', 
    'Credit_Card_Utilization', 'Total_Debt'
]

## Prevent FloatingPointError: every entry that is not >= 0 (negative or NaN)
## is replaced by 0.01 before the log is taken.
## `where` is the vectorized equivalent of mapping `x if x >= 0 else 0.01`
## over every cell, without calling a Python lambda per element.
X_num[columns_log] = X_num[columns_log].where(X_num[columns_log] >= 0, 0.01)

## Apply log transformation to necessary columns
X_num[columns_log] = np.log1p(X_num[columns_log])

We can then proceed for normalization.

In [20]:
## We want to normalize all columns except one-hot encoding vectors
columns_norm = [
    'Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 
    'Num_Credit_Card',
    'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment',
    'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt',
    'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
    'Amount_invested_monthly', 'Monthly_Balance',
    'Debt_to_Income_Ratio', 'Loan_to_Income_Ratio', 'Investment_Ratio', 
    'Credit_Card_Utilization', 'Total_Debt'
]

## Initialize the scaler
scaler = StandardScaler()

## Fit and transform the selected columns
X_num[columns_norm] = scaler.fit_transform(X_num[columns_norm])

In [21]:
X_num['Debt_to_Income_Ratio'].head()

0   -0.183463
1   -0.183463
2   -0.183463
3   -0.183463
4   -0.183463
Name: Debt_to_Income_Ratio, dtype: float64

In [22]:
X_num['Num_Credit_Card'].head()

0   -0.409512
1   -0.409512
2   -0.409512
3   -0.409512
4   -0.409512
Name: Num_Credit_Card, dtype: float64

## Risk minimization

Binary logistic regression cannot be trained directly on a target with three classes (here encoded as three one-hot columns), so we first need to reduce the label to a single binary target.

In [23]:
y.head()

Unnamed: 0,Good,Poor,Standard
0,True,False,False
1,True,False,False
2,True,False,False
3,True,False,False
4,True,False,False


Let's assume that the global finance company we're working with is not looking for more market expansion and is trying to minimize loan risk. Therefore, we want to label parties with **Good** credit scores as being eligible for loans, while the rest are not.

In [24]:
## Create a new binary label
y['Target'] = y['Good']

## Drop the original one-hot encoded columns
y = y.drop(columns=['Good', 'Standard', 'Poor'])

In [25]:
y.shape

(100000, 1)

## Preparing data for training

In [86]:
## NUM_SAMPLES is the combination of all sets (NUM_TRAINING + NUM_TEST)
NUM_SAMPLES = 8192
COMPARE_TO_R_REF = False
lr = 0.001
mu = 0.5

## Slice data into NumPy arrays and split into training and test sets
X_train, X_test, y_train, y_test = slice_split(
    X_num, y,
    num_samples=NUM_SAMPLES,
    compare_to_r_ref=COMPARE_TO_R_REF
)

## n = num_of_features
n = X_train.shape[1]

# Same shape as Marcelo's reference code
betas = np.zeros((n, ))
# betas = np.random.randn(n)

Train X shape is: (4096, 47)
Train Y shape is: (4096, 1)
Test X shape is: (4096, 47)
Test Y shape is: (4096, 1)


In [87]:
n

47

## Applying Principal Component Analysis

Principal Component Analysis (PCA) produces a reduced dataset: a small number of new features (called the principal components) that together capture most of the variance of the original features.

How well these new features explain the original ones is measured by the explained variance ratio.

In [88]:
from sklearn.decomposition import PCA

def apply_pca(X_train, X_test, n_components):
    """Fit PCA on the training features and project both splits onto it.

    The projection is fitted on ``X_train`` only and then applied to
    ``X_test``. The explained variance ratios and the resulting number of
    components are printed. Returns ``(X_train_pca, X_test_pca)``.
    """
    pca = PCA(n_components=n_components)

    # Left-to-right evaluation: fit on the training split first, then reuse
    # the fitted projection for the test split.
    X_train_pca, X_test_pca = pca.fit_transform(X_train), pca.transform(X_test)

    print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
    print(f"Number of components after PCA: {X_train_pca.shape[1]}")

    return X_train_pca, X_test_pca

In [89]:
X_train, X_test = apply_pca(X_train, X_test, n_components=32)

## n = num_of_features
n = X_train.shape[1]
print(f"n = {n}")

# Same shape as Marcelo's reference code
betas = np.zeros((n, ))

Explained variance ratio: [0.21939469 0.0915127  0.08018888 0.07458434 0.05153002 0.04331276
 0.04107244 0.03735378 0.03498121 0.03048053 0.0273365  0.02678491
 0.02415991 0.02111715 0.02027564 0.01919968 0.01663116 0.01569339
 0.01512239 0.01095883 0.00982277 0.00861407 0.00802758 0.00687137
 0.00639006 0.00629253 0.00549047 0.00502738 0.00495027 0.00487403
 0.004825   0.00474715]
Number of components after PCA: 32
n = 32


In [90]:
print(f"Train X shape is: {X_train.shape}")
print(f"Train Y shape is: {y_train.shape}")
print(f"Test X shape is: {X_test.shape}")
print(f"Test Y shape is: {y_test.shape}")

Train X shape is: (4096, 32)
Train Y shape is: (4096, 1)
Test X shape is: (4096, 32)
Test Y shape is: (4096, 1)


## Nesterov model training

In [91]:
@njit
def sigmoid(z):
    """Logistic function: return ``1 / (1 + exp(-z))`` for the input ``z``."""
    return 1 / (1 + np.exp(-z))

@njit
def fwd(train_x, betas, dbg=False):
    """Forward pass of the logistic model.

    Computes the logits ``train_x @ betas``, optionally prints them when
    ``dbg`` is true, and returns their sigmoid with one trailing axis of
    length 1 appended by ``np.expand_dims(..., -1)``.
    """
    preds = train_x @ betas   # A vector of linear_predictions/logits z = train_x @ weights
    if dbg:
        print(f"Logits: {preds}")
    return np.expand_dims(sigmoid(preds), -1)   # Shape: (m, 1) when the logits are a length-m vector

@njit
def calculate_gradient(train_x, train_y, betas, fwd, dbg):
    """Gradient of the mean logistic loss with respect to ``betas``.

    ``fwd`` is the forward-pass function; it is called as
    ``fwd(train_x, betas, dbg)`` to obtain the predictions. The result is
    ``-train_x.T @ (train_y - preds) / len(train_y)``.
    """
    preds = fwd(train_x, betas, dbg)   # A vector of logistic predictions y_hat = sigmoid(z)
    gradient = -train_x.T @ (train_y - preds) / len(train_y)
    return gradient   # One row per feature (column of train_x); presumably (n_features, 1) when train_y is (m, 1)
    ## This value is used for a descent step on the weights: w_new = w_old - lr * gradient

def cost(x, y, theta):
    """Mean binary cross-entropy of ``sigmoid(x @ theta)`` against labels ``y``.

    Computes ``-(y.T @ log(h) + (1 - y.T) @ log(1 - h)) / m`` with
    ``h = sigmoid(x @ theta)`` and ``m = x.shape[0]``, and returns element 0
    of the result.
    """
    n_rows = x.shape[0]
    floor = 0.000000000000001   # lower bound for probabilities so log(0) is never taken

    prob = sigmoid(np.matmul(x, theta))   # model probabilities (y_hat)
    prob_complement = 1 - prob

    # The upper clip bound is the array's own maximum, so only the lower
    # bound actually changes any value.
    log_prob = np.log(np.clip(prob, floor, np.max(prob)))
    log_complement = np.log(np.clip(prob_complement, floor, np.max(prob_complement)))

    positive_term = np.matmul(-y.T, log_prob)
    negative_term = np.matmul(1 - y.T, log_complement)

    return ((positive_term - negative_term) / n_rows)[0]

def nesterov(betas, epochs, patience, lr, mu, train_x, train_y):
    """Fit logistic-regression weights with Nesterov-accelerated gradient descent.

    Parameters
    ----------
    betas : array-like
        Initial weights. They are deep-copied, so the caller's object is
        not modified.
    epochs : int
        Maximum number of update steps.
    patience : int
        If > 0, stop once the loss has failed to improve for this many
        consecutive epochs. If <= 0, early stopping is disabled and all
        ``epochs`` steps are run.
    lr : float
        Learning rate of the gradient step.
    mu : float
        Momentum coefficient of the look-ahead step.
    train_x, train_y : numpy.ndarray
        Training features and labels, passed to ``calculate_gradient`` and
        ``cost``.

    Returns
    -------
    losses : list
        Loss recorded per epoch, truncated after the best epoch.
    best_theta : numpy.ndarray
        Look-ahead weights at the best epoch (the final weights when early
        stopping is disabled).
    phi : numpy.ndarray
        Weights after the last plain gradient step.
    best_epochs_list : list of int
        Epoch indices at which the loss improved (stays empty when early
        stopping is disabled).
    """
    import copy

    phi = copy.deepcopy(betas)
    theta = copy.deepcopy(betas)

    check_early = 0
    best_loss = float('inf')
    best_epochs_list = []

    ## Defined up front so both return statements are valid even when `epochs`
    ## is 0 or when no epoch ever improves on `best_loss` (e.g. a NaN loss);
    ## previously those cases raised UnboundLocalError.
    best_epoch = -1
    best_theta = theta

    nesterov_loss = [0 for _ in range(epochs)]
    for i in tqdm.trange(epochs):
        gradient = calculate_gradient(train_x, train_y, theta, fwd, dbg=False)

        ## Plain gradient step from the look-ahead point theta.
        ## np.squeeze() removes single dimensions so the shapes match theta.
        phi_prime = theta - lr * np.squeeze(gradient)

        ## Nesterov acceleration process
        if i == 0:
            theta = phi_prime
        else:
            ## Extrapolate past phi_prime in the direction of the last move
            ## (phi_prime - phi), scaled by the momentum coefficient mu.
            theta = phi_prime + mu * (phi_prime - phi)
        phi = phi_prime   # phi is then the weight of the previous epoch/update
        loss = cost(train_x, train_y, theta)   # loss at the look-ahead point

        ## Early stopping
        if patience > 0:
            if loss < best_loss:
                best_loss = loss
                best_epoch = i
                best_theta = theta
                best_epochs_list.append(best_epoch)
                check_early = 0
            else:
                check_early += 1
                if check_early >= patience:
                    print(f"Early stopping at {patience} epochs after {best_epoch} with best loss {best_loss}")
                    return nesterov_loss[:best_epoch + 1], best_theta, phi, best_epochs_list
        else:
            best_epoch = epochs - 1
            best_theta = theta

        ## Update list
        nesterov_loss[i] = loss

    return nesterov_loss[:best_epoch + 1], best_theta, phi, best_epochs_list


In [92]:
# Ensure inputs are of the correct type
betas = np.array(betas, dtype=np.float64)
X_train = np.array(X_train, dtype=np.float64)
y_train = np.array(y_train, dtype=np.float64)
X_test = np.array(X_test, dtype=np.float64)
y_test = np.array(y_test, dtype=np.float64)
lr = float(lr)
mu = float(mu)

In [93]:
losses, theta, phi, best_epochs_list = nesterov(betas, 800, -1, lr, mu, X_train, y_train)

100%|██████████| 800/800 [00:02<00:00, 288.13it/s]


In [94]:
for j in range(len(losses)):
    print(f"{j}: {losses[j]}", end='\n')
# Ideal Gradient (800 epochs) #
## seed(42), lr = 0.001, mu = 0.3 --> loss = 0.5433165928958901
## seed(42), lr = 0.001, mu = 0.5 --> loss = 0.5139704562064134

# Oscillating Gradient (800 epochs) #
## seed(42), lr = 0.003, mu = 0.5 --> loss = 0.49509424569059735
## seed(42), lr = 0.003, mu = 0.6 --> loss = 0.4102733269497619

# Early stopping (patience = 50) #
## seed(42), lr = 0.003, mu = 0.5 --> 964: 0.410228534339208
## seed(42), lr = 0.003, mu = 0.6 --> 422: 0.46334269494781627

# Ideal Gradient (800 epochs) with Feature Engineering #
## seed(42), lr = 0.001, mu = 0.3 --> loss = 0.523951850559082
## seed(42), lr = 0.001, mu = 0.5 --> loss = 0.4922252143522289

# Early stopping (patience = 50) with Feature Engineering #
## seed(42), lr = 0.003, mu = 0.5 --> 739: 0.41164361016071044
## seed(42), lr = 0.003, mu = 0.6 --> 314: 0.4688941580652026

0: 0.693003682604868
1: 0.6927890738578446
2: 0.692539657072174
3: 0.6922735958810877
4: 0.6919999660780027
5: 0.6917232979658036
6: 0.6914458502201825
7: 0.6911687467462324
8: 0.6908925447138424
9: 0.6906175182900889
10: 0.6903438003406072
11: 0.690071453192347
12: 0.6898005039696082
13: 0.6895309622390359
14: 0.6892628288205002
15: 0.6889961001866802
16: 0.6887307706598937
17: 0.6884668335089974
18: 0.6882042814970458
19: 0.6879431071546931
20: 0.6876833029166447
21: 0.6874248611897253
22: 0.6871677743867979
23: 0.6869120349436328
24: 0.6866576353272601
25: 0.686404568040071
26: 0.6861528256217955
27: 0.6859024006504187
28: 0.6856532857425672
29: 0.6854054735536306
30: 0.685158956777751
31: 0.6849137281477451
32: 0.6846697804349935
33: 0.6844271064493125
34: 0.6841856990388175
35: 0.6839455510897812
36: 0.683706655526491
37: 0.6834690053111021
38: 0.6832325934434923
39: 0.6829974129611145
40: 0.6827634569388492
41: 0.6825307184888574
42: 0.6822991907604306
43: 0.6820688669398435
44: 

In [95]:
for j in range(len(best_epochs_list)):
    print(f"{j}: {best_epochs_list[j]}", end='\n')

In [96]:
theta

array([-0.24007112, -0.01535225, -0.02042874, -0.00850244, -0.10020641,
        0.01383243,  0.00412154,  0.00176149, -0.00310306,  0.01833061,
        0.01326636, -0.01421149, -0.01135224, -0.02777846, -0.02074188,
       -0.00917358,  0.0174313 , -0.00543941, -0.01252769, -0.06180679,
        0.01662555, -0.0187003 , -0.00613393, -0.00466333, -0.01164758,
        0.00279842, -0.00797512,  0.00346959,  0.00086546, -0.00201151,
       -0.00347156,  0.00446255])

## Model performance on training set

We can test the performance of the model through its confusion matrix, F1 score, and accuracy score.

First, we predict on the training set.

In [97]:
## On training data
pred = fwd(X_train, theta, dbg=False)

## Decision (Threshold = 0.5)
y_train_hat = (pred >= 0.5).astype(int)

In [98]:
y_train_hat.shape

(4096, 1)

In [99]:
y_train.shape

(4096, 1)

In [100]:
print(confusion_matrix(y_train, y_train_hat))

[[2006 1357]
 [  69  664]]


In [101]:
print(f1_score(y_train, y_train_hat))

0.4822076978939724


In [102]:
print(precision_score(y_train, y_train_hat))

0.3285502226620485


In [103]:
print(recall_score(y_train, y_train_hat))

0.9058663028649386


In [104]:
print(roc_auc_score(y_train, pred))

0.8402854431845795


In [105]:
print(accuracy_score(y_train, y_train_hat))

0.65185546875


## Model performance on test set

Now, we predict on the test set.

In [106]:
## On test data

pred = fwd(X_test, theta, dbg=False)

## Decision (Threshold = 0.5)
y_test_hat = (pred >= 0.5).astype(int)

In [107]:
y_test_hat.shape

(4096, 1)

In [108]:
y_test.shape

(4096, 1)

In [109]:
print(confusion_matrix(y_test, y_test_hat))

[[1954 1359]
 [  59  724]]


In [110]:
print(f1_score(y_test, y_test_hat))

0.5052337752965806


In [111]:
print(roc_auc_score(y_test, pred))

0.8458358438582636


In [112]:
print(accuracy_score(y_test, y_test_hat))

0.65380859375


## Write to a CSV file

In [113]:
# Convert the NumPy arrays to DataFrames
X_train_csv = pd.DataFrame(X_train)
y_train_csv = pd.DataFrame(y_train)
X_test_csv = pd.DataFrame(X_test)
y_test_csv = pd.DataFrame(y_test)

# Save the DataFrames to CSV files. The row count in each file name is derived
# from the data instead of being hard-coded, so the names stay correct when
# NUM_SAMPLES changes (with 4096 rows per split the names are unchanged).
n_train_rows = len(X_train_csv)
n_test_rows = len(X_test_csv)
X_train_csv.to_csv(f"../cscore_data/X_train_{n_train_rows}.csv", index=False)
y_train_csv.to_csv(f"../cscore_data/y_train_{n_train_rows}.csv", index=False)
X_test_csv.to_csv(f"../cscore_data/X_test_{n_test_rows}.csv", index=False)
y_test_csv.to_csv(f"../cscore_data/y_test_{n_test_rows}.csv", index=False)

In [None]:
del X

In [117]:
X_train_csv[X_train_csv.columns[:]].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,2.198939,-1.386113,-1.754133,0.066202,2.772689,0.524358,-0.752849,-0.078448,-0.641564,5.230106,...,-0.212204,0.613856,-0.068885,0.024195,-0.083641,-0.189059,-0.526651,-0.084249,-0.372196,-0.218408
1,-3.582945,-3.104108,1.625395,0.851905,0.408469,-0.36636,-0.829054,0.219654,0.002127,-1.248474,...,-0.019534,-0.078361,-0.034219,0.874848,-0.126403,-0.466379,0.080115,-0.369146,0.422136,-0.028935
2,-0.931388,-0.637706,-0.867205,0.723298,0.263651,0.146046,-0.278242,-0.426526,0.710626,0.850852,...,-0.524058,0.492138,0.114483,-0.232441,0.448486,0.740165,-0.191677,-0.103786,0.087587,-0.055254
3,0.480324,1.332575,-2.139174,0.902649,-2.605806,0.655541,0.107937,-0.094884,0.528272,1.650635,...,-0.309875,0.050581,0.813386,-0.197001,-0.360753,-0.360105,0.286082,0.646569,-0.54125,0.096434
4,-1.529067,-0.186602,0.864413,-0.62324,1.343525,-1.313599,1.146676,0.715594,-1.318801,0.361061,...,-0.588974,0.030033,1.137614,0.439302,-0.34279,-0.2165,-0.514616,0.00748,-0.525417,-0.090916
