In [114]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from typing import List
from typing import Tuple

In [98]:
df = pd.read_csv('Credit_Card.csv')
label_df = pd.read_csv('Credit_card_label.csv')
df.head()

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2
3,5009749,F,Y,N,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2


In [99]:
df.columns

Index(['Ind_ID', 'GENDER', 'Car_Owner', 'Propert_Owner', 'CHILDREN',
       'Annual_income', 'Type_Income', 'EDUCATION', 'Marital_status',
       'Housing_type', 'Birthday_count', 'Employed_days', 'Mobile_phone',
       'Work_Phone', 'Phone', 'EMAIL_ID', 'Type_Occupation', 'Family_Members'],
      dtype='object')

In [100]:
df.dtypes

Ind_ID               int64
GENDER              object
Car_Owner           object
Propert_Owner       object
CHILDREN             int64
Annual_income      float64
Type_Income         object
EDUCATION           object
Marital_status      object
Housing_type        object
Birthday_count     float64
Employed_days        int64
Mobile_phone         int64
Work_Phone           int64
Phone                int64
EMAIL_ID             int64
Type_Occupation     object
Family_Members       int64
dtype: object

In [101]:
merged_df = pd.merge(df, label_df, on='Ind_ID')

In [102]:
merged_df.dtypes

Ind_ID               int64
GENDER              object
Car_Owner           object
Propert_Owner       object
CHILDREN             int64
Annual_income      float64
Type_Income         object
EDUCATION           object
Marital_status      object
Housing_type        object
Birthday_count     float64
Employed_days        int64
Mobile_phone         int64
Work_Phone           int64
Phone                int64
EMAIL_ID             int64
Type_Occupation     object
Family_Members       int64
label                int64
dtype: object

# **1. (5 pts) Clean your dataset to turn categorical values into numerical ones. One-hot encoding is likely the answer, but it depends on the dataset. Your data may have ordinal columns, for example where one-hot encoding is not as appropriate.**

In [112]:
def clean_and_encode_data(merged_df: pd.DataFrame, 
                          numerical_cols: List[str] = ['Annual_income', 'Birthday_count'], 
                          categorical_cols: List[str] = ['GENDER', 'Car_Owner', 'Propert_Owner', 'Type_Income', 
                                                         'EDUCATION', 'Marital_status', 'Housing_type', 'Type_Occupation'], 
                          occupation_col: str = 'Type_Occupation') -> pd.DataFrame:
    for col in numerical_cols:
        df[col] = df[col].fillna(df[col].median())
    
    merged_df.dropna(subset=[occupation_col], inplace=True)
    
    merged_df['GENDER'] = merged_df['GENDER'].fillna(merged_df['GENDER'].mode()[0])
    
    df_encoded = pd.get_dummies(merged_df, columns=categorical_cols, drop_first=True)

    df_encoded = df_encoded.replace([np.inf, -np.inf], np.nan).fillna(0)

    df_encoded = df_encoded.astype(int)
    
    return df_encoded
df_encoded = clean_and_encode_data(merged_df)
df_encoded


Unnamed: 0,Ind_ID,CHILDREN,Annual_income,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Family_Members,...,Type_Occupation_Laborers,Type_Occupation_Low-skill Laborers,Type_Occupation_Managers,Type_Occupation_Medicine staff,Type_Occupation_Private service staff,Type_Occupation_Realty agents,Type_Occupation_Sales staff,Type_Occupation_Secretaries,Type_Occupation_Security staff,Type_Occupation_Waiters/barmen staff
8,5010864,1,450000,-18173,-678,1,0,1,1,3,...,0,0,0,0,0,0,0,0,0,0
9,5010868,1,450000,-18173,-678,1,0,1,1,3,...,0,0,0,0,0,0,0,0,0,0
10,5010869,1,450000,-18173,-678,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
11,5018498,0,90000,-18950,-1002,1,1,1,0,2,...,0,0,0,0,0,0,0,0,0,0
12,5018501,0,0,-18950,-1002,1,1,1,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,5118268,1,360000,-11294,-3536,1,0,1,0,3,...,0,0,0,0,0,0,0,0,0,0
1543,5028645,0,0,-11957,-2182,1,0,0,0,2,...,0,0,1,0,0,0,0,0,0,0
1544,5023655,0,225000,-10229,-1209,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1545,5115992,2,180000,-13174,-2477,1,0,0,0,4,...,0,0,1,0,0,0,0,0,0,0


# **2. (3 pts) Perform univariate linear regression on the dataset. Select your variable to predict. How well did this model perform? Is this a good approach for this dataset? Why or why not?**

In [113]:
def credit_card_approval_model(df_encoded: pd.DataFrame, 
                               feature_cols: List[str], 
                               target_col: str) -> float:

    merged_df = pd.merge(df, label_df, on='Ind_ID')

    X = df_encoded[feature_cols]
    y = df_encoded[target_col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)

    y_pred = lin_reg.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return mse, r2

feature_cols = ['Annual_income']
target_col = 'label'

mse, r2 = credit_card_approval_model(df_encoded, feature_cols, target_col)
print(f'MSE: {mse}, R2: {r2}')

MSE: 0.11819873066694385, R2: -0.010801559404991723


In [105]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0])

My model is not explaining any meaningful variance in the data and is actually performing worse than predicting the mean of the target variable due to the r squared value being less than 0. However my MSE is realatively low which mean it does good job predicting close to actual target values. Overall this model is bad espcially since linear regression is better suited for continous output and not a classification task where output is binary. 

# **3. (8 pts) Perform KNN on this dataset. As part of this, write a function that selects the optimal value of k. How well did this model perform?**

In [115]:
def find_optimal_k(df_encoded: pd.DataFrame, 
                   target_col: str, 
                   k_range: Tuple[int, int] = (1, 21), 
                   test_size: float = 0.2, 
                   random_state: int = 42) -> float:

    X = df_encoded.drop(columns=[target_col])
    y = df_encoded[target_col]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    best_k = 0
    best_score = 0
    for k in k_range:
        knn = KNeighborsClassifier(n_neighbors=((2*k)+1))
        knn.fit(X_train, y_train)
        score = knn.score(X_test, y_test)
        if score > best_score:
            best_k = k
            best_score = score

    knn = KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(X_train, y_train)
    
    y_pred = knn.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    return best_k, test_accuracy, conf_matrix, class_report

best_k, test_accuracy, conf_matrix, class_report = find_optimal_k(df_encoded, target_col='label')

print(f"Optimal k: {best_k}")
print(f"Test Accuracy: {test_accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Optimal k: 1
Test Accuracy: 0.8490566037735849
Confusion Matrix:
[[168  13]
 [ 19  12]]
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       181
           1       0.48      0.39      0.43        31

    accuracy                           0.85       212
   macro avg       0.69      0.66      0.67       212
weighted avg       0.84      0.85      0.84       212



The overall accuracy is relatively high, but accuracy alone doesn't fully reflect the model's ability to handle class imbalances.
The model predicts Class 0 well (168 correct), but struggles with Class 1, misclassifying 19 out of 31 instances.

**Class 0 (Negative Class)**
Precision: 90% of predicted Class 0 are correct.
Recall: 93% of actual Class 0 are correctly identified.

**Class 1 (Positive Class)**
Precision: Only 48% of predicted Class 1 are correct.
Recall: The model captures only 39% of actual Class 1 instances.

**F1-score**
Class 0: Strong balance between precision and recall.
Class 1: Weak performance in predicting Class 1.

**Macro avg F1-score: 0.67**
Reflects poor performance for the minority class (Class 1).

**Weighted avg F1-score: 0.84**
Driven by the high accuracy of the dominant class (Class 0).

# **4. (6 pts) Work with your dataset to perform logistic regression. How well did this perform?**

In [116]:
def train_logistic_regression(df_encoded: pd.DataFrame) -> float:
    X = df_encoded.drop(columns=['label', 'Ind_ID'])
    y = df_encoded['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    log_reg = LogisticRegression(max_iter=1000)
    log_reg.fit(X_train, y_train)

    y_pred = log_reg.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return accuracy, report
print(accuracy)
print(report)

0.6037735849056604
              precision    recall  f1-score   support

           0       0.61      0.83      0.71       121
           1       0.57      0.30      0.39        91

    accuracy                           0.60       212
   macro avg       0.59      0.57      0.55       212
weighted avg       0.60      0.60      0.57       212



The overall accuracy of 60.4% indicates a moderate performance.

**Class 0 (Negative Class)**
Precision: 61% of the predictions for Class 0 are correct.
Recall:The model correctly identifies 83% of the actual Class 0 instances.
F1-score:A balanced score for precision and recall in Class 0, showing decent performance.

**Class 1 (Positive Class)**
Precision: 57% of the predictions for Class 1 are correct.
Recall:  The model only captures 30% of the actual Class 1 instances, indicating poor performance in identifying the minority class.
F1-score: Low, indicating that the model struggles significantly with the minority class.

**Averages:**
Macro Average F1-score: 0.55
Reflects an overall weak performance across both classes, particularly due to poor Class 1 recall.
Weighted Average F1-score: 0.57
Similar to the accuracy score, showing that performance is driven by the dominant class.

# **5. (3 pts) Perform normalization on your dataset. Does it change the performance for 2-4? What is the best measure of performance for your dataset (accuracy or something else) and why?**

In [119]:
#Normalized question 2
def credit_card_approval_model(df_encoded: pd.DataFrame, 
                               feature_cols: List[str], 
                               target_col: str) -> float:

    merged_df = pd.merge(df, label_df, on='Ind_ID')

    X = df_encoded[feature_cols]
    y = df_encoded[target_col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    scaler = StandardScaler()
    X_train_sc = scaler.fit_transform(X_train)
    X_test_sc = scaler.transform(X_test)

    lin_reg = LinearRegression()
    lin_reg.fit(X_train_sc, y_train)

    y_pred = lin_reg.predict(X_test_sc)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return mse, r2

feature_cols = ['Annual_income']
target_col = 'label'

mse, r2 = credit_card_approval_model(df_encoded, feature_cols, target_col)
print(f'MSE: {mse}, R2: {r2}')

MSE: 0.11819873066694385, R2: -0.010801559404991723


For question 2 normalizing the dataset didn't change the outcome

In [118]:
#Normalized question 3
def find_optimal_k(df_encoded: pd.DataFrame, 
                   target_col: str, 
                   k_range: Tuple[int, int] = (1, 21), 
                   test_size: float = 0.2, 
                   random_state: int = 42) -> float:

    X = df_encoded.drop(columns=[target_col])
    y = df_encoded[target_col]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    scaler = StandardScaler()
    X_train_sc = scaler.fit_transform(X_train)
    X_test_sc = scaler.transform(X_test)

    best_k = 0
    best_score = 0
    for k in k_range:
        knn = KNeighborsClassifier(n_neighbors=((2*k)+1))
        knn.fit(X_train_sc, y_train)
        score = knn.score(X_test_sc, y_test)
        if score > best_score:
            best_k = k
            best_score = score

    knn = KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(X_train_sc, y_train)
    
    y_pred = knn.predict(X_test_sc)
    test_accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    return best_k, test_accuracy, conf_matrix, class_report

best_k, test_accuracy, conf_matrix, class_report = find_optimal_k(df_encoded, target_col='label')

print(f"Optimal k: {best_k}")
print(f"Test Accuracy: {test_accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Optimal k: 1
Test Accuracy: 0.8632075471698113
Confusion Matrix:
[[171  10]
 [ 19  12]]
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.94      0.92       181
           1       0.55      0.39      0.45        31

    accuracy                           0.86       212
   macro avg       0.72      0.67      0.69       212
weighted avg       0.85      0.86      0.85       212



For question 3 most of the testers displayed better performance such as improved accuracy, more precision and etc.

In [117]:
#Normalized question 4
def train_logistic_regression(df_encoded: pd.DataFrame) -> float:
    X = df_encoded.drop(columns=['label', 'Ind_ID'])
    y = df_encoded['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
    scaler = StandardScaler()
    X_train_sc = scaler.fit_transform(X_train)
    X_test_sc = scaler.transform(X_test)
    

    log_reg = LogisticRegression(max_iter=1000)
    log_reg.fit(X_train_sc, y_train)

    y_pred = log_reg.predict(X_test_sc)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return accuracy, report
print(accuracy)
print(report)

0.6037735849056604
              precision    recall  f1-score   support

           0       0.61      0.83      0.71       121
           1       0.57      0.30      0.39        91

    accuracy                           0.60       212
   macro avg       0.59      0.57      0.55       212
weighted avg       0.60      0.60      0.57       212



For question 4 there also seem to be no improvement when normalizing the model.