In [119]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC


In [120]:
# Read the CSV once into `credit` and leave that frame untouched;
# every cleaning step below works on the independent copy `df`.
credit = pd.read_csv("credit_scores.csv", low_memory=False)
df = credit.copy(deep=True)

### Dropping Columns: 


In [121]:
# Columns listed here are removed from the working frame before any further processing.
delete_columns = ["Name", "SSN", "ID", "Customer_ID"]
df = df.drop(columns=delete_columns)
df.head()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_of_Delayed_Payment,...,Credit_Score,Count_Auto Loan,Count_Credit-Builder Loan,Count_Personal Loan,Count_Home Equity Loan,Count_Not Specified,Count_Mortgage Loan,Count_Student Loan,Count_Debt Consolidation Loan,Count_Payday Loan
0,July,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,3.0,8,...,Good,1.0,1.0,1.0,1.0,0.0,0,0,0,0
1,February,28.0,Teacher,34847.84,3037.986667,2.0,4.0,6.0,7.0,1,...,Good,0.0,1.0,0.0,0.0,0.0,0,0,0,0
2,May,28.0,Teacher,34847.84,3037.986667,2.0,4.0,6.0,3.0,1,...,Good,0.0,1.0,0.0,0.0,0.0,0,0,0,0
3,June,28.0,Teacher,34847.84,3037.986667,2.0,4.0,6.0,3.0,0,...,Good,0.0,1.0,0.0,0.0,0.0,0,0,0,0
4,August,28.0,Teacher,34847.84,3037.986667,2.0,4.0,6.0,3.0,4,...,Good,0.0,1.0,0.0,0.0,0.0,0,0,0,0


In [122]:
# Summarise the working frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33402 entries, 0 to 33401
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Month                          33400 non-null  object 
 1   Age                            33400 non-null  float64
 2   Occupation                     33400 non-null  object 
 3   Annual_Income                  33402 non-null  float64
 4   Monthly_Inhand_Salary          33402 non-null  float64
 5   Num_Bank_Accounts              33401 non-null  float64
 6   Num_Credit_Card                33401 non-null  float64
 7   Interest_Rate                  33401 non-null  float64
 8   Delay_from_due_date            33399 non-null  float64
 9   Num_of_Delayed_Payment         33402 non-null  int64  
 10  Changed_Credit_Limit           33401 non-null  float64
 11  Num_Credit_Inquiries           33399 non-null  float64
 12  Credit_Mix                     33402 non-null 

### Data Preprocessing: Dealing with wrong types

In [123]:
# Data Preprocessing: There are features with incorrect format.
# Columns that should be numeric. Each one is scanned below for the '_'
# placeholder string. Every column is listed exactly once so that no column
# is scanned (and reported) twice.
incorrect_format = [
    'Age',
    'Annual_Income',
    'Monthly_Inhand_Salary',
    'Num_Bank_Accounts',
    'Num_Credit_Card',
    'Interest_Rate',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Changed_Credit_Limit',
    'Outstanding_Debt',
    'Amount_invested_monthly',
    'Monthly_Balance',
    'Num_Credit_Inquiries',
    'Credit_Utilization_Ratio',
    'Credit_History_Age',
    'Total_EMI_per_month',
    'Count_Auto Loan',
    'Count_Credit-Builder Loan',
    'Count_Personal Loan',
    'Count_Home Equity Loan',
    'Count_Not Specified',
    'Count_Mortgage Loan',
    'Count_Student Loan',
    'Count_Debt Consolidation Loan',
    'Count_Payday Loan',
]

# Checking which feature contains '_' as entry value.
# The boolean mask itself is tested, so the result does not depend on the
# truthiness of whatever values the mask happens to select.
for col in incorrect_format:
    if (df[col] == '_').any():
        print(f"Column {col}: contains '_' ")
    else:
        print('Everyting is ok')


Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok
Everyting is ok


### Selecting Numerical and Categorical Data

In [124]:
# Split the columns by dtype.
# 'number' selects every numeric dtype. The former 'int' alias resolves to the
# platform's C long, which is 32-bit on some platforms / NumPy versions and
# would then silently leave int64 columns out of `numerical_data`.
numerical_format = ['number']
numerical_data = df.select_dtypes(include=numerical_format).columns
categorical_data = df.select_dtypes(include='object').columns

In [125]:
# Show both column groups; the empty strings produce the blank separator lines.
print(
    "Numerical Data",
    numerical_data,
    "",
    "Categorical Data",
    "",
    categorical_data,
    sep="\n",
)

Numerical Data
Index(['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Outstanding_Debt', 'Credit_Utilization_Ratio',
       'Credit_History_Age', 'Total_EMI_per_month', 'Amount_invested_monthly',
       'Monthly_Balance', 'Count_Auto Loan', 'Count_Credit-Builder Loan',
       'Count_Personal Loan', 'Count_Home Equity Loan', 'Count_Not Specified',
       'Count_Mortgage Loan', 'Count_Student Loan',
       'Count_Debt Consolidation Loan', 'Count_Payday Loan'],
      dtype='object')

Categorical Data

Index(['Month', 'Occupation', 'Credit_Mix', 'Payment_of_Min_Amount',
       'Payment_Behaviour', 'Credit_Score'],
      dtype='object')


In [126]:
# Report the minimum, maximum and mean of every numerical column
for col in numerical_data:
    column = df[col]
    lowest, highest, average = column.min(), column.max(), column.mean()
    print(f"{col} : Min {lowest} - Max {highest} - {average}")
    

Age : Min 14.0 - Max 95.0 - 32.963502994011975
Annual_Income : Min 7005.93 - Max 24198062.0 - 180449.1165729597
Monthly_Inhand_Salary : Min 303.6454166666666 - Max 15204.633333333331 - 4011.3346753538767
Num_Bank_Accounts : Min -1.0 - Max 1798.0 - 17.670518846741114
Num_Credit_Card : Min 0.0 - Max 1499.0 - 22.108409927846473
Interest_Rate : Min 1.0 - Max 5797.0 - 77.57309661387383
Delay_from_due_date : Min -5.0 - Max 67.0 - 22.085661247342735
Num_of_Delayed_Payment : Min 0 - Max 4397 - 32.5789773067481
Changed_Credit_Limit : Min -6.44 - Max 36.29 - 10.788655728870394
Num_Credit_Inquiries : Min 0.0 - Max 2597.0 - 28.006347495433996
Outstanding_Debt : Min 0.23 - Max 4998.07 - 1510.6508349799412
Credit_Utilization_Ratio : Min 20.88125003902868 - Max 49.56451934738699 - 32.22058395632018
Credit_History_Age : Min 0.0 - Max 404.0 - 193.59627533758496
Total_EMI_per_month : Min 4.4628374669131645 - Max 82204.0 - 1403.5067912154855
Amount_invested_monthly : Min 0.0 - Max 10000.0 - 614.076343383

In [128]:
# Discard rows whose Age or Num_Bank_Accounts is negative, then renumber the rows.
# Rows where either value is NaN are kept, because `NaN < 0` evaluates to False.
negative_age = df["Age"] < 0
negative_accounts = df["Num_Bank_Accounts"] < 0
df = df[~(negative_age | negative_accounts)].reset_index(drop=True)

In [133]:
# Numerical columns that contain at least one missing value
features_null_values = [name for name in numerical_data if df[name].isna().any()]

print(features_null_values)


['Age', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Delay_from_due_date', 'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Credit_History_Age', 'Total_EMI_per_month', 'Monthly_Balance', 'Count_Auto Loan', 'Count_Credit-Builder Loan', 'Count_Personal Loan', 'Count_Home Equity Loan', 'Count_Not Specified']


### Replacing NaN Values from Numerical Columns with Mean.

In [135]:
# Checking NAN values and replacing them.
# Missing entries of the listed numerical columns are replaced by the column mean.
# NOTE(review): the imputer is fit on the whole frame, i.e. before the
# train/test split performed later in the notebook.
imputer = SimpleImputer(strategy='mean')

# fit_transform returns a bare array. The frame built from it is given df's own
# index so the write-back below lines up row by row whatever df's index looks
# like (the assignment aligns on index labels, not on position).
imputed_num = pd.DataFrame(
    imputer.fit_transform(df[features_null_values]),
    columns=features_null_values,
    index=df.index,
)

# Updating df with imputed values
df[features_null_values] = imputed_num

In [137]:
# Verify the imputation: number of missing values left in each column
remaining_nulls = df.isna().sum()
print(remaining_nulls)

Month                            2
Age                              0
Occupation                       2
Annual_Income                    0
Monthly_Inhand_Salary            0
Num_Bank_Accounts                0
Num_Credit_Card                  0
Interest_Rate                    0
Delay_from_due_date              0
Num_of_Delayed_Payment           0
Changed_Credit_Limit             0
Num_Credit_Inquiries             0
Credit_Mix                       0
Outstanding_Debt                 0
Credit_Utilization_Ratio         0
Credit_History_Age               0
Payment_of_Min_Amount            0
Total_EMI_per_month              0
Amount_invested_monthly          0
Payment_Behaviour                1
Monthly_Balance                  0
Credit_Score                     0
Count_Auto Loan                  0
Count_Credit-Builder Loan        0
Count_Personal Loan              0
Count_Home Equity Loan           0
Count_Not Specified              0
Count_Mortgage Loan              0
Count_Student Loan  

## **Categorical columns:** Checking data and doing corrections

In [140]:
# Checking if Credit_Mix contains '_' 
# Every distinct Credit_Mix value is listed with its row count, so a stray '_'
# category would show up as its own entry.
df["Credit_Mix"].value_counts() 

Credit_Mix
Standard    15514
Good         8948
Bad          8935
Name: count, dtype: int64

In [142]:
# Row count per distinct Payment_Behaviour value (missing values are not counted by default).
df["Payment_Behaviour"].value_counts()  

Payment_Behaviour
Low_spent_Small_value_payments      9175
High_spent_Medium_value_payments    6407
Low_spent_Medium_value_payments     4938
High_spent_Large_value_payments     4926
High_spent_Small_value_payments     4105
Low_spent_Large_value_payments      3845
Name: count, dtype: int64

In [143]:
# Checking our target values.
# The counts show how the rows are distributed over the Credit_Score classes.
df["Credit_Score"].value_counts()

Credit_Score
Standard    17755
Poor        10399
Good         5243
Name: count, dtype: int64

In [144]:
# Re-inspect dtypes and non-null counts after the row drops and numerical imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33397 entries, 0 to 33396
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Month                          33395 non-null  object 
 1   Age                            33397 non-null  float64
 2   Occupation                     33395 non-null  object 
 3   Annual_Income                  33397 non-null  float64
 4   Monthly_Inhand_Salary          33397 non-null  float64
 5   Num_Bank_Accounts              33397 non-null  float64
 6   Num_Credit_Card                33397 non-null  float64
 7   Interest_Rate                  33397 non-null  float64
 8   Delay_from_due_date            33397 non-null  float64
 9   Num_of_Delayed_Payment         33397 non-null  int64  
 10  Changed_Credit_Limit           33397 non-null  float64
 11  Num_Credit_Inquiries           33397 non-null  float64
 12  Credit_Mix                     33397 non-null 

In [145]:
# Row count per distinct Month value (missing values are not counted by default).
df["Month"].value_counts()

Month
January     4288
March       4222
July        4213
August      4181
June        4143
May         4136
April       4110
February    4102
Name: count, dtype: int64

In [146]:
# Categorical columns that contain at least one missing value
features_null_values_cat = [
    name for name in categorical_data if df[name].isnull().sum() != 0
]

print(features_null_values_cat)


['Month', 'Occupation', 'Payment_Behaviour']


In [147]:
# 'Credit_Score' is our target value

# Replacing NaN with most frequent values
imputer = SimpleImputer(strategy='most_frequent')

# The imputed frame carries df's own index so that a later column assignment
# back into df aligns row by row whatever df's index looks like.
imputed_cat = pd.DataFrame(
    imputer.fit_transform(df[features_null_values_cat]),
    columns=features_null_values_cat,
    index=df.index,
)

In [148]:
# Write the imputed categorical columns back into df (the assignment aligns on the row index).
df[features_null_values_cat] = imputed_cat

In [149]:
# Checking null / NaN values
# Per-column count of missing entries; all zeros means both imputations covered every gap.
df.isnull().sum()

Month                            0
Age                              0
Occupation                       0
Annual_Income                    0
Monthly_Inhand_Salary            0
Num_Bank_Accounts                0
Num_Credit_Card                  0
Interest_Rate                    0
Delay_from_due_date              0
Num_of_Delayed_Payment           0
Changed_Credit_Limit             0
Num_Credit_Inquiries             0
Credit_Mix                       0
Outstanding_Debt                 0
Credit_Utilization_Ratio         0
Credit_History_Age               0
Payment_of_Min_Amount            0
Total_EMI_per_month              0
Amount_invested_monthly          0
Payment_Behaviour                0
Monthly_Balance                  0
Credit_Score                     0
Count_Auto Loan                  0
Count_Credit-Builder Loan        0
Count_Personal Loan              0
Count_Home Equity Loan           0
Count_Not Specified              0
Count_Mortgage Loan              0
Count_Student Loan  

# **Scaling / Encoding Values:**
Transforming values:
Scaling is used for numerical data and Encoding for categorical data.

In [150]:
# Printing Numerical data
#for col in numerical_data:
#    print(f"{col}")

# Scaling values
# Disabled here: a StandardScaler step is part of the Pipeline defined later in this notebook.
#df[numerical_data] = StandardScaler().fit_transform(df[numerical_data])

In [151]:
# Checking results
# Display the first five rows of the cleaned frame.
df.head()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_of_Delayed_Payment,...,Credit_Score,Count_Auto Loan,Count_Credit-Builder Loan,Count_Personal Loan,Count_Home Equity Loan,Count_Not Specified,Count_Mortgage Loan,Count_Student Loan,Count_Debt Consolidation Loan,Count_Payday Loan
0,July,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,3.0,8,...,Good,1.0,1.0,1.0,1.0,0.0,0,0,0,0
1,February,28.0,Teacher,34847.84,3037.986667,2.0,4.0,6.0,7.0,1,...,Good,0.0,1.0,0.0,0.0,0.0,0,0,0,0
2,May,28.0,Teacher,34847.84,3037.986667,2.0,4.0,6.0,3.0,1,...,Good,0.0,1.0,0.0,0.0,0.0,0,0,0,0
3,June,28.0,Teacher,34847.84,3037.986667,2.0,4.0,6.0,3.0,0,...,Good,0.0,1.0,0.0,0.0,0.0,0,0,0,0
4,August,28.0,Teacher,34847.84,3037.986667,2.0,4.0,6.0,3.0,4,...,Good,0.0,1.0,0.0,0.0,0.0,0,0,0,0


In [152]:
# Checking categorical values
print(categorical_data)

# All object-typed columns, each listed once. 'Month' and 'Credit_Score' are
# already part of `categorical_data`, so nothing has to be appended.
categorical = list(categorical_data)

# Converting values Credit_Score: the target gets an explicit, fixed coding.
credit_score_codes = {
    'Poor': 0,
    'Good': 1,
    'Standard': 2
}
# Guarded so that re-running the cell does not push the already-encoded integers
# through the string-keyed mapping (which would turn every value into NaN).
if not pd.api.types.is_numeric_dtype(df['Credit_Score']):
    encoded_target = df['Credit_Score'].map(credit_score_codes)
    unmapped = encoded_target.isna()
    if unmapped.any():
        # Fail loudly instead of letting NaN targets reach the model.
        unexpected = df.loc[unmapped, 'Credit_Score'].unique().tolist()
        raise ValueError(f"Unexpected Credit_Score labels: {unexpected}")
    df['Credit_Score'] = encoded_target

# Applying Encoder to the feature columns only (the target is handled above).
# Each fitted encoder is kept so that the integer codes can be translated back
# to the original labels, e.g. label_encoders['Month'].classes_.
# NOTE: run this once after a fresh load; a second run would refit the encoders
# on the integer codes and lose the original labels.
label_encoders = {}
for col in categorical:
    if col == 'Credit_Score':
        continue
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

Index(['Month', 'Occupation', 'Credit_Mix', 'Payment_of_Min_Amount',
       'Payment_Behaviour', 'Credit_Score'],
      dtype='object')


In [153]:
# Display the last five rows to confirm the categorical columns now hold integer codes.
df.tail()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_of_Delayed_Payment,...,Credit_Score,Count_Auto Loan,Count_Credit-Builder Loan,Count_Personal Loan,Count_Home Equity Loan,Count_Not Specified,Count_Mortgage Loan,Count_Student Loan,Count_Debt Consolidation Loan,Count_Payday Loan
33392,3,28.0,1,20002.88,1929.906667,10.0,8.0,29.0,35.0,26,...,0,1.0,0.0,1.0,0.0,0.0,1,2,0,0
33393,2,28.0,1,20002.88,1929.906667,10.0,8.0,29.0,33.0,25,...,0,1.0,0.0,1.0,0.0,0.0,1,2,0,0
33394,4,28.0,1,20002.88,1929.906667,10.0,8.0,29.0,33.0,26,...,2,1.0,0.0,1.0,0.0,0.0,1,2,0,0
33395,1,29.0,1,20002.88,1929.906667,10.0,8.0,29.0,33.0,25,...,2,1.0,0.0,1.0,0.0,0.0,1,2,0,0
33396,5,25.0,9,39628.99,3359.415833,4.0,6.0,5729.0,27.0,6,...,0,1.0,0.0,0.0,0.0,0.0,0,1,0,0


# Splitting our data: Training and Test

In [154]:
# Separate the target column from the predictors
target_column = 'Credit_Score'
y = df[target_column]
X = df.drop(columns=target_column)

In [155]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Creating our SVC Model

In [156]:
# Modelling pipeline: standardise the features, then fit a support-vector classifier
svc_steps = [
    ('scaler', StandardScaler()),
    ('svc', SVC()),
]
pipeline = Pipeline(steps=svc_steps)

In [157]:
# Hyperparameter candidates for the 'svc' pipeline step
# (the `svc__` prefix routes each value to that step).
param_grid = dict(
    svc__kernel=['rbf', 'linear'],
    svc__C=[0.01, 10, 20],
)


In [159]:
n_iter_search = 6  # Number of hyperparameter combinations to try

# Search configuration collected in one place: 3-fold cross-validation scored by
# accuracy, all CPU cores, fixed seed for reproducible sampling.
search_settings = {
    'param_distributions': param_grid,
    'n_iter': n_iter_search,
    'cv': 3,
    'scoring': 'accuracy',
    'n_jobs': -1,
    'random_state': 1,
}
random_search = RandomizedSearchCV(pipeline, **search_settings)



In [160]:
# Run the hyperparameter search on the training split only.
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print("Best parameters found: ", best_params)

Best parameters found:  {'svc__kernel': 'rbf', 'svc__C': 10}


In [161]:
# Predict on the test set
# (the fitted search object predicts with its best estimator)
y_pred = random_search.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy: ", accuracy)


Test set accuracy:  0.6901197604790419


In [162]:
# Train the final model on the complete data set and persist it.
# A fresh clone is fitted instead of `random_search.best_estimator_` itself:
# refitting that object in place would also change the model that
# `random_search.predict` uses, so re-running the evaluation cell afterwards
# would score a model that has already seen the test rows.
best_model = clone(random_search.best_estimator_)
best_model.fit(X, y)
joblib.dump(best_model, 'best_svm_model.pkl')


['best_svm_model.pkl']

In [169]:
# Distinct values currently stored in each column of `categorical_data`
unique_classes = {col: df[col].unique() for col in categorical_data}

In [170]:
# Print each column name followed by the distinct values collected for it
for column_name, column_values in unique_classes.items():
    print("Column:", column_name)
    print("Unique Values:", column_values)

Column: Month
Unique Values: [4 2 7 5 1 3 0 6]
Column: Occupation
Unique Values: [12 13  4  5  7  3  6  8 10  9  0  1 11 14  2]
Column: Credit_Mix
Unique Values: [1 2 0]
Column: Payment_of_Min_Amount
Unique Values: [0 1]
Column: Payment_Behaviour
Unique Values: [5 0 1 2 3 4]
Column: Credit_Score
Unique Values: [1 2 0]


In [171]:
# Display the first five rows of the training predictors (the row labels are the original df indices).
X_train.head()

Unnamed: 0,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Delay_from_due_date,Num_of_Delayed_Payment,...,Monthly_Balance,Count_Auto Loan,Count_Credit-Builder Loan,Count_Personal Loan,Count_Home Equity Loan,Count_Not Specified,Count_Mortgage Loan,Count_Student Loan,Count_Debt Consolidation Loan,Count_Payday Loan
19365,0,22.0,2,14274.32,907.526667,4.0,4.0,12.0,14.0,9,...,254.75282,0.0,0.0,0.0,0.0,0.0,1,0,2,0
9182,0,20.0,5,29892.37,2664.030833,7.0,3.0,13.0,7.0,8,...,304.521176,0.0,2.0,0.0,0.0,0.0,0,0,0,0
31865,6,51.0,3,24251.49,1991.9575,8.0,4.0,14.0,24.0,20,...,357.181612,0.0,0.0,0.0,0.0,0.0,0,1,0,0
13707,0,37.0,1,28496.3,2357.691667,7.0,10.0,21.0,12.0,20,...,392.815039,0.0,0.0,0.0,0.0,0.0,1,0,0,1
8351,0,25.0,11,15701.03,1213.419167,8.0,10.0,30.0,22.0,20,...,251.209978,2.0,0.0,1.0,1.0,0.0,1,0,2,2
