# TASK 1: Credit Scoring Model 
Objective: Predict an individual's creditworthiness using past financial data. 

Approach: Use classification algorithms like Logistic Regression, Decision Trees, or Random Forest. 

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn .metrics import f1_score,accuracy_score,precision_score,recall_score,roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle


In [32]:
columns=[
    "checking_status",
    "duration",
    "credit_history",
    "purpose",
    "credit_amount",
    "savings_status",
    "employment",
    "installment_commitment",
    "personal_status",
    "other_parties",
    "residence_since",
    "property_magnitude",
    "age",
    "other_payment_plans",
    "housing",
    "existing_credits",
    "job",
    "num_dependents",
    "own_telephone",
    "foreign_worker",
    "class"
]

In [33]:
df = pd.read_csv('german.data', header=None, delim_whitespace=True, names=columns)

  df = pd.read_csv('german.data', header=None, delim_whitespace=True, names=columns)


In [34]:
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201,2


In [35]:
print(df.checking_status.unique())
print(df.credit_history.unique())
print(df.purpose.unique())
print(df.savings_status.unique())
print(df.employment.unique())
print(df.personal_status.unique())
print(df.other_parties.unique())
print(df.property_magnitude.unique())
print(df.other_payment_plans.unique())
print(df.housing.unique())
print(df.job.unique())
print(df.own_telephone.unique())
print(df.foreign_worker.unique())

['A11' 'A12' 'A14' 'A13']
['A34' 'A32' 'A33' 'A30' 'A31']
['A43' 'A46' 'A42' 'A40' 'A41' 'A49' 'A44' 'A45' 'A410' 'A48']
['A65' 'A61' 'A63' 'A64' 'A62']
['A75' 'A73' 'A74' 'A71' 'A72']
['A93' 'A92' 'A91' 'A94']
['A101' 'A103' 'A102']
['A121' 'A122' 'A124' 'A123']
['A143' 'A141' 'A142']
['A152' 'A153' 'A151']
['A173' 'A172' 'A174' 'A171']
['A192' 'A191']
['A201' 'A202']


In [36]:
df['checking_status']=df['checking_status'].str.split('A').str[1]
df['credit_history']=df['credit_history'].str.split('A').str[1]
df['purpose']=df['purpose'].str.split('A').str[1]
df['savings_status']=df['savings_status'].str.split('A').str[1]
df['employment']=df['employment'].str.split('A').str[1]
df['personal_status']=df['personal_status'].str.split('A').str[1]
df['other_parties']=df['other_parties'].str.split('A').str[1]
df['property_magnitude']=df['property_magnitude'].str.split('A').str[1]
df['other_payment_plans']=df['other_payment_plans'].str.split('A').str[1]
df['housing']=df['housing'].str.split('A').str[1]
df['job']=df['job'].str.split('A').str[1]
df['own_telephone']=df['own_telephone'].str.split('A').str[1]
df['foreign_worker']=df['foreign_worker'].str.split('A').str[1]


In [37]:
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,11,6,34,43,1169,65,75,4,93,101,4,121,67,143,152,2,173,1,192,201,1
1,12,48,32,43,5951,61,73,2,92,101,2,121,22,143,152,1,173,1,191,201,2
2,14,12,34,46,2096,61,74,2,93,101,3,121,49,143,152,1,172,2,191,201,1
3,11,42,32,42,7882,61,74,2,93,103,4,122,45,143,153,1,173,2,191,201,1
4,11,24,33,40,4870,61,73,3,93,101,4,124,53,143,153,2,173,2,191,201,2


In [38]:
df.corr()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
checking_status,1.0,-0.072013,0.192191,-0.061804,-0.042705,0.222867,0.106339,-0.00528,0.043261,-0.127737,-0.042234,-0.03226,0.059751,0.046841,0.022424,0.076005,0.040663,-0.014145,0.066296,-0.026758,-0.350847
duration,-0.072013,1.0,-0.077186,0.112326,0.624984,0.047661,0.057381,0.074749,0.014789,-0.02449,0.034067,0.303971,-0.036136,-0.054884,0.157049,-0.011284,0.21091,-0.023834,0.164718,-0.138196,0.214927
credit_history,0.192191,-0.077186,1.0,-0.02726,-0.059905,0.039058,0.138225,0.044375,0.042171,-0.040676,0.063198,-0.053777,0.147086,0.121973,0.062095,0.437066,0.01035,0.01155,0.05237,0.013873,-0.228785
purpose,-0.061804,0.112326,-0.02726,1.0,0.193361,-0.019823,-0.033179,-0.026222,-0.015018,0.078943,0.036959,0.083797,0.041528,-0.147999,0.083342,0.021124,0.098898,0.001245,0.117988,0.019634,0.028674
credit_amount,-0.042705,0.624984,-0.059905,0.193361,1.0,0.06463,-0.008367,-0.271316,-0.016091,-0.027832,0.028926,0.311599,0.032716,-0.046008,0.135632,0.020795,0.285385,0.017142,0.276995,-0.05005,0.154739
savings_status,0.222867,0.047661,0.039058,-0.019823,0.06463,1.0,0.12095,0.021993,0.017349,-0.105069,0.091424,0.018948,0.084245,0.001908,0.006505,-0.021644,0.011709,0.027514,0.087208,0.007095,-0.178943
employment,0.106339,0.057381,0.138225,-0.033179,-0.008367,0.12095,1.0,0.126161,0.111278,-0.008116,0.245081,0.087187,0.256227,-0.040154,0.111126,0.125791,0.101225,0.097192,0.060518,-0.027232,-0.116002
installment_commitment,-0.00528,0.074749,0.044375,-0.026222,-0.271316,0.021993,0.126161,1.0,0.119308,-0.011398,0.049302,0.053391,0.058266,-0.000983,0.089405,0.021669,0.097755,-0.071207,0.014413,-0.090024,0.072404
personal_status,0.043261,0.014789,0.042171,-0.015018,-0.016091,0.017349,0.111278,0.119308,1.0,0.050634,-0.027269,-0.00694,0.007783,-0.036765,0.099579,0.064672,-0.011956,0.122165,0.027275,0.065618,-0.088184
other_parties,-0.127737,-0.02449,-0.040676,0.078943,-0.027832,-0.105069,-0.008116,-0.011398,0.050634,1.0,-0.025678,-0.15545,-0.029873,-0.059023,-0.065889,-0.025447,-0.057963,0.0204,-0.075035,0.117999,-0.025137


In [43]:
## Removing unwanted columns 
df.drop(['purpose','installment_commitment', 'personal_status', 'other_parties',
       'residence_since', 'age',
       'housing', 'existing_credits', 'job', 'num_dependents', 'own_telephone',
       'foreign_worker'],inplace=True,axis=1)

In [47]:
## Save the modified dataset
df.to_csv("ModifiedGerman.csv")

In [48]:
X=df.drop('class',axis=1)
y=df['class']

In [51]:
## Performing train test split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.3,random_state=42)

In [52]:
## Performing standardization

scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [54]:
## Training and testing models


models={
    "Logisitic Regression":LogisticRegression(),
    "Random Forest":RandomForestClassifier(),
    "Gradient Boost":DecisionTreeClassifier(),
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))

    
    print('='*35)
    print('\n')

Logisitic Regression
Model performance for Training set
- Accuracy: 0.7571
- F1 score: 0.7408
- Precision: 0.7861
- Recall: 0.8982
- Roc Auc Score: 0.6620
----------------------------------
Model performance for Test set
- Accuracy: 0.7500
- F1 score: 0.7266
- Precision: 0.7702
- Recall: 0.9139
- Roc Auc Score: 0.6438


Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.7667
- F1 score: 0.7501
- Precision: 0.7884
- Recall: 0.9091
- Roc Auc Score: 0.6743


Gradient Boost
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.7233
- F1 score: 0.7211
- Precision: 0.7944
- Recall: 0.8134
- Roc Auc Score: 0.6649




In [60]:
## Out of all random forest used to perform well so we choose random forest for hyper-parameter tuning 

rf_params = {"max_depth": [5, 8, 15, None, 10],
             "max_features": [5, 7, 8],
             "min_samples_split": [2, 8, 15, 20],
             "n_estimators": [100, 200, 500, 1000]}

In [61]:
randomcv_models = [
                   ("Random-Forest", RandomForestClassifier(), rf_params)
                   
                   ]

In [70]:
model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                   param_distributions=params,
                                   n_iter=100,
                                   cv=3,
                                   n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

---------------- Best Params for Random-Forest -------------------
{'n_estimators': 500, 'min_samples_split': 8, 'max_features': 5, 'max_depth': None}


In [71]:
models={
    
    "Random Forest":RandomForestClassifier(n_estimators=1000,min_samples_split=8,
                                          max_features=8,max_depth=15)
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) # Calculate Accuracy
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') # Calculate F1-score
    model_train_precision = precision_score(y_train, y_train_pred) # Calculate Precision
    model_train_recall = recall_score(y_train, y_train_pred) # Calculate Recall
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)


    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) # Calculate Accuracy
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') # Calculate F1-score
    model_test_precision = precision_score(y_test, y_test_pred) # Calculate Precision
    model_test_recall = recall_score(y_test, y_test_pred) # Calculate Recall
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) #Calculate Roc


    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))

    
    
    print('----------------------------------')
    
    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))

    
    print('='*35)
    print('\n')
    
    with open("randomforest.pkl",'wb') as file:
         pickle.dump(random,file)
   

Random Forest
Model performance for Training set
- Accuracy: 0.9157
- F1 score: 0.9130
- Precision: 0.9075
- Recall: 0.9796
- Roc Auc Score: 0.8726
----------------------------------
Model performance for Test set
- Accuracy: 0.7600
- F1 score: 0.7444
- Precision: 0.7866
- Recall: 0.8995
- Roc Auc Score: 0.6695


