### Muhammad Awais Asghar
### SU92-BSAIM-F24-065

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix




In [2]:
class DataLoader:
    def __init__(self, filepath):
        self.filepath = filepath
    
    def load_data(self):
        df = pd.read_csv(self.filepath)
        print("Data Loaded")

        print(f"\nDataset Shape: {df.shape[0]} rows, {df.shape[1]} columns")

        print("\nFirst 5 rows:")
        print(df.head())

        print("\nRandom Sample Rows:")
        print(df.sample(5))

        print("\nDataset Info:")
        print(df.info())

        print("\nStatistical Summary:")
        print(df.describe(include='all'))

        print("\nMissing Values per Column:")
        print(df.isnull().sum())
        return df

loader = DataLoader("Loan.csv")
df = loader.load_data()

Data Loaded

Dataset Shape: 614 rows, 13 columns

First 5 rows:
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1 

In [3]:
class Preprocessor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoders = {}
    
    def clean_data(self, df):
        df = df.fillna(method='ffill')
        print("Missing values filled")
        return df
    
    def encode_features(self, df):
        for col in df.select_dtypes(include=['object']).columns:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            self.label_encoders[col] = le
            print(f"Encoded: {col}")
        return df
    
    def scale_features(self, X):
        scaled = self.scaler.fit_transform(X)
        print("Features scaled")
        print("Scaled sample (first 5 rows):\n", scaled[:5])
        return scaled
    
pre = Preprocessor()
df = pre.clean_data(df)
df = pre.encode_features(df)

X = df.drop("Loan_Status", axis=1)
y = df["Loan_Status"]

X_scaled = pre.scale_features(X)

    
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

Missing values filled
Encoded: Loan_ID
Encoded: Gender
Encoded: Married
Encoded: Dependents
Encoded: Education
Encoded: Self_Employed
Encoded: Property_Area
Encoded: Loan_Status
Features scaled
Scaled sample (first 5 rows):
 [[-1.72923217  0.47749346 -1.36717185 -0.75217674 -0.52836225 -0.40358244
   0.07299082 -0.55448733         nan  0.28003663  0.44371285  1.22329839]
 [-1.7235903   0.47749346  0.73143694  0.23676655 -0.52836225 -0.40358244
  -0.13441195 -0.03873155 -0.22165521  0.28003663  0.44371285 -1.31851281]
 [-1.71794844  0.47749346  0.73143694 -0.75217674 -0.52836225  2.47780848
  -0.39374734 -0.55448733 -0.93070561  0.28003663  0.44371285  1.22329839]
 [-1.71230657  0.47749346  0.73143694 -0.75217674  1.89264089 -0.40358244
  -0.46206247  0.2519796  -0.31314558  0.28003663  0.44371285  1.22329839]
 [-1.7066647   0.47749346 -1.36717185 -0.75217674 -0.52836225 -0.40358244
   0.09772844 -0.55448733 -0.07298335  0.28003663  0.44371285  1.22329839]]


  df = df.fillna(method='ffill')


In [4]:
class ModelTrainer:
    def __init__(self, model=RandomForestClassifier(random_state=42)):
        self.model = model
    
    def train(self, X_train, y_train):
        self.model.fit(X_train, y_train)
        print("Model Trained")
        return self.model
    
    def evaluate(self, X_test, y_test):
        preds = self.model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        cm = confusion_matrix(y_test, preds)
        print(f"Accuracy: {acc:.2f}")
        print("Confusion Matrix:\n", cm)
        return acc, cm
    
trainer = ModelTrainer()
model = trainer.train(X_train, y_train)
acc, cm = trainer.evaluate(X_test, y_test)

Model Trained
Accuracy: 0.80
Confusion Matrix:
 [[21 22]
 [ 2 78]]


In [5]:
class ModelSaver:
    def __init__(self, model, scaler, encoders, features):
        self.model = model
        self.scaler = scaler
        self.encoders = encoders
        self.features = features
    
    def save(self, filename="Loan_Prediction_model.pkl"):
        with open(filename, 'wb') as f:
            pickle.dump({
                "model": self.model,
                "scaler": self.scaler,
                "encoders": self.encoders,
                "features": self.features
            }, f)
        print(f"Model + Preprocessors saved as {filename}")


saver = ModelSaver(model, pre.scaler, pre.label_encoders, X.columns.tolist())
saver.save()


Model + Preprocessors saved as Loan_Prediction_model.pkl
