In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import plotly.express as px
import requests
from io import StringIO

# Loading the data from the provided URL
url = "https://github.com/dsrscientist/DSData/raw/master/loan_prediction.csv"
# The timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() aborts on an HTTP error instead of letting an error page
# be parsed as if it were the CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = StringIO(response.text)
df = pd.read_csv(data)

# Preprocessing
# Loan_ID is dropped before modelling.
df = df.drop(columns='Loan_ID')

# Impute missing values: the most frequent value for these columns, the median
# for LoanAmount. A single DataFrame.fillna(dict) call replaces the per-column
# `df[col].fillna(..., inplace=True)` pattern, which operates on a column
# selection (chained assignment) and does not reliably update `df` on newer
# pandas versions.
# NOTE(review): these statistics are computed on the full dataset, i.e. before
# the train/test split performed further down, so test rows influence them.
mode_filled_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed',
                    'Loan_Amount_Term', 'Credit_History']
fill_values = {col: df[col].mode()[0] for col in mode_filled_cols}
fill_values['LoanAmount'] = df['LoanAmount'].median()
df = df.fillna(value=fill_values)

# Convert categorical columns to numerical using one-hot encoding
cat_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
df = pd.get_dummies(df, columns=cat_cols)

# Separate the target column (y) from the remaining feature columns (X)
y = df['Loan_Status']
X = df.drop(columns=['Loan_Status'])

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scale the numerical columns using StandardScaler
scaler = StandardScaler()
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# Take explicit copies before assigning columns: the frames returned by
# train_test_split are selections of X, and writing into them directly can
# raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training rows only, then apply the same fitted transform to the test rows.
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Model Training: fit a support-vector classifier on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
model = SVC(random_state=42).fit(X_train, y_train)

# Model Evaluation: predictions for the held-out rows and the mean accuracy on them
y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

# Build a separate frame holding the test features plus the predicted label.
# X_test is already a DataFrame; assign() returns a new frame with the extra
# column, so X_test itself is left unchanged (re-wrapping it in pd.DataFrame(...)
# does not guarantee an independent copy on every pandas version).
X_test_df = X_test.assign(Loan_Status_Predicted=y_pred)
print(X_test_df.head())

# Visualization
# Count the rows per Loan_Status value once and pass the counts as `values`.
# Without `values`, the pie only counts how often each name occurs in `names`
# (once each here), so every slice would be drawn the same size.
loan_status_counts = df['Loan_Status'].value_counts()
fig_loan_status = px.pie(values=loan_status_counts.values,
                         names=loan_status_counts.index,
                         title='Loan Approval Status')
fig_loan_status.show()

Accuracy: 0.7886178861788617
     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
350         0.604085          -0.528127    0.926826          0.287611   
377        -0.201226          -0.528127   -0.209201          0.287611   
163        -0.225353          -0.072293    0.115378          0.287611   
609        -0.439124          -0.528127   -0.893136          0.287611   
132        -0.469831          -0.528127   -0.904728          0.287611   

     Credit_History  Gender_Female  Gender_Male  Married_No  Married_Yes  \
350        0.407763              0            1           0            1   
377        0.407763              0            1           0            1   
163        0.407763              0            1           0            1   
609        0.407763              1            0           1            0   
132        0.407763              0            1           1            0   

     Dependents_0  ...  Dependents_2  Dependents_3+  Education_Graduate  \
