**logistic regression**

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [34]:
# Load the dataset
file_path = '/content/loan_approval_dataset.csv'
data = pd.read_csv(file_path)

# Strip leading spaces from column names
data.columns = data.columns.str.strip()

# Encode categorical variables
label_encoder = LabelEncoder()
data['education'] = label_encoder.fit_transform(data['education'])
data['self_employed'] = label_encoder.fit_transform(data['self_employed'])
data['loan_status'] = label_encoder.fit_transform(data['loan_status'])

# Define features and target variable
X = data.drop(['loan_id', 'loan_status'], axis=1)
y = data['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.6288056206088993
              precision    recall  f1-score   support

           0       0.63      1.00      0.77       536
           1       0.67      0.01      0.01       318

    accuracy                           0.63       854
   macro avg       0.65      0.50      0.39       854
weighted avg       0.64      0.63      0.49       854



**Decision Trees:**

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [36]:
# Load the dataset
file_path = '/content/loan_approval_dataset.csv'
data = pd.read_csv(file_path)

# Strip leading spaces from column names
data.columns = data.columns.str.strip()

# Encode categorical variables
label_encoder = LabelEncoder()
data['education'] = label_encoder.fit_transform(data['education'])
data['self_employed'] = label_encoder.fit_transform(data['self_employed'])
data['loan_status'] = label_encoder.fit_transform(data['loan_status'])

# Define features and target variable
X = data.drop(['loan_id', 'loan_status'], axis=1)
y = data['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the decision tree classifier
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = tree_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)


Accuracy: 0.977751756440281
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       536
           1       0.97      0.97      0.97       318

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854



**Random Forest**

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = '/content/loan_approval_dataset.csv'
data = pd.read_csv(file_path)

# Strip leading spaces from column names
data.columns = data.columns.str.strip()

# Encode categorical variables
label_encoder = LabelEncoder()
data['education'] = label_encoder.fit_transform(data['education'])
data['self_employed'] = label_encoder.fit_transform(data['self_employed'])
data['loan_status'] = label_encoder.fit_transform(data['loan_status'])

# Define features and target variable
X = data.drop(['loan_id', 'loan_status'], axis=1)
y = data['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)


Accuracy: 0.977751756440281
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       536
           1       0.98      0.96      0.97       318

    accuracy                           0.98       854
   macro avg       0.98      0.97      0.98       854
weighted avg       0.98      0.98      0.98       854



**Gradient Boosting Machines (GBM)**

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = '/content/loan_approval_dataset.csv'
data = pd.read_csv(file_path)

# Strip leading spaces from column names
data.columns = data.columns.str.strip()

# Encode categorical variables
label_encoder = LabelEncoder()
data['education'] = label_encoder.fit_transform(data['education'])
data['self_employed'] = label_encoder.fit_transform(data['self_employed'])
data['loan_status'] = label_encoder.fit_transform(data['loan_status'])

# Define features and target variable
X = data.drop(['loan_id', 'loan_status'], axis=1)
y = data['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Gradient Boosting Machine classifier
gbm_clf = GradientBoostingClassifier(random_state=42)
gbm_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = gbm_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)


Accuracy: 0.977751756440281
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       536
           1       0.97      0.97      0.97       318

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854



**Support Vector Machines (SVM)**

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = '/content/loan_approval_dataset.csv'
data = pd.read_csv(file_path)

# Strip leading spaces from column names
data.columns = data.columns.str.strip()

# Encode categorical variables
label_encoder = LabelEncoder()
data['education'] = label_encoder.fit_transform(data['education'])
data['self_employed'] = label_encoder.fit_transform(data['self_employed'])
data['loan_status'] = label_encoder.fit_transform(data['loan_status'])

# Define features and target variable
X = data.drop(['loan_id', 'loan_status'], axis=1)
y = data['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for SVM)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the SVM classifier
svm_clf = SVC()
svm_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)


Accuracy: 0.9238875878220141
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       536
           1       0.88      0.92      0.90       318

    accuracy                           0.92       854
   macro avg       0.92      0.92      0.92       854
weighted avg       0.92      0.92      0.92       854



**k-Nearest Neighbors**

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = '/content/loan_approval_dataset.csv'
data = pd.read_csv(file_path)

# Strip leading spaces from column names
data.columns = data.columns.str.strip()

# Encode categorical variables
label_encoder = LabelEncoder()
data['education'] = label_encoder.fit_transform(data['education'])
data['self_employed'] = label_encoder.fit_transform(data['self_employed'])
data['loan_status'] = label_encoder.fit_transform(data['loan_status'])

# Define features and target variable
X = data.drop(['loan_id', 'loan_status'], axis=1)
y = data['loan_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for k-NN)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the k-NN classifier
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)


Accuracy: 0.892271662763466
              precision    recall  f1-score   support

           0       0.93      0.90      0.91       536
           1       0.84      0.88      0.86       318

    accuracy                           0.89       854
   macro avg       0.88      0.89      0.89       854
weighted avg       0.89      0.89      0.89       854



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Load the dataset
file_path = 'path_to_your_dataset/loan_approval_dataset.csv'  # replace with your file path
loan_data = pd.read_csv(file_path)

# Remove leading spaces from column names
loan_data.columns = loan_data.columns.str.strip()

# Define the target variable and features
target = 'loan_amount'
features = loan_data.drop(columns=[target, 'loan_id', 'loan_status'])

# Separate the target variable
X = features
y = loan_data[target]

# Define the preprocessing steps for numeric and categorical features
numeric_features = features.select_dtypes(include=['int64']).columns.tolist()
categorical_features = features.select_dtypes(include=['object']).columns.tolist()

# Create preprocessing pipelines for both numeric and categorical data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine the numeric and categorical transformers into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a Ridge Regression model pipeline
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('ridge', Ridge(alpha=1.0))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Ridge Regression model
ridge_pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = ridge_pipeline.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f"Root Mean Squared Error (RMSE): {rmse}")

# Save the trained model to a file (optional)
import joblib
joblib.dump(ridge_pipeline, 'ridge_regression_model.pkl')


**Ridge Regression**

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Load the dataset
file_path = '/content/loan_approval_dataset.csv'  # replace with your file path
loan_data = pd.read_csv(file_path)

# Remove leading spaces from column names
loan_data.columns = loan_data.columns.str.strip()

# Define the target variable and features
target = 'loan_amount'
features = loan_data.drop(columns=[target, 'loan_id', 'loan_status'])

# Separate the target variable
X = features
y = loan_data[target]

# Define the preprocessing steps for numeric and categorical features
numeric_features = features.select_dtypes(include=['int64']).columns.tolist()
categorical_features = features.select_dtypes(include=['object']).columns.tolist()

# Create preprocessing pipelines for both numeric and categorical data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine the numeric and categorical transformers into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a Ridge Regression model pipeline
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('ridge', Ridge(alpha=1.0))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Ridge Regression model
ridge_pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = ridge_pipeline.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f"Root Mean Squared Error (RMSE): {rmse}")

# Save the trained model to a file (optional)
import joblib
joblib.dump(ridge_pipeline, 'ridge_regression_model.pkl')


Root Mean Squared Error (RMSE): 3450969.894766542


['ridge_regression_model.pkl']

**Lasso Regression**

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Load the dataset
file_path = '/content/loan_approval_dataset.csv'  # Replace with your file path
loan_data = pd.read_csv(file_path)

# Remove leading spaces from column names
loan_data.columns = loan_data.columns.str.strip()

# Define the target variable and features
target = 'loan_amount'
features = loan_data.drop(columns=[target, 'loan_id', 'loan_status'])

# Separate the target variable
X = features
y = loan_data[target]

# Define the preprocessing steps for numeric and categorical features
numeric_features = features.select_dtypes(include=['int64']).columns.tolist()
categorical_features = features.select_dtypes(include=['object']).columns.tolist()

# Create preprocessing pipelines for both numeric and categorical data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine the numeric and categorical transformers into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a Lasso Regression model pipeline
lasso_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lasso', Lasso(alpha=1.0))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Lasso Regression model
lasso_pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lasso_pipeline.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f"Root Mean Squared Error (RMSE): {rmse}")

# Save the trained model to a file (optional)
import joblib
joblib.dump(lasso_pipeline, 'lasso_regression_model.pkl')


Root Mean Squared Error (RMSE): 3451900.4422268565


['lasso_regression_model.pkl']

*Elastic net regression ***

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

# Load the dataset
file_path = '/content/loan_approval_dataset.csv'  # Replace with your file path
loan_data = pd.read_csv(file_path)

# Remove leading spaces from column names
loan_data.columns = loan_data.columns.str.strip()

# Define the target variable and features
target = 'loan_amount'
features = loan_data.drop(columns=[target, 'loan_id', 'loan_status'])

# Separate the target variable
X = features
y = loan_data[target]

# Define the preprocessing steps for numeric and categorical features
numeric_features = features.select_dtypes(include=['int64']).columns.tolist()
categorical_features = features.select_dtypes(include=['object']).columns.tolist()

# Create preprocessing pipelines for both numeric and categorical data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine the numeric and categorical transformers into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create an Elastic Net Regression model pipeline
elastic_net_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('elasticnet', ElasticNet(alpha=1.0, l1_ratio=0.5))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Elastic Net Regression model
elastic_net_pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = elastic_net_pipeline.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f"Root Mean Squared Error (RMSE): {rmse}")

# Save the trained model to a file (optional)
import joblib
joblib.dump(elastic_net_pipeline, 'elastic_net_regression_model.pkl')


Root Mean Squared Error (RMSE): 3806081.033486707


['elastic_net_regression_model.pkl']

**Polynomial Regression**

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

# Load the dataset
file_path = '/content/loan_approval_dataset.csv'  # Replace with your file path
loan_data = pd.read_csv(file_path)

# Remove leading spaces from column names
loan_data.columns = loan_data.columns.str.strip()

# Define the target variable and features
target = 'loan_amount'
features = loan_data.drop(columns=[target, 'loan_id', 'loan_status'])

# Separate the target variable
X = features
y = loan_data[target]

# Define the preprocessing steps for numeric and categorical features
numeric_features = features.select_dtypes(include=['int64']).columns.tolist()
categorical_features = features.select_dtypes(include=['object']).columns.tolist()

# Create preprocessing pipelines for both numeric and categorical data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine the numeric and categorical transformers into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create an Elastic Net Regression model pipeline
elastic_net_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('elasticnet', ElasticNet(alpha=1.0, l1_ratio=0.5))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Elastic Net Regression model
elastic_net_pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = elastic_net_pipeline.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f"Root Mean Squared Error (RMSE): {rmse}")

# Save the trained model to a file (optional)
import joblib
joblib.dump(elastic_net_pipeline, 'elastic_net_regression_model.pkl')


Root Mean Squared Error (RMSE): 3806081.033486707


['elastic_net_regression_model.pkl']