# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay

# Reading Data into dataframe

In [None]:
# Load the dataset
# NOTE(review): the path below is absolute and machine-specific; adjust it when running elsewhere.
csv_path = '/Users/arfatshaikh/Documents/Machine-Learning/Logistic Regression/dataset/loantap.csv'
data = pd.read_csv(csv_path)

In [None]:
# Show every column when a dataframe is displayed (no column truncation)
pd.options.display.max_columns = None

In [None]:
# Preview the first rows of the dataframe
data.head()

In [None]:
# Shape of the data
data.shape

In [None]:
# Column summary printed by DataFrame.info()
data.info()

In [None]:
# Count the null values in each column
data.isna().sum()

In [None]:
# Print the distinct values of every object-typed (categorical) column
for column_name in data.columns:
    if data[column_name].dtype != 'object':
        continue
    print(column_name)
    print(data[column_name].unique())
    print("--------"*20)

# Data Cleansing

In [None]:
# Frequency of each home_ownership category
data['home_ownership'].value_counts()

In [None]:
# Fold the 'ANY' and 'NONE' categories into 'OTHER' because they are not significant
is_rare_ownership = data['home_ownership'].isin(['ANY', 'NONE'])
data['home_ownership'] = np.where(is_rare_ownership, "OTHER", data['home_ownership'])

In [None]:
# Convert the loan term from text (' 36 months' / ' 60 months') to the number of months.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form operates on an intermediate object, which pandas
# warns about and which does not update `data` when Copy-on-Write is enabled.
data['term'] = data['term'].replace({' 36 months': 36, ' 60 months': 60})

Unique patterns in address field
[
    '0174 Michelle Gateway\r\nMendozaberg, OK 22690',
    'USCGC Roth\r\nFPO AA 70466',
    'Unit 8386 Box 5821\r\nDPO AE 05113',
    'USNV Trujillo\r\nFPO AA 30723',
    'PSC 5108, Box 2953\r\nAPO AP 05113',
    'USS Ramirez\r\nFPO AP 29597',
    'USNS Roberts\r\nFPO AA 11650'
]

In [None]:
# Address split function
def split_address(addr):
    """Split a street-style address string into its parts.

    The expected layout is: digits, street text, a CRLF line break, city text,
    a comma, an upper-case state code and a numeric zip code.

    Returns a pandas Series keyed by house_no, first_add, second_add, state
    and zipcode. All five values are None when the layout does not match.
    """
    address_fields = ['house_no', 'first_add', 'second_add', 'state', 'zipcode']
    parsed = re.match(r'(?P<house_no>\d+)\s+(?P<first_add>.*?)\r\n(?P<second_add>.*?),\s+(?P<state>[A-Z]+)\s+(?P<zipcode>\d+)', addr)
    if parsed is None:
        return pd.Series([None]*5, index=address_fields)
    return pd.Series(parsed.groupdict())

In [None]:
# Cast the address column to str before the regex helpers are applied to it
data['address'] = data['address'].astype(str)

In [None]:
print(data['address'].head())  # Check the first few rows
print(data['address'].dtype)  # Check the data type
print(data['address'].isnull().sum())  # Check for null values

In [None]:
# Parse every address with split_address; rows that do not match get None in all five fields
address_split = data['address'].apply(split_address)

In [None]:
# Append the parsed address columns to the dataframe
data = pd.concat([data, address_split], axis=1)

In [None]:
data.head()

In [None]:
data.loc[data['emp_title'].isna(), ['emp_title']] = 'Unknown'

In [None]:
data.loc[data['title'].isna(), ['title']] = 'Unknown'

In [None]:
data.loc[data['emp_length'].isna(), ['emp_length']] = 'Unknown'

USCGC Nunez\r\nFPO AE 30723,
Unit 8386 Box 5821\r\nDPO AE 05113,
USNV Trujillo\r\nFPO AA 30723,
PSC 5108, Box 2953\r\nAPO AP 05113,
USS Goodman\r\nFPO AE 22690,
USNS Roberts\r\nFPO AA 11650

In [None]:
def pattern2(addr):
    """Parse an address whose first line ends in 'Box <digits>'.

    Example inputs are the 'Unit 8386 Box 5821' and 'PSC 5108, Box 2953'
    styles, followed by a CRLF line break, a three-letter code, a two-letter
    state code and a five-digit zip code.

    Returns a pandas Series keyed by house_no, first_add, second_add, state
    and zipcode. All five values are None when the layout does not match.
    """
    box_address_regex = (
        r'(?P<house_no>[A-Za-z0-9,]+(?:\s+[A-Za-z0-9,]+)*)'  # everything before the Box part
        r'\s+(?P<first_add>Box\s+\d+)'                       # Box number (e.g., Box 2953)
        r'\r\n(?P<second_add>[A-Za-z]{3})'                   # three-letter code (e.g., APO)
        r'\s+(?P<state>[A-Z]{2})'                            # two-letter state code (e.g., AP)
        r'\s+(?P<zipcode>\d{5})'                             # 5-digit zip code
    )

    found = re.match(box_address_regex, addr)
    if found is None:
        return pd.Series([None]*5, index=['house_no', 'first_add','second_add', 'state', 'zipcode'])

    fields = found.groupdict()
    # Normalise a falsy first_add to the empty string
    fields['first_add'] = fields['first_add'] or ''
    return pd.Series(fields)

In [None]:
def pattern3(addr):
    """Parse an address of the form '<word> <rest of line>' plus a second line.

    Example input is the 'USNV Trujillo' style, followed by a CRLF line break,
    a three-letter code, a two-letter state code and a five-digit zip code.

    Returns a pandas Series keyed by house_no, first_add, second_add, state
    and zipcode. All five values are None when the layout does not match.
    """
    ship_address_regex = (
        r'(?P<house_no>[A-Za-z0-9]+)'                    # first word only (e.g., USNV)
        r'\s+(?P<first_add>[^\r\n]+)'                    # rest of the first line (e.g., Trujillo)
        r'\r\n(?P<second_add>[A-Za-z]{3})'               # three-letter code (FPO/DPO/APO)
        r'\s+(?P<state>[A-Z]{2})\s+(?P<zipcode>\d{5})'   # state and 5-digit zip
    )

    found = re.match(ship_address_regex, addr)
    if found is None:
        return pd.Series([None]*5, index=['house_no', 'first_add','second_add', 'state', 'zipcode'])

    fields = found.groupdict()
    # Normalise a falsy first_add to the empty string
    fields['first_add'] = fields['first_add'] or ''
    return pd.Series(fields)

In [None]:
# Preview what pattern2 extracts for addresses whose first word is 'Unit' or 'PSC'
data.loc[data['address'].str.split(' ').str[0].isin(['Unit','PSC']),'address'].apply(pattern2)

In [None]:
# Parse the addresses whose first word is 'Unit' or 'PSC' with pattern2
unit_psc_mask = data['address'].str.split(' ').str[0].isin(['Unit','PSC'])
uscgc_rows = data.loc[unit_psc_mask]
updated_values = uscgc_rows['address'].apply(pattern2)

# For those rows, fill only the address parts that are still missing
for col in ['house_no', 'first_add', 'second_add', 'state', 'zipcode']:
    still_missing_filled = data.loc[unit_psc_mask, col].fillna(updated_values[col])
    data.loc[unit_psc_mask, col] = still_missing_filled

In [None]:
# Preview what pattern3 extracts for addresses whose first word is 'USNV', 'USS', 'USCGC' or 'USNS'
data.loc[data['address'].str.split(' ').str[0].isin(['USNV','USS','USCGC','USNS']),'address'].apply(pattern3)

In [None]:
# Parse the addresses whose first word is 'USNV', 'USS', 'USCGC' or 'USNS' with pattern3
ship_mask = data['address'].str.split(' ').str[0].isin(['USNV','USS','USCGC','USNS'])
uscgc_rows = data.loc[ship_mask]
updated_values = uscgc_rows['address'].apply(pattern3)

# For those rows, fill only the address parts that are still missing
for col in ['house_no', 'first_add', 'second_add', 'state', 'zipcode']:
    still_missing_filled = data.loc[ship_mask, col].fillna(updated_values[col])
    data.loc[ship_mask, col] = still_missing_filled

In [None]:
data.loc[data['revol_util'].isna(), ['revol_util']] = 0

In [None]:
data.loc[data['mort_acc'].isna(), ['mort_acc']] = 0

In [None]:
data.loc[data['pub_rec_bankruptcies'].isna(), ['pub_rec_bankruptcies']] = 0

In [None]:
data.isna().sum()  

In [None]:
data.drop('address', axis=1, inplace=True)

In [None]:
data['pub_rec'].unique()

In [None]:
def _nonzero_flag(number):
    """Return 0 when ``number`` equals 0.0, otherwise 1."""
    return 0 if number == 0.0 else 1


def pub_rec(number):
    """Binary flag for a pub_rec value: 0 for zero, 1 for anything else."""
    return _nonzero_flag(number)


def mort_acc(number):
    """Binary flag for a mort_acc value: 0 for zero, 1 for anything else."""
    return _nonzero_flag(number)


def pub_rec_bankruptcies(number):
    """Binary flag for a pub_rec_bankruptcies value: 0 for zero, 1 for anything else."""
    return _nonzero_flag(number)

In [None]:
# Collapse the three count columns into 0/1 flags using their matching helper
flag_encoders = {
    'pub_rec': pub_rec,
    'mort_acc': mort_acc,
    'pub_rec_bankruptcies': pub_rec_bankruptcies,
}
for flag_column, to_flag in flag_encoders.items():
    data[flag_column] = data[flag_column].apply(to_flag)

In [None]:
data['title'] = data.title.str.lower()

In [None]:
data['initial_list_status'] = data['initial_list_status'].map({'f': 0, 'w': 1})

# Outlier treatment

In [None]:
def detect_outliers_iqr(df, column):
    """
    Detects outliers using the IQR method for a given column.

    A value is an outlier when it lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR. A one-line summary is printed for the column.

    Parameters:
    - df: pandas DataFrame
    - column: name of the numeric column to inspect

    Returns:
    - mask of outliers (boolean Series)
    - percentage of outliers among the non-null values of that column
      (0.0 when the column has no non-null values)
    """
    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Boolean mask for outliers (NaN compares False on both sides, so it is never flagged)
    outliers = (values < lower_bound) | (values > upper_bound)
    outlier_count = outliers.sum()
    total_count = values.notna().sum()

    # Guard against an empty or all-NaN column: 0 / 0 would otherwise produce NaN
    # (and a RuntimeWarning) instead of a usable percentage.
    if total_count == 0:
        outlier_percentage = 0.0
    else:
        outlier_percentage = (outlier_count / total_count) * 100

    print(f"Outliers in '{column}': {outlier_count} out of {total_count} ({outlier_percentage:.2f}%)")

    return outliers, outlier_percentage


In [None]:
# Print the IQR outlier summary for every int64 / float64 column
for column in data.columns:
    if data[column].dtype in ('int64', 'float64'):
        outliers, percentage = detect_outliers_iqr(data, column)

In [None]:
#for col in data.columns:
#    if data[col].dtype == 'int64' or data[col].dtype == 'float64':
#        mean = data[col].mean()
#        std = data[col].std()
#        upper_limit = mean+3*std
#        lower_limit = mean-3*std
#        data = data[(data[col]<upper_limit) & (data[col]>lower_limit)]
#data.shape

In [None]:
data[['loan_amnt','installment','annual_inc','revol_bal','revol_util','dti']].describe()

In [None]:
# Cap each column at a fixed upper limit; values above the cap are replaced by the cap.
# NOTE(review): the caps are hard-coded — presumably chosen near the 99th percentile; confirm.
upper_caps = {
    'loan_amnt': 35000,
    'installment': 1000,
    'annual_inc': 150000.0,
    'revol_bal': 40000,
    'revol_util': 100,
    'dti': 36,
}
for capped_column, cap in upper_caps.items():
    data[capped_column] = np.where(data[capped_column] > cap, cap, data[capped_column])

In [None]:
# Re-run the IQR outlier summary on the columns that were just capped
for column in ['loan_amnt','installment','annual_inc','revol_bal','revol_util','dti']:
    if data[column].dtype in ('int64', 'float64'):
        outliers, percentage = detect_outliers_iqr(data, column)

# Feature Engineering

In [None]:
data.drop(columns=['issue_d','earliest_cr_line'],inplace=True)

In [None]:
data.head()

In [None]:
data['loan_status'] = data['loan_status'].map({'Fully Paid': 0, 'Charged Off': 1})

In [None]:
data['loan_status'].value_counts()

In [None]:
data.select_dtypes(include=['object']).nunique()

In [None]:
label_encoding_columns = ['grade','sub_grade','emp_length','title','house_no','first_add','second_add','state','emp_title']
one_hot_encoding_columns = ['home_ownership','verification_status','purpose','zipcode','application_type']

In [None]:
for i in label_encoding_columns:
    if data[i].dtype == 'object' or data[i].dtype == 'datetime64[ns]':
        print(i)

In [None]:
# Integer-code every object / datetime column from label_encoding_columns.
# The same encoder instance is re-fitted for each column.
le = LabelEncoder()
for column in label_encoding_columns:
    if data[column].dtype not in ('object', 'datetime64[ns]'):
        continue
    data[column] = le.fit_transform(data[column])
    print(column,'Label Encoding Completed')

In [None]:
# One-hot encode the listed columns; drop_first=True omits the first level of each
data = pd.get_dummies(data, columns=one_hot_encoding_columns, drop_first=True)

In [None]:
data.head()

In [None]:
# Count plot of the target classes
sns.countplot(x='loan_status', data=data)

In [None]:
# Separate the features (X) from the target (Y)
X = data.drop(columns=['loan_status'])
Y = data['loan_status']

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Scaler used to rescale the features
Scaler = MinMaxScaler()

In [None]:
# NOTE(review): the scaler is fitted on the full feature matrix, before the train/test
# split performed later in this notebook, so rows that end up in the test set influence
# the scaling parameters. Consider fitting on the training split only.
X_scaled = Scaler.fit_transform(X)

In [None]:
X_scaled.shape

# Oversampling Minority Class

In [None]:
# SMOTE oversampler with a fixed seed
sm = SMOTE(random_state=42)

In [None]:
# NOTE(review): resampling is applied to the whole dataset, before the train/test split
# performed later in this notebook. Synthetic rows can therefore land in the test set and
# be derived from rows in the other split, which inflates the reported test metrics.
# Consider splitting first and resampling only the training portion.
X_res, Y_res = sm.fit_resample(X_scaled, Y)
print(X_res.shape)
print(Y_res.shape)
# Class counts and proportions after resampling
print(Y_res.value_counts())
print(Y_res.value_counts(normalize=True))

# Feature Reduction

In [None]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

def calculate_vif(df, features):
    """
    Calculate the Variance Inflation Factor (VIF) for a set of features in a DataFrame.

    Parameters:
    - df: pandas DataFrame
    - features: list (or pandas Index) of column names (strings)

    Returns:
    - DataFrame with columns "feature" and "VIF", one row per requested feature
      (the helper intercept row is excluded)
    """
    # Cast to float so boolean / integer columns are handled as plain numeric data.
    X = df[list(features)].astype(float)
    # has_constant='add' forces an intercept column even when one of the features is
    # itself constant. With the default ('skip') no intercept would be added in that
    # case, and removing the "first" row afterwards would discard a real feature.
    X = add_constant(X, has_constant='add')
    design = X.values
    vif_data = pd.DataFrame({
        "feature": X.columns,
        "VIF": [variance_inflation_factor(design, i) for i in range(design.shape[1])],
    })

    # Remove the intercept row by name rather than by position.
    return vif_data[vif_data["feature"] != "const"]


In [None]:
# Convert X_res and Y_res to pandas DataFrames; the feature frame reuses the column names of X
X_res_df = pd.DataFrame(X_res, columns=X.columns)
Y_res_df = pd.DataFrame(Y_res, columns=['loan_status'])

# Combine features and target side by side into a single DataFrame
resampled_data = pd.concat([X_res_df, Y_res_df], axis=1)

In [None]:
# Rank the features by VIF, highest first. The target column is excluded: VIF measures
# collinearity among the predictors, and including 'loan_status' would distort the scores
# of every feature that is correlated with it.
vif_feature_columns = resampled_data.columns.drop('loan_status')
calculate_vif(resampled_data, vif_feature_columns).sort_values(by='VIF', ascending=False)

In [None]:
# Remove the selected features from the resampled frame.
# NOTE(review): presumably these were picked from the VIF ranking — confirm the selection.
columns_to_remove = [
    'loan_amnt', 'sub_grade', 'installment', 'grade', 'int_rate', 'term',
    'purpose_debt_consolidation', 'purpose_credit_card',
    'purpose_home_improvement', 'purpose_other',
]
resampled_data.drop(columns=columns_to_remove, inplace=True)

# Training Model

In [None]:
# Hold out 20% of the resampled rows for testing (fixed seed for reproducibility).
# No model is created here: the estimator is instantiated with its actual settings
# in the cell that fits it, so a default LogisticRegression() would only be discarded.
X_train, X_test, y_train, y_test = train_test_split(
    resampled_data.drop(columns='loan_status'),
    resampled_data.loan_status,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a logistic regression with default regularization settings
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

In [None]:
# Test-set class predictions and class-1 probabilities
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

In [None]:
# Train-set class predictions and class-1 probabilities
y_pred_train = log_reg.predict(X_train)
y_pred_proba_train = log_reg.predict_proba(X_train)[:, 1]

In [None]:
# Compare train and test accuracy
print("Train Accuracy: ", accuracy_score(y_train, y_pred_train))
print("Test Accuracy: ", accuracy_score(y_test, y_pred))

In [None]:
confusion_matrix_train = confusion_matrix(y_train, y_pred_train)
confusion_matrix_test = confusion_matrix(y_test, y_pred)
print("Train Confusion Matrix:\n", confusion_matrix_train)
print("Test Confusion Matrix:\n", confusion_matrix_test)

In [None]:
TN = confusion_matrix(y_train, y_pred_train)[0][0]
TP = confusion_matrix(y_train, y_pred_train)[1][1]
FN = confusion_matrix(y_train, y_pred_train)[1][0]
FP = confusion_matrix(y_train, y_pred_train)[0][1]

In [None]:
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
print("Precision:", Precision)
print("Recall:", Recall)

In [None]:
classification_report_test = classification_report(y_test, y_pred)
print("Test Classification Report:\n", classification_report_test)

In [None]:
# ROC AUC is computed from the predicted class-1 probabilities, not from the hard 0/1
# predictions: scoring the labels collapses the curve to a single operating point and
# makes the reported area disagree with the curve plotted below.
logit_roc_auc = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line for a no-skill classifier
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save the figure before show() so the saved image is not blank
plt.savefig('Log_ROC')
plt.show()

# Training Regularized Model

In [None]:
# Second logistic regression with an explicit L2 penalty and C=100.
# NOTE(review): in scikit-learn, C is the inverse of the regularization strength, so C=100
# penalises the coefficients less than the default C=1.0 — confirm this is what the
# "regularized" model is meant to do.
log_reg_regularized = LogisticRegression(max_iter=10000, random_state=42, C=100, penalty='l2')
log_reg_regularized.fit(X_train, y_train)

In [None]:
# Compare train and test accuracy of the second model
print("Train Accuracy: ", accuracy_score(y_train, log_reg_regularized.predict(X_train)))
print("Test Accuracy: ", accuracy_score(y_test, log_reg_regularized.predict(X_test)))

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold stratified cross-validation (shuffled, fixed seed) of the second model on the training split.
# NOTE(review): X_train comes from the resampled data, so each validation fold also contains
# oversampled rows — the scores may be optimistic; verify against an untouched hold-out set.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(log_reg_regularized, X_train, y_train, cv=kfold, scoring='accuracy')
print("Cross-validation scores: ", cross_val_scores)
print("Mean cross-validation score: ", cross_val_scores.mean())
print("Standard Deviation of cross-validation scores: ", cross_val_scores.std())