In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score

In [34]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [35]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object

In [36]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          39098 non-null  int64  
 1   person_age                  39098 non-null  int64  
 2   person_income               39098 non-null  int64  
 3   person_home_ownership       39098 non-null  object 
 4   person_emp_length           39098 non-null  float64
 5   loan_intent                 39098 non-null  object 
 6   loan_grade                  39098 non-null  object 
 7   loan_amnt                   39098 non-null  int64  
 8   loan_int_rate               39098 non-null  float64
 9   loan_percent_income         39098 non-null  float64
 10  cb_person_default_on_file   39098 non-null  object 
 11  cb_person_cred_hist_length  39098 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.6+ MB


In [37]:
train_data.describe()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0
mean,29322.0,27.550857,64046.17,4.701015,9217.556518,10.677874,0.159238,5.813556,0.142382
std,16929.497605,6.033216,37931.11,3.959784,5563.807384,3.034697,0.091692,4.029196,0.349445
min,0.0,20.0,4200.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,14661.0,23.0,42000.0,2.0,5000.0,7.88,0.09,3.0,0.0
50%,29322.0,26.0,58000.0,4.0,8000.0,10.75,0.14,4.0,0.0
75%,43983.0,30.0,75600.0,7.0,12000.0,12.99,0.21,8.0,0.0
max,58644.0,123.0,1900000.0,123.0,35000.0,23.22,0.83,30.0,1.0


In [38]:
test_data.describe()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
count,39098.0,39098.0,39098.0,39098.0,39098.0,39098.0,39098.0,39098.0
mean,78193.5,27.566781,64060.46,4.687068,9251.466188,10.661216,0.159573,5.830707
std,11286.764749,6.032761,37955.83,3.868395,5576.25468,3.02022,0.091633,4.072157
min,58645.0,20.0,4000.0,0.0,700.0,5.42,0.0,2.0
25%,68419.25,23.0,42000.0,2.0,5000.0,7.88,0.09,3.0
50%,78193.5,26.0,58000.0,4.0,8000.0,10.75,0.14,4.0
75%,87967.75,30.0,75885.0,7.0,12000.0,12.99,0.21,8.0
max,97742.0,94.0,1900000.0,42.0,35000.0,22.11,0.73,30.0


In [39]:
X = train_data.drop(['loan_status'], axis=1)
y = train_data['loan_status']

In [40]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [41]:
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [42]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [43]:
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

In [44]:
test_data_transformed = preprocessor.transform(test_data)


In [45]:
model = LogisticRegression(random_state=42)


In [46]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
y_pred_proba = model.predict_proba(X_val)[:, 1]

In [48]:
roc = roc_auc_score(y_val, y_pred_proba)
roc

0.9047398473717151

In [49]:
y_pred = model.predict(X_val)
classification_report(y_val, y_pred)

'              precision    recall  f1-score   support\n\n           0       0.93      0.98      0.95     10087\n           1       0.77      0.53      0.63      1642\n\n    accuracy                           0.91     11729\n   macro avg       0.85      0.75      0.79     11729\nweighted avg       0.91      0.91      0.91     11729\n'

In [50]:
accuracy = accuracy_score(y_val, y_pred)
accuracy

0.9122687356125841

In [51]:
test_pred_proba = model.predict_proba(test_data_transformed)[:, 1]

In [52]:
submission = pd.DataFrame({
    'id': test_data['id'],
    'loan_status': test_pred_proba
})


In [55]:
submission.to_csv('submission.csv', index=False)