In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
#df = pd.read_csv('IOE373FinalQuery1.csv')
df = pd.read_csv('IOE373FinalQuery1.csv', encoding='unicode_escape')
df.head()

Unnamed: 0,ï»¿CustomerID,Age,ZIPCode,Family,Experience,Income,CCAvg,Mortgage,EducProf,EducGrad,EducUnder,Securities,CD,Online,CreditCard,PersonalLoan
0,1,25,91107,4,1,49,1.6,0,0,0,1,1,0,0,0,0
1,2,45,90089,3,19,34,1.5,0,0,0,1,1,0,0,0,0
2,3,39,94720,1,15,11,1.0,0,0,0,1,0,0,0,0,0
3,4,35,94112,1,9,100,2.7,0,0,1,0,0,0,0,0,0
4,5,35,91330,4,8,45,1.0,0,0,1,0,0,0,0,1,0


In [9]:
X = df.drop('PersonalLoan', axis=1)
y = df['PersonalLoan']

# Split data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Implement logistic regression
logmodel = LogisticRegression(max_iter=5000)
logmodel.fit(X_train, y_train)
# Predicting the test set results
y_pred = logmodel.predict(X_test)

# Evaluating model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))



# Calculating Odds
coef_lg = logmodel.coef_
odds = np.exp(coef_lg)
cols = X_train.columns
feature_importance = pd.DataFrame(odds, columns = cols).T.sort_values(by = 0, ascending = False)
print("Odds:")
print(feature_importance)

Confusion Matrix:
[[2200   44]
 [ 135  121]]
Accuracy: 0.9284
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      2244
           1       0.73      0.47      0.57       256

    accuracy                           0.93      2500
   macro avg       0.84      0.73      0.77      2500
weighted avg       0.92      0.93      0.92      2500

Odds:
                      0
Family         2.293521
CCAvg          1.461246
EducProf       1.322243
EducGrad       1.303714
CD             1.278414
Securities     1.047099
Income         1.035658
CreditCard     1.017986
Experience     1.013019
Online         1.005519
Mortgage       1.001034
ï»¿CustomerID  0.999942
ZIPCode        0.999901
Age            0.999573
EducUnder      0.576542


In [4]:
X_train_sm = sm.add_constant(X_train)

# Fit a logistic regression model
logit_model = sm.Logit(y_train, X_train_sm)
result = logit_model.fit()

# Extract the coefficients and the p-values and put them in a DataFrame
summary = result.summary2().tables[1]
summary['Odds ratio'] = np.exp(summary['Coef.'])

# Print the DataFrame
print(summary)

# Get the significant variables
significant_vars = summary[summary['P>|z|'] < 0.05]
print("\nSignificant predictors at 5% level:")
print(significant_vars)


Optimization terminated successfully.
         Current function value: 0.121149
         Iterations 9
                  Coef.      Std.Err.             z         P>|z|  \
const         -3.430682  9.515432e+06 -3.605388e-07  9.999997e-01   
ï»¿CustomerID -0.000078  7.543735e-05 -1.038496e+00  2.990393e-01   
Age           -0.046507  9.631853e-02 -4.828436e-01  6.292068e-01   
ZIPCode       -0.000048  3.573535e-05 -1.335356e+00  1.817597e-01   
Family         0.687860  1.082337e-01  6.355315e+00  2.079994e-10   
Experience     0.061609  9.541154e-02  6.457185e-01  5.184617e-01   
Income         0.054693  3.949808e-03  1.384709e+01  1.324784e-43   
CCAvg          0.216446  6.390914e-02  3.386784e+00  7.071695e-04   
Mortgage       0.000854  8.061283e-04  1.059745e+00  2.892606e-01   
EducProf       0.036987  9.515432e+06  3.887024e-09  1.000000e+00   
EducGrad       0.017921  9.515432e+06  1.883394e-09  1.000000e+00   
EducUnder     -3.485590  9.515432e+06 -3.663091e-07  9.999997e-01   
S

In [5]:
summary_simplified = summary[['Coef.', 'P>|z|']]

# Add a column for whether or not the variable is significant at the 5% level
summary_simplified['Significant_5%'] = summary_simplified['P>|z|'] < 0.05

# Print the simplified DataFrame
print(summary_simplified)

                  Coef.         P>|z|  Significant_5%
const         -3.430682  9.999997e-01           False
ï»¿CustomerID -0.000078  2.990393e-01           False
Age           -0.046507  6.292068e-01           False
ZIPCode       -0.000048  1.817597e-01           False
Family         0.687860  2.079994e-10            True
Experience     0.061609  5.184617e-01           False
Income         0.054693  1.324784e-43            True
CCAvg          0.216446  7.071695e-04            True
Mortgage       0.000854  2.892606e-01           False
EducProf       0.036987  1.000000e+00           False
EducGrad       0.017921  1.000000e+00           False
EducUnder     -3.485590  9.999997e-01           False
Securities    -0.964108  2.052062e-02            True
CD             3.962628  4.249855e-17            True
Online        -0.741857  1.691752e-03            True
CreditCard    -1.016320  6.855499e-04            True


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summary_simplified['Significant_5%'] = summary_simplified['P>|z|'] < 0.05
