**Run the following two cells before you begin.**

In [1]:
%autosave 10

Autosaving every 10 seconds


In [2]:
import pandas as pd
import numpy as np

______________________________________________________________________
**First, import your data set and define the sigmoid function.**
<details>
    <summary>Hint:</summary>
    The definition of the sigmoid is $f(x) = \frac{1}{1 + e^{-X}}$.
</details>

In [3]:
# Import the data set
df=pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,EDUCATION_CAT,graduate school,high school,others,university
0,798fc410-45c1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,1,university,0,0,0,1
1,8a8c8f3b-8eb4,120000,2,2,2,26,-1,2,0,0,...,1000,1000,0,2000,1,university,0,0,0,1
2,85698822-43f5,90000,2,2,2,34,0,0,0,0,...,1000,1000,1000,5000,0,university,0,0,0,1
3,0737c11b-be42,50000,2,2,1,37,0,0,0,0,...,1200,1100,1069,1000,0,university,0,0,0,1
4,3b7f77cc-dbc0,50000,1,2,1,57,-1,0,-1,0,...,10000,9000,689,679,0,university,0,0,0,1


In [4]:
# Define the sigmoid function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [5]:
X=df[['LIMIT_BAL','PAY_1']]
Y=df['default payment next month']

In [6]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)
Y=np.asarray(Y)

**Now, create a train/test split (80/20) with `PAY_1` and `LIMIT_BAL` as features and `default payment next month` as values. Use a random state of 24.**

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,random_state=24,test_size=0.2)
X_train.shape,X_test.shape,Y_train.shape,Y_test.shape


((21331, 2), (5333, 2), (21331,), (5333,))

______________________________________________________________________
**Next, import LogisticRegression, with the default options, but set the solver to `'liblinear'`.**

In [8]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(solver='liblinear')


______________________________________________________________________
**Now, train on the training data and obtain predicted classes, as well as class probabilities, using the testing data.**

In [9]:
# Fit the logistic regression model on training data
clf.fit(X_train,Y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Make predictions using `.predict()`
Y_pred=clf.predict(X_test)

In [11]:
# Find class probabilities using `.predict_proba()`
Y_pred_proba=clf.predict_proba(X_test)
Y_pred_proba

array([[0.49664996, 0.50335004],
       [0.62382022, 0.37617978],
       [0.89839926, 0.10160074],
       ...,
       [0.87235885, 0.12764115],
       [0.69315222, 0.30684778],
       [0.47092419, 0.52907581]])

______________________________________________________________________
**Then, pull out the coefficients and intercept from the trained model and manually calculate predicted probabilities. You'll need to add a column of 1s to your features, to multiply by the intercept.**

In [12]:
dim=X_test.shape[0]
ones=np.ones(shape=(dim,1),dtype=int)

In [13]:
X_test=np.append(X_test,ones,axis=1)
X_test.shape

(5333, 3)

In [14]:
# Get coefficients and intercepts from trained model
coef=clf.coef_
intercept=clf.intercept_
coef=np.append(coef, intercept)
coef

array([-0.22296102,  0.79784068, -1.42896919])

In [16]:
z=np.dot(coef,X_test.T)
len(z)

5333

In [20]:
prob=sigmoid(z)
prob

array([0.50335004, 0.37617978, 0.10160074, ..., 0.12764115, 0.30684778,
       0.52907581])

In [25]:
prob_list=[]

for ele in prob:
    prob_list.append([1-ele,ele])

prob_list


[[0.4966499631590444, 0.5033500368409556],
 [0.6238202157684003, 0.37617978423159976],
 [0.8983992577117355, 0.10160074228826456],
 [0.3423583086910388, 0.6576416913089612],
 [0.6238202157684003, 0.37617978423159976],
 [0.9593950633157357, 0.040604936684264376],
 [0.9593950633157357, 0.040604936684264376],
 [0.7857926432267254, 0.21420735677327465],
 [0.8902895233881734, 0.10971047661182659],
 [0.8053313742780309, 0.19466862572196908],
 [0.7799549199656111, 0.22004508003438888],
 [0.4453519574249626, 0.5546480425750374],
 [0.8132810516566749, 0.1867189483433251],
 [0.7617629855335126, 0.23823701446648743],
 [0.029737191549104813, 0.9702628084508952],
 [0.9551857232472316, 0.04481427675276836],
 [0.7857926432267254, 0.21420735677327465],
 [0.8779860383618575, 0.12201396163814249],
 [0.8259704800504091, 0.17402951994959096],
 [0.7740038588867586, 0.22599614111324143],
 [0.4623764115522294, 0.5376235884477706],
 [0.9233434717342516, 0.07665652826574842],
 [0.6931522159112147, 0.3068477840

In [28]:
prob_list=np.asarray(prob_list)
type(prob_list)


numpy.ndarray

______________________________________________________________________
**Next, using a threshold of `0.5`, manually calculate predicted classes. Compare this to the class predictions output by scikit-learn.**

In [34]:
# Manually calculate predicted classes
y_pred=[]
for ele in prob_list[:,1]:
    if(ele>0.5):
        y_pred.append(1)
    else:
        y_pred.append(0)
        
type(y_pred)

[1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,


In [35]:
# Compare to scikit-learn's predicted classes
# Find class probabilities using `.predict_proba()`
Y_pred_proba

array([[0.49664996, 0.50335004],
       [0.62382022, 0.37617978],
       [0.89839926, 0.10160074],
       ...,
       [0.87235885, 0.12764115],
       [0.69315222, 0.30684778],
       [0.47092419, 0.52907581]])

______________________________________________________________________
**Finally, calculate ROC AUC using both scikit-learn's predicted probabilities, and your manually predicted probabilities, and compare.**

In [38]:
# Use scikit-learn's predicted probabilities to calculate ROC AUC
from sklearn.metrics import roc_curve,roc_auc_score
ruc_score=roc_auc_score(Y_test,Y_pred_proba[:,1])
ruc_score

0.7014549516224959

In [39]:
# Use manually calculated predicted probabilities to calculate ROC AUC
from sklearn.metrics import roc_curve,roc_auc_score
ruc_score=roc_auc_score(Y_test,prob_list[:,1])
ruc_score

0.7014549516224959