In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, roc_curve, auc

### Example 1 - Fit a logistic regression model

In [2]:
df = pd.read_csv('../../Data/fraud_dataset.csv')
df.head()

Unnamed: 0,transaction_id,duration,day,fraud
0,28891,21.3026,weekend,False
1,61629,22.932765,weekend,False
2,53707,32.694992,weekday,False
3,47812,32.784252,weekend,False
4,43455,17.756828,weekend,False


In [3]:
df.describe()

Unnamed: 0,transaction_id,duration
count,8793.0,8793.0
mean,55243.38451,29.704626
std,21792.120147,7.464452
min,17301.0,0.215113
25%,36454.0,25.211787
50%,55420.0,29.92316
75%,74131.0,34.532567
max,92828.0,60.412763


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8793 entries, 0 to 8792
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   transaction_id  8793 non-null   int64  
 1   duration        8793 non-null   float64
 2   day             8793 non-null   object 
 3   fraud           8793 non-null   bool   
dtypes: bool(1), float64(1), int64(1), object(1)
memory usage: 214.8+ KB


In [5]:
# Create dummy variables for 'fraud'column
df['fraud'] = df['fraud'].astype(int)

# Create dummy variables for categorical column - 'weekday'
df['weekday'] = 1-pd.get_dummies(df['day'], drop_first=True)

df.head()

Unnamed: 0,transaction_id,duration,day,fraud,weekday
0,28891,21.3026,weekend,0,0
1,61629,22.932765,weekend,0,0
2,53707,32.694992,weekday,0,1
3,47812,32.784252,weekend,0,0
4,43455,17.756828,weekend,0,0


In [6]:
df.describe()

Unnamed: 0,transaction_id,duration,fraud,weekday
count,8793.0,8793.0,8793.0,8793.0
mean,55243.38451,29.704626,0.012169,0.345275
std,21792.120147,7.464452,0.109645,0.475485
min,17301.0,0.215113,0.0,0.0
25%,36454.0,25.211787,0.0,0.0
50%,55420.0,29.92316,0.0,0.0
75%,74131.0,34.532567,0.0,1.0
max,92828.0,60.412763,1.0,1.0


In [7]:
print('Average duration for fraudulent charges:', df['duration'].where(df['fraud']==1).mean())
print('Average duration for non-fraudulent charges:', df['duration'].where(df['fraud']==0).mean())

Average duration for fraudulent charges: 4.624247370615658
Average duration for non-fraudulent charges: 30.013583132522555


From the above we can see that
* The proportion of `fraud` charges is $0.012$.
* The average `duration` for fraudulent transaction is $4.62$.
* The proportion of `weekday` transactions is $0.345$.
* The average `duration` for non-fraudulent transactions is $30.01$.

In [8]:
# Create intercept column
df['intercept'] = 1

logit_mod = sm.Logit(df['fraud'], df[['intercept', 'duration', 'weekday']])
results = logit_mod.fit()
results.summary2()

Optimization terminated successfully.
         Current function value: 0.002411
         Iterations 16


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.963
Dependent Variable:,fraud,AIC:,48.4009
Date:,2024-01-16 01:10,BIC:,69.646
No. Observations:,8793,Log-Likelihood:,-21.2
Df Model:,2,LL-Null:,-578.1
Df Residuals:,8790,LLR p-value:,1.39e-242
Converged:,1.0000,Scale:,1.0
No. Iterations:,16.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
intercept,9.8709,1.9438,5.0783,0.0000,6.0613,13.6806
duration,-1.4637,0.2905,-5.0389,0.0000,-2.0331,-0.8944
weekday,2.5465,0.9043,2.8160,0.0049,0.7741,4.3188


##### Finding the multiplicative change in the odds

Remember,
* Categorical variables: When in category $x_1$, we expect a multiplicative change in the odds of a 1 by $e^{b_1}$ compared to the baseline.
* Quantitative variables: For every one unit increase in $x_1$, expect a multiplicative change in the odds of a $1$ by $e^{b_1}$.

In [9]:
np.exp(results.params)

intercept    19359.702805
duration         0.231370
weekday         12.761978
dtype: float64

Thus, we can make the following interpretations.
* Fraud is 12.76 times as likely on weekdays than weekends (holding all else constant).
* For each unit one unit increase in duration, fraud is 0.23 times as likely holding all else constant.

It is often also useful to compute the reciprocal.

In [10]:
1/np.exp(results.params)

intercept    0.000052
duration     4.322087
weekday      0.078358
dtype: float64

From this, we can see that for every one unit decrease in duration, fraud is $4.32$ times as likely holding all other variables constant.

### Example 2 - Interpreting Results of Logistics Regression

The admissions dataset contains four variables: `admit`, `gre`, `gpa`, and `prestige`:

* `admit` is a binary variable. It indicates whether or not a candidate was admitted into UCLA (admit = 1) our not (admit = 0).
* `gre` is the GRE score. GRE stands for Graduate Record Examination.
* `gpa` stands for Grade Point Average.
* `prestige` is the prestige of an applicant alta mater (the school attended before applying), with 1 being the highest (highest prestige) and 4 as the lowest (not prestigious).

In [11]:
df = pd.read_csv('../../Data/admissions.csv')
df.head()

Unnamed: 0,admit,gre,gpa,prestige
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [12]:
df.describe()

Unnamed: 0,admit,gre,gpa,prestige
count,397.0,397.0,397.0,397.0
mean,0.31738,587.858942,3.392242,2.488665
std,0.466044,115.717787,0.380208,0.947083
min,0.0,220.0,2.26,1.0
25%,0.0,520.0,3.13,2.0
50%,0.0,580.0,3.4,2.0
75%,1.0,660.0,3.67,3.0
max,1.0,800.0,4.0,4.0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   admit     397 non-null    int64  
 1   gre       397 non-null    int64  
 2   gpa       397 non-null    float64
 3   prestige  397 non-null    int64  
dtypes: float64(1), int64(3)
memory usage: 12.5 KB


There are a few different ways you might choose to work with the `prestige` column in this dataset.  For this dataset, we will want to allow for the change from prestige 1 to prestige 2 to allow a different acceptance rate than changing from prestige 3 to prestige 4.

In [14]:
df = df.join(pd.get_dummies(df['prestige'], prefix='prest'))
df = df.drop('prestige', axis=1)
df.head()

Unnamed: 0,admit,gre,gpa,prest_1,prest_2,prest_3,prest_4
0,0,380,3.61,0,0,1,0
1,1,660,3.67,0,0,1,0
2,1,800,4.0,1,0,0,0
3,1,640,3.19,0,0,0,1
4,0,520,2.93,0,0,0,1


In [15]:
df[['prest_1', 'prest_2', 'prest_3', 'prest_4']].sum()

prest_1     61
prest_2    148
prest_3    121
prest_4     67
dtype: int64

In [16]:
# Create intercept column
df['intercept'] = 1

# Create logistic regression model
logit_mod = sm.Logit(df['admit'], df[['intercept', 'gre', 'gpa', 'prest_2', 'prest_3', 'prest_4']])
results = logit_mod.fit()
results.summary2()

Optimization terminated successfully.
         Current function value: 0.573854
         Iterations 6


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.082
Dependent Variable:,admit,AIC:,467.6399
Date:,2024-01-16 01:10,BIC:,491.5435
No. Observations:,397,Log-Likelihood:,-227.82
Df Model:,5,LL-Null:,-248.08
Df Residuals:,391,LLR p-value:,1.1761e-07
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
intercept,-3.8769,1.1425,-3.3934,0.0007,-6.1161,-1.6376
gre,0.0022,0.0011,2.0280,0.0426,0.0001,0.0044
gpa,0.7793,0.3325,2.3438,0.0191,0.1276,1.4311
prest_2,-0.6801,0.3169,-2.1459,0.0319,-1.3013,-0.0589
prest_3,-1.3387,0.3449,-3.8819,0.0001,-2.0146,-0.6628
prest_4,-1.5534,0.4175,-3.7211,0.0002,-2.3716,-0.7352


From the above we see that all of the explanatory variables appear significant ($p \lt 0.05$).

In [17]:
# Compute odds ratios
np.exp(results.params)

intercept    0.020716
gre          1.002221
gpa          2.180027
prest_2      0.506548
prest_3      0.262192
prest_4      0.211525
dtype: float64

In [18]:
# Compute the reciprocal of the coefficients to compare the baseline prestige to the others
1/np.exp(results.params)

intercept    48.272116
gre           0.997784
gpa           0.458710
prest_2       1.974147
prest_3       3.813995
prest_4       4.727566
dtype: float64

From the above we see that
* If an individual attended the most prestigious alma mater, they are $4.73$ times more likely to be admitted than if they attended the least prestigious, holding all other variables constant.
* If an individual attended the most prestigious alma mater, they are $3.81$ times more likely to be admitted than if they attended the second lowest in prestigious-ness, holding all other variables constant.
* If an individual attended the most prestigious alma mater, they are $1.97$ times more likely to be admitted than if they attended the second most prestigious, holding all other variables constant.
* For every one point increase in gpa, an individual is $2.18$ times more likely to be admitted, holding all other variables constant.

### Example 3 - Model Diagnostics



In [19]:
df = pd.read_csv('../../Data/admissions.csv')
df.head()

Unnamed: 0,admit,gre,gpa,prestige
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [20]:
# Create dummy variables for 'prestige' column
df = df.join(pd.get_dummies(df['prestige'], prefix='prest'))
df = df.drop('prestige', axis=1)
df.head()

Unnamed: 0,admit,gre,gpa,prest_1,prest_2,prest_3,prest_4
0,0,380,3.61,0,0,1,0
1,1,660,3.67,0,0,1,0
2,1,800,4.0,1,0,0,0
3,1,640,3.19,0,0,0,1
4,0,520,2.93,0,0,0,1


In [21]:
X = df[['gre', 'gpa', 'prest_1', 'prest_2', 'prest_3']]
y = df['admit']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.10, random_state=42)

In [22]:
logit_mod = LogisticRegression()
logit_mod.fit(X_train, y_train)
y_preds = logit_mod.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
print('Precision Score:', precision_score(y_test, y_preds))
print('Recall Score:', recall_score(y_test, y_preds))
print('Accuracy Score:', accuracy_score(y_test, y_preds))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_preds))

tn, fp, fn, tp = confusion_matrix(y_test, y_preds).ravel()
print('True Negative (predicted no admission & true not admitted):', tn)
print('False Positive (predicted admitted & true not admitted):', fp)
print('False Negative (predicted not admitted & true admitted):', fn)
print('True Positive (predicted admitted & true admitted):', tp)

Precision Score: 0.3333333333333333
Recall Score: 0.0625
Accuracy Score: 0.575
Confusion Matrix:
 [[22  2]
 [15  1]]
True Negative (predicted no admission & true not admitted): 22
False Positive (predicted admitted & true not admitted): 2
False Negative (predicted not admitted & true admitted): 15
True Positive (predicted admitted & true admitted): 1


##### Questions

If we really care about correctly identifying the accepted students as accepted, which metric do we care about the most?  
Answer: Recall

If we only care about obtaining the most correctly identified cases, whether accepted or non-accepted, which metric do we care about the most?  
Answer: Accuracy

In [24]:
### Unless you install the ggplot library in the workspace, you will 
### get an error when running this code!
preds = log_mod.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, preds)

df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(df, aes(x='fpr', y='tpr')) + geom_line() + geom_abline(linetype='dashed')

NameError: name 'log_mod' is not defined