In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

df = pd.read_csv('fraud_dataset.csv')
df.head()

Unnamed: 0,transaction_id,duration,day,fraud
0,28891,21.3026,weekend,False
1,61629,22.932765,weekend,False
2,53707,32.694992,weekday,False
3,47812,32.784252,weekend,False
4,43455,17.756828,weekend,False


In [2]:
df[['no_fraud', 'fraud']] = pd.get_dummies(df['fraud'])
df.head()

Unnamed: 0,transaction_id,duration,day,fraud,no_fraud
0,28891,21.3026,weekend,0,1
1,61629,22.932765,weekend,0,1
2,53707,32.694992,weekday,0,1
3,47812,32.784252,weekend,0,1
4,43455,17.756828,weekend,0,1


In [3]:
df = df.drop('no_fraud', axis=1)

In [4]:
df['intercept'] = 1
logit_mod = sm.Logit(df['fraud'], df[['intercept', 'duration']])
results = logit_mod.fit()
results.summary()

Optimization terminated successfully.
         Current function value: inf
         Iterations 16


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


0,1,2,3
Dep. Variable:,fraud,No. Observations:,8793.0
Model:,Logit,Df Residuals:,8791.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 24 Jan 2022",Pseudo R-squ.:,inf
Time:,12:04:05,Log-Likelihood:,-inf
converged:,True,LL-Null:,0.0
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,10.3827,1.756,5.912,0.000,6.940,13.825
duration,-1.3404,0.237,-5.649,0.000,-1.805,-0.875


In [5]:
# proportion of fraudulent transactions

df.fraud.mean()

0.012168770612987604

In [9]:
# proportion of weekday transactions

df.day.value_counts() / len(df)

weekend    0.654725
weekday    0.345275
Name: day, dtype: float64

In [11]:
# avg duration for transactions

df.groupby(['fraud']).mean()  

Unnamed: 0_level_0,transaction_id,duration,intercept
fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,55262.229219,30.013583,1.0
1,53713.616822,4.624247,1.0


In [16]:
# fit a logistic regression model to predict if a transaction is fraudulent using both day and duration

df[['weekday', 'weekend']] = pd.get_dummies(df['day'])  
df = df.drop('weekend', axis=1)

log_mod = sm.Logit(df['fraud'], df[['intercept', 'weekday', 'duration']])
results = log_mod.fit()
results.summary()

Optimization terminated successfully.
         Current function value: inf
         Iterations 16


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


0,1,2,3
Dep. Variable:,fraud,No. Observations:,8793.0
Model:,Logit,Df Residuals:,8790.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 24 Jan 2022",Pseudo R-squ.:,inf
Time:,12:20:02,Log-Likelihood:,-inf
converged:,True,LL-Null:,0.0
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,9.8709,1.944,5.078,0.000,6.061,13.681
weekday,2.5465,0.904,2.816,0.005,0.774,4.319
duration,-1.4637,0.290,-5.039,0.000,-2.033,-0.894


In [17]:
# to interpret results, need to exponentiate the intercepts

np.exp(-1.4637), np.exp(2.5465)

(0.2313785882117941, 12.762357271496972)

These values represent the multiplicative change in the odds.\
On weekdays, fraud is 12.76x as likely on weekdays than weekends holding all else constant\
For each 1 unit increase in duration, fraud is 0.23x as likely holding all else constant

In [18]:
# with returned values less than 1, it is often beneficial to obtain teh reciprocal

1/np.exp(-1.4637)

4.321921089278333

For every 1 unit decrease in duration, fraud is 4.32x as likely holding all else constant 

#### Model Diagnostics - Confusion Matrix

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv('admissions.csv')
df.head()

Unnamed: 0,admit,gre,gpa,prestige
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.0,1
3,1,640,3.19,4
4,0,520,2.93,4


In [4]:
y = df['admit']

# create dummy vars for prestige
df[['level1', 'level2', 'level3', 'level4']] = pd.get_dummies(df['prestige'])

X = df[['gre', 'gpa', 'level1', 'level2', 'level3']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [6]:
# perform on training set

log_mod = LogisticRegression()
log_mod.fit(X_train, y_train)
y_preds = log_mod.predict(X_test)

# see how well model is doing
print(precision_score(y_test, y_preds))
print(recall_score(y_test, y_preds))
print(accuracy_score(y_test, y_preds))
confusion_matrix(y_test, y_preds)

0.3333333333333333
0.0625
0.575


array([[22,  2],
       [15,  1]])

Precision score - 33%, recall score - 6.25%, accuracy score - 57.5%

Confusion matrix:
- 22 not admitted predicted to be not admitted
- 2 not admitted predicted to be admitted
- 15 admitted predicted to be not admitted
- 1 admitted predicted to be admitted