In [98]:
# %pip install scikit-learn   # PyPI distribution name is "scikit-learn"; "sklearn" is a deprecated alias

In [6]:
from sklearn.datasets import make_classification
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, brier_score_loss
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

## Create Dataset

We make several adjustments so that the synthetic dataset resembles real transaction data as closely as possible.
- `price` is the value of the laptop
- `num_past_orders` is the number of orders this person has made in the past with grandma fixes

In [31]:
# Synthetic two-feature binary classification problem; weights=[0.9] makes it
# imbalanced (roughly 90% of the samples fall in class 0).
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    random_state=42,
    weights=[0.9],
)

# Squash both features into [0, 1]. The same scaler object is then re-fitted on the
# labels, which turns them into a float column vector of shape (n_samples, 1).
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
y = scaler.fit_transform(y.reshape(-1, 1))

Xs = pd.DataFrame(X, columns=['price', 'num_past_orders'])
ys = pd.DataFrame(y, columns=['label'])

# Stretch the unit-interval features onto integer business ranges:
# price -> 50..2050, num_past_orders -> 0..50 (fractional part truncated).
Xs['price'] = (Xs['price'] * 2000).astype('int64') + 50
Xs['num_past_orders'] = (Xs['num_past_orders'] * 50).astype('int64')

In [32]:
# Summary statistics for the two engineered feature columns.
Xs.describe()

Unnamed: 0,price,num_past_orders
count,10000.0,10000.0
mean,1240.6492,23.3478
std,326.33401,6.20178
min,50.0,0.0
25%,984.0,20.0
50%,1398.5,23.0
75%,1502.0,26.0
max,2050.0,50.0


In [33]:
# Unshuffled splits: the last 10% of the rows become the test set, then the last 10%
# of the remaining rows become the validation set (8100 / 900 / 1000 rows).
X_train_raw, X_test, y_train_raw, y_test = train_test_split(Xs, ys, test_size=0.10, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train_raw, y_train_raw, test_size=0.10, shuffle=False)

In [34]:
# Class balance in the training split.
y_train['label'].value_counts()

0.0    7281
1.0     819
Name: label, dtype: int64

In [41]:
# Class balance in the test split.
y_test['label'].value_counts()

0.0    890
1.0    110
Name: label, dtype: int64

## Create (and calibrate) model

Calibration is done to ensure the output of the model is actually a probability. Whether it is required depends on the model you use. If you sample a subset of the data, or weight certain samples over others (as `class_weight='balanced'` does below), calibration becomes more important.

We will look into this in more detail in another video.

In [35]:
# Class-weighted logistic regression wrapped in 3-fold isotonic calibration, so that
# predict_proba can be treated as a probability in the cost calculations further down.
clf = LogisticRegression(class_weight='balanced')
# The base model is passed positionally on purpose: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed in
# 1.4, whereas the first positional argument works on both sides of the rename.
calibrated_clf = CalibratedClassifierCV(clf, cv=3, method='isotonic')
calibrated_clf.fit(X_train, y_train.values.ravel())
# Column 1 of predict_proba is the probability of the positive class (label == 1).
y_pred = calibrated_clf.predict_proba(X_test)[:, 1]

In [36]:
# ROC AUC of the calibrated probabilities on the held-out test split.
roc_auc_score(y_test, y_pred)

0.9462257405515833

In [37]:
# Pair every test-set probability with its true label, aligned by position
# (the original row labels of y_test are discarded).
y_pred_df = pd.DataFrame({'prediction': y_pred})
pred_df = pd.concat(
    [y_pred_df, y_test.reset_index(drop=True)],
    axis=1,
)[['prediction', 'label']]

In [44]:
# Distribution of the predicted probabilities on the test split.
y_pred_df.describe()

Unnamed: 0,prediction
count,1000.0
mean,0.102361
std,0.226684
min,0.0
25%,0.002907
50%,0.010897
75%,0.027101
max,0.984848


## Cost Calculations

In [None]:
# Test-set features and their true labels side by side, matched on the row index.
df = pd.merge(X_test, y_test, left_index=True, right_index=True)

### Case 1: Insure nothing

We pay full price for the laptops we lose

In [52]:
# Total price of the test-set laptops whose label is 1 (the ones we lose).
df.loc[df['label'] == 1, 'price'].sum()

141801

### Case 2: Insure Everything

We pay \\$30 for every laptop regardless of whether we lose them or not

In [57]:
# One flat $30 premium per test-set order.
len(df) * 30

30000

### Case 3: Insure Based on Model

In [87]:
# Discard the original row labels so features/labels line up positionally
# with the model predictions, then attach the predicted probability.
predictions = (
    df.reset_index(drop=True)
      .merge(pred_df[['prediction']], left_index=True, right_index=True)
)

In [88]:
# Peek at two random rows (no random_state is passed, so the rows shown vary between runs).
predictions.sample(2)

Unnamed: 0,price,num_past_orders,label,prediction
963,1179,18,0.0,0.001733
297,1477,24,0.0,0.021482


In [89]:
# Expected loss per order: laptop price times the predicted probability of label 1.
predictions['E_x'] = predictions['price'].mul(predictions['prediction'])

In [90]:
# Insure an order only when its expected loss is strictly above the $30 premium.
predictions['insure'] = predictions['E_x'].gt(30)

In [97]:
# Peek at two random rows again, now including the E_x and insure columns.
predictions.sample(2)

Unnamed: 0,price,num_past_orders,label,prediction,E_x,insure
536,1452,20,0.0,0.006692,9.716724,False
53,1431,17,0.0,0.001733,2.480627,False


In [91]:
# Number of test orders the rule insures (True) versus leaves uninsured (False).
predictions['insure'].value_counts()

False    691
True     309
Name: insure, dtype: int64

In [80]:
def cal_loss(x, premium=30):
    """Return the cost incurred on a single order under its insurance decision.

    Parameters
    ----------
    x : mapping or pandas.Series
        One order. Must provide ``'insure'`` (truthy when the order is insured),
        ``'label'`` (1 when the laptop is lost) and ``'price'``.
    premium : number, default 30
        Flat fee paid for every insured order. The default keeps the original
        behaviour, so ``DataFrame.apply(cal_loss, axis=1)`` still works unchanged.

    Returns
    -------
    number
        ``premium`` when the order is insured, ``x['price']`` when it is
        uninsured and lost, and 0 otherwise.
    """
    if x['insure']:
        return premium
    # Past the guard above the order is known to be uninsured, so only the
    # label needs checking.
    if x['label'] == 1:
        return x['price']
    return 0

In [83]:
# Per-order cost under the model-driven policy, computed row by row with cal_loss.
predictions['loss'] = predictions.apply(cal_loss, axis=1)

In [85]:
# Total cost of the model-driven insurance policy over the test split.
predictions['loss'].sum()

18963