In [None]:
# packages
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import (load_data, confusion_table)
from ISLP.models import (ModelSpec as MS, summarize, contrast)
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import RocCurveDisplay, roc_auc_score

roc_curve_est = RocCurveDisplay.from_estimator 
roc_curve_pred = RocCurveDisplay.from_predictions 


# set seed
seed = 5331

### We will use the OJ dataset, which contains information about orange juice purchases across different stores.

In [None]:
OJ = load_data('OJ')
OJ

### What are the variables and their types?

In [None]:
OJ.dtypes

### We will use every store other than StoreID 7 as training data, and store 7 as the test set.

In [None]:
OJ["StoreID"].value_counts()

In [None]:
# Training rows: purchases made at any store other than StoreID 7
Train = OJ.loc[OJ['StoreID'] != 7]
Train

In [None]:
# Test rows: purchases made at StoreID 7
Test = OJ.loc[OJ['StoreID'] == 7]

### We will predict whether each customer purchased Citrus Hill or Minute Maid.

In [None]:
# Boolean response: True where the purchase was Citrus Hill ('CH')
y_train = Train['Purchase'].eq('CH')
y_test = Test['Purchase'].eq('CH')

In [None]:
# Predictor columns shared by the train and test design matrices. Keeping the
# list in one place guarantees both frames are built from identical columns.
feature_cols = ['WeekofPurchase', 'PriceCH', 'PriceMM', 'DiscCH', 'DiscMM',
                'SpecialCH', 'SpecialMM', 'LoyalCH', 'SalePriceCH', 'SalePriceMM',
                'PriceDiff', 'PctDiscCH', 'PctDiscMM', 'ListPriceDiff']

# .copy() makes each frame independent of Train/Test, so adding columns to it
# afterwards cannot trigger pandas' SettingWithCopyWarning.
X_train = Train[feature_cols].copy()
X_test = Test[feature_cols].copy()

In [None]:
X_train.corr()

In [None]:
# Initialize the plots before drawing them
nrows = 5
ncols = 3
figsize = (5*nrows, 10*ncols)

fig, axes = subplots(nrows=nrows,
                     ncols=ncols,
                     figsize=figsize)

# Assign a grid location to each index
def range_to_grid(i, nrows, ncols):
    """Map a flat, row-major index to its (row, column) position in a grid.

    Parameters
    ----------
    i : int
        Flat index of the cell. Negative values count back from the last
        cell, as with list indexing.
    nrows, ncols : int
        Grid dimensions.

    Returns
    -------
    tuple of int
        (row, column) of cell ``i``.

    Raises
    ------
    IndexError
        If ``i`` does not address one of the ``nrows * ncols`` cells.
    """
    n_cells = nrows * ncols
    if not -n_cells <= i < n_cells:
        raise IndexError("grid index out of range")
    # Normalise a negative index, then split into row and column in one step
    # instead of materialising the full list of coordinates on every call.
    return divmod(i % n_cells, ncols)

# Plot each predictor as a pair of boxplots, one box per response class
for j, col in enumerate(X_train.columns):
    r, c = range_to_grid(j, nrows, ncols)
    ax = axes[r, c]

    data_0 = X_train.loc[y_train == 0, col]
    data_1 = X_train.loc[y_train == 1, col]

    ax.boxplot([data_0, data_1], showfliers=False)
    # Name the boxes through the axis rather than boxplot's `labels` keyword,
    # which Matplotlib 3.9 deprecated in favour of `tick_labels`; setting the
    # tick labels directly works on either side of that rename.
    ax.set_xticklabels(["y = 0", "y = 1"])

    ax.set_xlabel(col)

# Hide any grid cells left over after the last predictor so the figure does
# not end with empty axes frames.
for ax in axes.flat[len(X_train.columns):]:
    ax.set_visible(False)


In [None]:
# Add a constant column of ones named 'intercept' to both design matrices.
# assign() returns a new frame instead of writing into X_train/X_test in
# place, which avoids pandas' SettingWithCopyWarning on frames that were
# sliced out of another frame, and keeps the cell safe to re-run.
X_train = X_train.assign(intercept=np.ones(X_train.shape[0]))
X_test = X_test.assign(intercept=np.ones(X_test.shape[0]))

## Logistic Regression

### Since many of the variables are correlated, a model with all of them will likely be overfit.

In [None]:
# Binomial-family GLM (logistic regression) of the response on every column
# currently in X_train
initial_glm = sm.GLM(endog=y_train, exog=X_train, family=sm.families.Binomial())

# Fit the model, then display its coefficient table
initial_results = initial_glm.fit()
summarize(initial_results)

### Create a logistic model using only the intercept, Citrus Hill loyalty, and the difference in price between the brands.

In [None]:
var_list = #fillin

# build model
glm = #fillin

# fit model
results = #fillin

# analyze model
summarize(results)

### Get the predicted probabilities

In [None]:
def predict(X, model):
    """Return the model's predictions for X as a Series named 'y_hat'.

    The values come from ``model.get_prediction(X).predicted``, which is an
    array; they are re-labelled with X's index so they line up with the rows
    of X.
    """
    fitted_values = model.get_prediction(X).predicted
    return pd.Series(fitted_values, index=X.index, name='y_hat')

In [None]:
probs_train=predict(X_train[var_list],results)
probs_test=predict(X_test[var_list],results)

### We'll use 0.5 as the threshold for True vs. False

In [None]:
# Label an observation True unless its predicted probability is below 0.5.
# The comparison is negated (rather than written as >= 0.5) so that anything
# not strictly below the cutoff stays True.
predictions_train = ~(probs_train < 0.5).to_numpy()

predictions_test = ~(probs_test < 0.5).to_numpy()

### Training results

In [None]:
train_table = confusion_table(predictions_train, y_train)
train_table

### Test results

In [None]:
test_table = confusion_table(predictions_test, y_test)
test_table

### Calculate the false positive rate and false negative rate for this model.

You can hard-code the exact numbers to at least **three** decimal places, or you can code a formula that correctly calculates them.

In [None]:
false_positive_rate = #fillin
print("fpr =",false_positive_rate)
false_negative_rate = #fillin
print("fnr =",false_negative_rate)

## Naive Bayes and k-Nearest Neighbors

### Create arrays of train and test sets

In [None]:
# Drop the intercept column from the variable list. Rebinding var_list to a
# filtered copy (instead of calling list.remove) keeps this cell safe to
# re-run: a second run is a no-op rather than a ValueError.
var_list = [name for name in var_list if name != 'intercept']

# Plain NumPy versions of the selected train and test columns
X_train_array = np.asarray(X_train[var_list])
X_test_array = np.asarray(X_test[var_list])

### Build Naive Bayes classifier

In [None]:
# Fit Gaussian Naive Bayes on the training arrays (fit returns the estimator)
naive = GaussianNB().fit(X_train_array, y_train)

# Test-set class predictions, plus the predicted probability of the second
# class listed in naive.classes_
naive_test = naive.predict(X_test_array)
naive_probs = naive.predict_proba(X_test_array)[:, 1]

confusion_table(naive_test, y_test)

### Try different k-Nearest Neighbors classifiers

In [None]:
# Let's try 5 nearest neighbors

# fit() returns the estimator, so construction and fitting can be chained
knn5 = KNeighborsClassifier(n_neighbors=5).fit(X_train_array, y_train)
knn5_test = knn5.predict(X_test_array)

confusion_table(knn5_test, y_test)

In [None]:
# How about 50 nearest neighbors?

# fit() returns the estimator, so construction and fitting can be chained
knn50 = KNeighborsClassifier(n_neighbors=50).fit(X_train_array, y_train)
knn50_test = knn50.predict(X_test_array)

confusion_table(knn50_test, y_test)

In [None]:
# How about 500 nearest neighbors?

# fit() returns the estimator, so construction and fitting can be chained
knn500 = KNeighborsClassifier(n_neighbors=500).fit(X_train_array, y_train)
knn500_test = knn500.predict(X_test_array)

confusion_table(knn500_test, y_test)

### Write a loop to test all kNN models from k=1 to 500. 

### Return the lowest value of k which maximizes the number of correct predictions on the test set. You can obtain these numbers from the main diagonal of the confusion matrix.

In [None]:
best_k = 0
num_correct_pred = 0

for k in range(1,500):
    #fillin

print(best_k)
print(num_correct_pred)


### Now that you've found an optimal choice of k, let's construct that model and store it.

In [None]:
knn_opt = KNeighborsClassifier(n_neighbors=#fillin
                               )
knn_opt.fit(X_train_array, y_train)
knn_opt_test = knn_opt.predict(X_test_array)

confusion_table(knn_opt_test, y_test)

### ROC Curves

In [None]:
fig, ax = subplots(figsize=(8,8))

# Models whose test-set ROC curve is computed directly from the fitted
# estimator: (estimator, legend name, line colour), drawn in this order
fitted_curves = [
    (knn5, 'kNN5 (Test)', 'r'),
    (knn500, 'kNN500 (Test)', 'y'),
    (knn_opt, 'kNN Optimal (Test)', 'g'),
    (naive, 'Naive Bayes (Test)', 'm'),
]
for estimator, curve_name, curve_color in fitted_curves:
    roc_curve_est(estimator,
                  X_test_array,
                  y_test,
                  name=curve_name,
                  color=curve_color,
                  ax=ax)

# The logistic curve is drawn from the predicted test probabilities rather
# than from an estimator object
roc_curve_pred(y_test,
               probs_test,
               name='Logistic:Prob (Test)',
               color='b',
               ax=ax);

# Discussion Questions

### Of the models that were built in this notebook, which would you choose to implement? Why?

Type your answer here.

### Suppose we "build" a model on this data that **always predicts true**, i.e., that every customer will purchase Citrus Hill rather than Minute Maid orange juice. What would be the total misclassification rate on this test set?

You can hard-code the exact number to at least **three** decimal places, or you can code a formula that correctly calculates it.

In [None]:
misclassification_rate = #fillin
print(misclassification_rate)