## BloomTech Data Science

---


# Define ML Problems
- Data Leakage
- ROC/AUC Curve



In [None]:
%%capture
import sys

# Choose where the lesson data is read from. DATA_PATH is the prefix that is
# later joined with 'burritos/burritos.csv' when the dataset is loaded.

# If you're on Colab:
# (detected by checking whether the `google.colab` module is already loaded)
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/bloominstituteoftechnology/DS-Unit-2-Applied-Modeling/master/data/'
    !pip install category_encoders==2.*

# If you're working locally:
# (relative path, so the notebook must be run from a folder that sits next to `data/`)
else:
    DATA_PATH = '../data/'

In [None]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_curve, ConfusionMatrixDisplay, RocCurveDisplay, roc_auc_score
import matplotlib.pyplot as plt

In [None]:
# Short aliases for the Display classes' `from_estimator` constructors, so the
# cells below can call plot_confusion_matrix(model, X, y) / plot_roc_curve(model, X, y).
# NOTE(review): presumably kept for compatibility with the older scikit-learn
# helper functions of the same names — confirm against the installed version.
plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
plot_roc_curve = RocCurveDisplay.from_estimator

In [None]:
# Let pandas show up to 500 columns when a DataFrame is displayed, so wide
# frames are not elided.
pd.set_option('display.max_columns', 500)

# Define ML problems

# I. Wrangle Data

In [None]:
def wrangle(filepath):
  """Read the CSV at `filepath` with pandas and return the DataFrame.

  Only the `pd.read_csv` call is active. Every other step is commented out
  as a lesson scaffold. As written, uncommenting the steps would: re-read
  the file with `'Date'` as a parsed index, drop rows without an
  `'overall'` rating, derive a binary `'great'` target from `'overall'`,
  drop `'overall'` to avoid leakage, recode low-cardinality text columns to
  0/1, and drop high-cardinality and mostly-empty columns.

  Parameters
  ----------
  filepath : path or URL passed straight to `pd.read_csv`.

  Returns
  -------
  The DataFrame produced by the active steps.
  """

  df = pd.read_csv(filepath)

  # # Import w/ DateTimeIndex
  # # (replaces the plain read above: 'Date' is parsed and used as the index)
  # df = pd.read_csv(filepath, parse_dates=['Date'], index_col = 'Date')

  # # drop rows with no overall rating
  # df.dropna(subset=['overall'], inplace = True)

  # # Create `'great'` column as target
  # # (1 when overall >= 4, else 0)
  # df['great'] = (df['overall'] >= 4).astype(int)

  # # Drop overall column to prevent data leakage
  # # ('great' is computed directly from 'overall', so keeping it would hand the model the answer)
  # df.drop(columns='overall', inplace = True)

  # # Clean binary encoded columns
  # categorical_cols = df.select_dtypes('object').columns
  # # use categorical columns which are basically binary encoded
  # binary_cols = [col for col in categorical_cols if df[col].nunique() < 4]
  # for col in binary_cols:
  #   # any string value becomes 1; everything else (e.g. NaN) becomes 0
  #   df[col] = df[col].apply(lambda x: 1 if isinstance(x, str) else 0)

  # # Drop high-cardinality categorical variables
  # threshold = 20

  # # (uses `categorical_cols` defined above; the scaffold previously referred
  # #  to an undefined `categorical_columns`)
  # high_card_cols =  [col for col in categorical_cols
  #                    if df[col].nunique() > threshold ]
  # df.drop(high_card_cols, axis=1, inplace=True)

  # # Dropping columns with high number of NaN values
  # # (thresh=300 keeps only columns that have at least 300 non-null values)
  # df.dropna(axis=1, thresh=300, inplace = True)

  return df

In [None]:
# Load the burritos CSV (DATA_PATH prefix + relative file name) through wrangle().
df = wrangle(DATA_PATH + 'burritos/burritos.csv')

# II. Split Data

In [None]:
# TODO: replace `...` with the name of the target column.
# (The commented-out steps in wrangle() create a 'great' column for this purpose.)
target = ...
# y is the target vector; X is every remaining column (the feature matrix).
y = df[target]
X = df.drop(target, axis=1)

In [None]:
# Bar chart of the relative frequency of each target value — a quick look at
# how balanced the classes are.
y.value_counts(normalize=True).plot(kind='bar')

In [None]:
# TODO: replace each `...` with a selector that `.loc` accepts on both X and y
# (X and y share df's index, so the same mask applies to both).
# NOTE(review): presumably these are date-range conditions on the index once the
# DateTimeIndex import in wrangle() is enabled — confirm. The three masks should
# not overlap, otherwise rows leak between the splits.

# Training
train_mask = ...
X_train, y_train = X.loc[train_mask], y.loc[train_mask]

# Validation
validation_mask = ...
X_val, y_val = X.loc[validation_mask], y.loc[validation_mask]

# Testing
test_mask = ...
X_test, y_test = X.loc[test_mask], y.loc[test_mask]

# III. Establish Baseline

In [None]:
# Share of the most frequent class in the training labels, i.e. the accuracy of
# always predicting the majority class.
print('Baseline accuracy:', y_train.value_counts(normalize=True).max())

# IV. Build Model

In [None]:
# Model 1: Logistic Regression
# Pipeline steps run in order: fill missing values, standardize the features,
# then fit the classifier. All three use their default settings.
# NOTE(review): assumes every column of X_train is numeric at this point — confirm.

model_lr = make_pipeline(
    SimpleImputer(),
    StandardScaler(),
    LogisticRegression()
)

# The trailing semicolon suppresses the notebook's echo of the fitted pipeline.
model_lr.fit(X_train, y_train);

In [None]:
# Model 2: Random Forest
# Pipeline steps run in order: fill missing values, then fit the forest
# (no scaling step, unlike the logistic regression pipeline).
# n_jobs=-1 trains the trees on all available cores.
# random_state pins the forest's random sampling so the scores, the ROC table
# and the hand-picked probability threshold used further down are reproducible
# from run to run.
model_rf = make_pipeline(
    SimpleImputer(),
    RandomForestClassifier(n_jobs=-1, random_state=42)
)

# The trailing semicolon suppresses the notebook's echo of the fitted pipeline.
model_rf.fit(X_train, y_train);

# V. Check Metrics

**Accuracy**

accuracy = (tp + tn) / (tp + fp + tn + fn)

In [None]:
# Score the logistic regression pipeline on the data it was fit on and on the
# held-out validation split; compare both against the baseline accuracy.
print('Training Accuracy (LOGR):', model_lr.score(X_train, y_train))
print('Validation Accuracy (LOGR):', model_lr.score(X_val, y_val))

In [None]:
# Same comparison for the random forest pipeline; a large gap between the two
# numbers points to overfitting.
print('Training Accuracy (RF):', model_rf.score(X_train, y_train))
print('Validation Accuracy (RF):', model_rf.score(X_val, y_val))

**Precision, Recall, F1**

precision = tp / (tp + fp)


recall = tp / (tp + fn)


f1 = 2 * (precision * recall) / (precision + recall)

In [None]:
# TODO: fill in the placeholders for the logistic regression model:
#   classification_report(<true labels>, <predicted labels>)
#   plot_confusion_matrix(<fitted model>, <features>, <true labels>)
# (plot_confusion_matrix is the alias for ConfusionMatrixDisplay.from_estimator.)
print('Logistic Regression')
print(classification_report(..., ...))
plot_confusion_matrix(..., ..., ...)

In [None]:
# TODO: fill in the placeholders for the random forest model:
#   classification_report(<true labels>, <predicted labels>)
#   plot_confusion_matrix(<fitted model>, <features>, <true labels>)
# (plot_confusion_matrix is the alias for ConfusionMatrixDisplay.from_estimator.)
print('Random Forest')
print(classification_report(..., ...))
plot_confusion_matrix(..., ..., ...)

**ROC curve**


In [None]:
# Get predicted probabilities from model
# `[:, -1]` keeps the last probability column.
# NOTE(review): for a binary target this should be the positive class — confirm
# against model_rf.classes_.
y_pred_prob = model_rf.predict_proba(X_val)[:, -1]

# Input true labels and probability predictions
# TODO: replace `...` with a call to roc_curve (imported above), passing the
# validation labels and y_pred_prob; it yields the three arrays unpacked here.
fpr, tpr, thresholds = ...

# Put data into dictionary
data = {'false_pos_rate': fpr,
        'true_pos_rate': tpr,
        'thresholds':thresholds}

# One row per candidate threshold; used to pick the cutoff applied below.
pd.DataFrame(data)

## Demonstrating how changing the threshold (chosen from the ROC curve) changes the metrics

In [None]:
# Collapse the positive-class probabilities to hard 0/1 labels in one step:
# scores at or above the 0.61 cutoff become 1, everything below becomes 0.
# NOTE: the name `y_pred_prob` is kept because the following cells read it, but
# from here on it holds class labels rather than probabilities.
y_pred_prob = (y_pred_prob >= 0.61).astype(y_pred_prob.dtype)

In [None]:
# accuracy_score is imported here because the import cell at the top does not include it.
from sklearn.metrics import accuracy_score
# At this point y_pred_prob holds the 0/1 labels produced by the 0.61 cutoff in
# the previous cell, so this is the validation accuracy at that threshold.
print('Validation Accuracy (RF):', accuracy_score(y_val, y_pred_prob))

In [None]:
# Report the metrics for the thresholded labels held in y_pred_prob (binarized
# at 0.61 above), compared against the true validation labels.
# model_rf.predict() is deliberately not used here: it applies the model's own
# default decision rule and would hide the effect of the custom threshold.
print(classification_report(y_val, y_pred_prob))
# from_predictions takes (true labels, predicted labels) directly, so the
# matrix shows the thresholded predictions versus the truth.
ConfusionMatrixDisplay.from_predictions(y_val, y_pred_prob);

## Regression Example

What if we were predicting the cost of a burrito?


In [None]:
# target = 'Cost'
# y = df[target]
# X = df.drop(columns=target)

In [None]:
# df['Cost'].hist(bins=20) # to check for target skewness

In [None]:
# Randomized Train Test Split
# from sklearn.model_selection import train_test_split
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# from sklearn.metrics import mean_absolute_error
# baseline_pred = [y_train.mean()] * len(y_val)  # baseline comes from the training target only, so nothing leaks from validation
# mean_absolute_error(y_val, baseline_pred)

In [None]:

# from sklearn.linear_model import LinearRegression
# model_lr = make_pipeline(SimpleImputer(),
#                       StandardScaler(),
#                       LinearRegression())

# model_lr.fit(X_train, y_train)

In [None]:

# from sklearn.ensemble import RandomForestRegressor
# model_rf = make_pipeline(SimpleImputer(),
#                       RandomForestRegressor())

# model_rf.fit(X_train, y_train)

In [None]:
# print("Training MAE - Linear Regression", mean_absolute_error(y_train, model_lr.predict(X_train)))
# print("Validation MAE - Linear Regression", mean_absolute_error(y_val, model_lr.predict(X_val)))

In [None]:
# print("Training MAE - Random Forest", mean_absolute_error(y_train, model_rf.predict(X_train)))
# print("Validation MAE - Random Forest", mean_absolute_error(y_val, model_rf.predict(X_val)))

# Log Transformation of skewed Target (for regression)

In [None]:
# The chosen target for the above example (burrito cost) is not skewed. But if your regression target is skewed, then you should log transform it, using the following steps:

# import numpy as np

# log transform your target (training set)
# y_train_log = np.log1p(y_train)

# fit your chosen model to this log-transformed target
# model.fit(X_train, y_train_log)

# get your predictions (these are on the log scale)
# y_pred_log = model.predict(X_val)

# reverse the log transform to bring the predictions back to the original scale
# y_pred = np.expm1(y_pred_log)

# check your metrics
# print(mean_absolute_error(y_val, y_pred))


# Sampling of imbalanced data (using the .sample function)

This is just one way to over- or under-sample. Go over these links and read how to implement SMOTE (Synthetic Minority Oversampling Technique) too, where you create synthetic observations of the minority class!

* https://imbalanced-learn.org/dev/references/generated/imblearn.over_sampling.SMOTE.html
* https://www.section.io/engineering-education/imbalanced-data-in-ml/

In [None]:
# # Sampling of imbalanced data (using the .sample function)

# # If train_minority contains only the rows of your training dataset that belong to the minority class, you can oversample like this.
# # When you oversample your minority class, you always have to sample with replacement.

# num_minority_samples = 10 # number of additional minority class rows to create.
# train_minority_sample = train_minority.sample(num_minority_samples,replace = True)


# # If train_majority contains only the rows of your training dataset that belong to the majority class, you can undersample like this.
# # Undersampling does not need replacement.
# num_majority_samples = 10 # number of majority class rows to keep.
# train_majority_sample = train_majority.sample(num_majority_samples,replace=False)

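# # You can then concatenate the two dataframes to form a final train df (DataFrame.append was removed in pandas 2.0, so use pd.concat).
# # If the minority sample is meant to be *additional* rows, include the original train_minority in the list as well.
# train_final = pd.concat([train_minority_sample, train_majority_sample], ignore_index=True)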
