# Creating Good Features

In [None]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, confusion_matrix, roc_auc_score, f1_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer, label_binarize

import matplotlib.pyplot as plt

from google.cloud import bigquery as bq

warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Authenticate this Colab session via google.colab's auth helper; the BigQuery
# query later in this notebook presumably relies on these credentials.
from google.colab import auth
auth.authenticate_user()
# Reached only if authenticate_user() did not raise.
print('Authenticated')

Authenticated


# Import the data

In [None]:
# Query text: every trip row with its features and the `Loaded` target,
# returned in random order (ORDER BY RAND()).
SQL = """
SELECT DriverId
     , TripNumber
     , Distance
     , TripType
     , DayOfWeek
     , HourOfDay
     , AvgSpeed
     , Loaded
  FROM `geotab-bootcamp.DemoData.PredictingLoaded2`
 ORDER BY RAND()
"""

# Run the query against the bootcamp project and pull the result into pandas.
bq_client = bq.Client(project='geotab-bootcamp')
query_job = bq_client.query(SQL)
df = query_job.to_dataframe()

# First look at the DataFrame

In [None]:
# (rows, columns) of the query result.
df.shape

In [None]:
# Column dtypes and non-null counts; columns whose count is below the row
# count contain missing values.
df.info()

We can see that there are 15,895 rows and 8 columns. We can already notice a few things:
1. We have both numerical and non-numerical columns
2. We have both categorical and continuous data
3. It looks like some of the columns have missing values

Let's first take a look at the head of the DataFrame.

In [None]:
# Peek at the first three rows.
df.head(3)

We can see that TripType and DayOfWeek are categorical and stored as strings. Our algorithms can't take strings as input, so we'll have to encode those somehow. We also see that HourOfDay is numeric, and we should double-check that it ranges from 0 to 23.

In [None]:
# Sanity check: HourOfDay should span 0..23.
hour_min = df['HourOfDay'].min()
hour_max = df['HourOfDay'].max()
print(f'Min: {hour_min}, Max: {hour_max}', end='')

The column *Loaded* is the one we will be trying to predict, based on the other columns. Are all of these columns useful? 

Do you think DriverId or TripNumber would be related to whether the vehicle is Loaded or Unloaded? Why or why not? How would you check?

Before any modelling happens, we need to do more quality checks on our data. We already know there are missing values and we'll have to address that somehow. Let's first see how many missing values we have.

In [None]:
# Boolean mask: True for rows with a missing value in at least one column.
has_missing = df.isna().any(axis=1)
# Shape of that subset; its first entry is the number of incomplete rows.
df[has_missing].shape

So we have some rows that have a missing value in at least one of the columns. What about outliers? Let's describe our numeric columns and see what we find.

In [None]:
# Summary statistics of the numeric columns; useful for spotting implausible values.
df.describe()

We can see that most of our columns look good, but there's something odd about AvgSpeed. It's pretty unlikely that a driving trip would have an average speed of over 200 km/hr. Let's look at a histogram of AvgSpeed and take a closer look.

In [None]:
# Histogram of average trip speed, to make the outliers visible.
fig, ax = plt.subplots(figsize=(12, 4))
ax.hist(df['AvgSpeed'], bins=20, color='#00aeef', edgecolor='white', zorder=4)
# Fixed axis limits so the bulk of the distribution and the outlier cluster share one view.
ax.set_xlim([0, 250])
ax.set_ylim([0, 3500])
ax.grid(linestyle='-.')

In [None]:
# The five trips with the highest AvgSpeed (descending sort, default head of 5).
df.sort_values(by='AvgSpeed', ascending=False).head()

We can see most of the values fall into a fairly reasonable range, but we have some outliers clustered around 200 that we'll need to fix or remove.

# Imputing missing values and fixing outliers

There are many ways to impute missing values, ranging from taking the average to building a model to predict the missing features. Let's first take a look at what percentage of rows have missing values in each of our features, broken down by trip type.

In [None]:
def _pct_missing(group):
    """Return, per column of `group`, the percentage of rows that are null."""
    return 100 - group.notnull().sum() / len(group) * 100


# Columns whose missing-value rate we want to compare across trip types.
checked_columns = ['Distance', 'DayOfWeek', 'HourOfDay', 'AvgSpeed']
df.groupby('TripType')[checked_columns].apply(_pct_missing)

For our purposes, the number of rows with missing values is quite low (452 out of 15,895), and doesn't seem strongly biased to a particular trip type, so it should be safe to simply drop those rows which contain null values. What about the outliers? Let's define a speed outlier as an average trip speed exceeding 100km/hr.

In [None]:
def gthan(row, threshold=100):
    """Flag values that exceed a threshold.

    Args:
        row: A scalar, or any object supporting element-wise ``>``
            (e.g. a pandas Series/DataFrame or a NumPy array).
        threshold: Cut-off to compare against. Defaults to 100, the
            km/hr limit this notebook uses to define a speed outlier.

    Returns:
        The result of ``row > threshold``: a bool for scalars, or a
        boolean mask of the same shape for array-like input.
    """
    return row > threshold

In [None]:
def _pct_speed_outliers(group):
    """Return the percentage of rows in `group` that `gthan` flags as outliers."""
    return gthan(group).sum() / len(group) * 100


# Share of speed outliers within each trip type.
df.groupby(by='TripType')[['AvgSpeed']].apply(_pct_speed_outliers)

In [None]:
# Shape of the subset of trips whose AvgSpeed exceeds the 100 km/hr outlier cut-off.
df[df['AvgSpeed'] > 100].shape

There are only a few hundred rows with excessive speed outliers; we could probably safely drop them, but we should be mindful that the outlier speeds are imbalanced in our data by TripType. A larger percentage of them appear in highway driving. Instead of dropping them, let's replace them with the average for each type of trip; at the same time, we'll drop our null rows.

In [None]:
# Mean AvgSpeed per TripType, computed from non-outlier trips only.
# An outlier is a speed *exceeding* 100, so trips at exactly 100 are valid
# and must be included here (hence `<=`, matching the `> 100` replacement rule).
# Rows with a missing AvgSpeed fail the comparison and are excluded as well.
trip_means = (
    df[df['AvgSpeed'] <= 100]
    .groupby(by='TripType', as_index=True)[['AvgSpeed']]
    .mean()
    .rename(columns={'AvgSpeed': 'MeanSpeed'})
)

In [None]:
# Drop rows with any missing value, then attach each row's per-TripType mean
# speed as a helper column `MeanSpeed` (left join on the TripType index).
df_imputed = df.dropna().join(trip_means, on='TripType', how='left')

# Replace outlier speeds (> 100) with the trip-type mean. `Series.mask` does this
# in one vectorized pass instead of a Python-level call per row.
is_speed_outlier = df_imputed['AvgSpeed'] > 100
df_imputed['AvgSpeed'] = df_imputed['AvgSpeed'].mask(is_speed_outlier, df_imputed['MeanSpeed'])

# SECOND PART

# Building a model

To feed our features into a model, they must be made numeric. There are two main ways to encode categorical variables:
1. One Hot Encoding
2. Label Encoding

One Hot Encoding turns each category into a new column with a binary indicator for whether the row belongs to that category. Label encoding creates an incrementing integer value to substitute for each category. One is not better than the other; each has its uses and drawbacks and you need to consider the type of algorithm used and how it will treat each of these types of encoding.

Let's try both, on two types of models (logistic regression, and random forest) and see what happens.

In [None]:
# One-hot encoding: one binary indicator column per distinct DayOfWeek value.
# OneHotEncoder needs a 2-D (n, 1) input, hence the reshape.
# NOTE(review): `.todense()` returns an `np.matrix`, whose column slices stay 2-D;
# check downstream indexing (and the installed scikit-learn's support for
# np.matrix inputs) before switching to `.toarray()`.
dayofweek_ohe = OneHotEncoder().fit_transform(df_imputed['DayOfWeek'].values.reshape(-1, 1)).todense()

# Label encoding: LabelEncoder expects a 1-D array, so encode the flat values and
# only then add a trailing axis to get an (n, 1) column for concatenation.
dayofweek_le = LabelEncoder().fit_transform(df_imputed['DayOfWeek'].values)[:, np.newaxis]

# TODO: Apply one hot encoding and label encoding to the other appropriate feature columns 
#

# Continuous features are used as-is, reshaped to (n, 1) columns.
distance = df_imputed['Distance'].values.reshape(-1, 1)

# TODO: Apply reshaping to the other appropriate feature columns 
#

We chose some of the columns for scaling and some for one hot encoding. Do they all make sense to you? Do they match your choices? Feel free to change the concatenation code below to match your choices. Let's keep going and re-evaluate later.

In [None]:
# Stack the feature columns side by side: one matrix per encoding scheme.
# Distance and speed are deliberately last; later cells rely on that position.
X_ohe  = np.concatenate([dayofweek_ohe, triptype_ohe, hourofday_ohe, distance, speed], axis=1)
X_le  = np.concatenate([dayofweek_le, triptype_le, hourofday_le, distance, speed, driver_id, trip_num], axis=1)
# Binary target: 'Loaded' -> 0, 'Unloaded' -> 1. label_binarize returns an (n, 1)
# column; flatten it because the estimators expect a 1-D target.
y = label_binarize(df_imputed['Loaded'].values, classes=['Loaded', 'Unloaded']).ravel()

# train_test_split returns (X_train, X_test, y_train, y_test) in that order, with
# the larger share going to the training set by default.
X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe = train_test_split(X_ohe, y, shuffle=True)
X_train_le, X_test_le, y_train_le, y_test_le = train_test_split(X_le, y, shuffle=True)

## Label Encoding

In [None]:
# Fit both models on the label-encoded training features.
logreg = LogisticRegression(penalty='l2')
rf = RandomForestClassifier(n_estimators=100)

_ = logreg.fit(X_train_le, y_train_le)
_ = rf.fit(X_train_le, y_train_le)

# Mean accuracy on the held-out set, as a whole percentage. The `.0%` format
# rounds to nearest; int(100 * x) would truncate values such as 56.99999 to 56.
print(f'Logistic Regression Accuracy: {logreg.score(X_test_le, y_test_le):.0%}')
print(f'      Random Forest Accuracy: {rf.score(X_test_le, y_test_le):.0%}')

Logistic Regression Accuracy: 61%
      Random Forest Accuracy: 71%


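In the run shown above, the random forest is more accurate than the logistic regression (71% vs. 61%); your exact numbers will vary because the rows are returned in random order and shuffled before splitting. Accuracy isn't the whole picture though; let's look at the F1 score and the ROC curve.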

In [None]:
# f1_score's signature is (y_true, y_pred): ground truth first, predictions second.
lr_f1 = f1_score(y_test_le, logreg.predict(X_test_le))
rf_f1 = f1_score(y_test_le, rf.predict(X_test_le))
print('Logistic Regression F1 Score: ', round(lr_f1, 2), sep='')
print('      Random Forest F1 Score: ', round(rf_f1, 2), sep='')

Logistic Regression F1 Score: 0.67
      Random Forest F1 Score: 0.73


In [None]:
# Continuous scores to threshold for the ROC curve: the decision-function value
# for logistic regression, the predicted probability of class 1 for the forest.
y_score_lr = logreg.decision_function(X_test_le)
y_score_rf = rf.predict_proba(X_test_le)[:, 1]

# False-positive rate, true-positive rate and area under the curve per model,
# keyed by a short model tag.
fpr, tpr, roc_auc = {}, {}, {}
for model_key, y_score in (('lr', y_score_lr), ('rf', y_score_rf)):
    fpr[model_key], tpr[model_key], _ = roc_curve(y_test_le, y_score)
    roc_auc[model_key] = auc(fpr[model_key], tpr[model_key])

fig, ax = plt.subplots(figsize=(6, 6))
lw = 2
for model_key, model_name, colour in (('lr', 'Logistic Regression', '#00aeef'),
                                      ('rf', 'Random Forest', '#93c83d')):
    ax.plot(fpr[model_key], tpr[model_key], color=colour, lw=lw, zorder=4,
            label='ROC Curve %s (area = %0.2f)' % (model_name, roc_auc[model_key]))
# Diagonal reference line: the ROC curve of a random classifier.
ax.plot([0, 1], [0, 1], color='#66788c', lw=lw, linestyle='--', zorder=3)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate (recall)')
ax.legend(loc="lower right")
ax.grid(linestyle='-.')

We can see from the ROC curve that the logistic regression is performing poorly compared to the random forest model. Why might that be? This is a case which demonstrates why you can't treat machine learning as a black box. Because of the way the two algorithms work, label-encoded categorical variables are poorly suited to logistic regression. A random forest is an ensemble of many decision trees, and each tree splits on thresholds of individual features, so a sequence of splits can isolate any particular label value and attach its own decision to it. Logistic regression has only a single hyperplane decision boundary, which means that the order and magnitude of the label values will be interpreted by the model as meaningful; in our case they are not, so label encoding our categorical variables is a poor choice for logistic regression. Let's try one hot encoding instead.

# FEATURE IMPORTANCE
One advantage of decision-tree-based algorithms is that we can compute the "feature importance" of each input. It is derived from the decrease in node impurity (e.g. Gini impurity or entropy) produced by splits on that feature, weighted by the probability of reaching those nodes. As the name suggests, the higher the importance, the more the feature contributes to the model's decisions.
Try plotting feature importance: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

Note: If you chose different columns/orders than those given above in our concatenation step, make sure to change the feature names list below to match.

In [None]:
# Feature names, in the same column order as the label-encoded feature matrix.
features = ['DayOfWeek', 'TripType', 'HourOfDay', 'Distance', 'AvgSpeed', 'DriverId', 'TripNumber']

# One importance value per input column of the fitted random forest.
ft_importance = rf.feature_importances_

# Pair names with importances and order from most to least important.
fi = pd.DataFrame({'Feature': features, 'importance': ft_importance})
fi = fi.sort_values('importance', ascending=False)

importance_ax = fi.plot(kind='barh', x='Feature', y='importance', figsize=(15,5),
                        title='Feature Importance', color='#00aeef')
# Flip the axis so the most important feature is drawn at the top.
importance_ax.invert_yaxis()

Can you exclude features which are not significant for our model? Which ones?

## One Hot Encoding

In [None]:
# Fit both models on the one-hot-encoded training features.
logreg = LogisticRegression(penalty='l2')
rf = RandomForestClassifier(n_estimators=100)

_ = logreg.fit(X_train_ohe, y_train_ohe)
_ = rf.fit(X_train_ohe, y_train_ohe)

# Mean accuracy on the held-out set, as a whole percentage. The `.0%` format
# rounds to nearest; int(100 * x) would truncate values such as 56.99999 to 56.
print(f'Logistic Regression Accuracy: {logreg.score(X_test_ohe, y_test_ohe):.0%}')
print(f'      Random Forest Accuracy: {rf.score(X_test_ohe, y_test_ohe):.0%}')

The accuracies of the Logistic Regression and Random Forest models are again similar, with logistic regression slightly improved. Accuracy isn't the whole picture though; let's look at the F1 score and the ROC curve again.

In [None]:
# f1_score's signature is (y_true, y_pred): ground truth first, predictions second.
lr_f1 = f1_score(y_test_ohe, logreg.predict(X_test_ohe))
rf_f1 = f1_score(y_test_ohe, rf.predict(X_test_ohe))
print('Logistic Regression F1 Score: ', round(lr_f1, 2), sep='')
print('      Random Forest F1 Score: ', round(rf_f1, 2), sep='')

In [None]:
# Continuous scores to threshold for the ROC curve: the decision-function value
# for logistic regression, the predicted probability of class 1 for the forest.
y_score_lr = logreg.decision_function(X_test_ohe)
y_score_rf = rf.predict_proba(X_test_ohe)[:, 1]

# False-positive rate, true-positive rate and area under the curve per model,
# keyed by a short model tag.
fpr, tpr, roc_auc = {}, {}, {}
for model_key, y_score in (('lr', y_score_lr), ('rf', y_score_rf)):
    fpr[model_key], tpr[model_key], _ = roc_curve(y_test_ohe, y_score)
    roc_auc[model_key] = auc(fpr[model_key], tpr[model_key])

fig, ax = plt.subplots(figsize=(6, 6))
lw = 2
for model_key, model_name, colour in (('lr', 'Logistic Regression', '#00aeef'),
                                      ('rf', 'Random Forest', '#93c83d')):
    ax.plot(fpr[model_key], tpr[model_key], color=colour, lw=lw, zorder=4,
            label='ROC Curve %s (area = %0.2f)' % (model_name, roc_auc[model_key]))
# Diagonal reference line: the ROC curve of a random classifier.
ax.plot([0, 1], [0, 1], color='#66788c', lw=lw, linestyle='--', zorder=3)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate (recall)')
ax.legend(loc="lower right")
ax.grid(linestyle='-.')

We see that with one hot encoding, both the logistic regression model and the random forest model are now performing similarly. One hot encoding is better suited to encoding for logistic regression.

What about feature scaling? Because logistic regression has a single hyperplane decision boundary, features that have significant differences in magnitude can make it difficult for the algorithm to fit the plane. The differences here are small, so scaling likely will not have a big impact, but let's take a look. We'll use a min-max scaler and re-run our models.

In [None]:
min_max_scaler = MinMaxScaler()

# Distance and AvgSpeed are the last two columns of the one-hot feature matrix
# (see the concatenation order). A slice keeps the selection 2-D, which the
# scaler requires, for both plain ndarrays and np.matrix.
numeric_cols = slice(-2, None)

# Learn the min/max from the training data only, then apply that same mapping to
# the test data. Fitting a separate scaler on the test set would put the two sets
# on different scales and leak test-set statistics into the evaluation.
# A prior standardization step is unnecessary: min-max scaling gives the same
# result on standardized and raw values.
X_train_ohe[:, numeric_cols] = min_max_scaler.fit_transform(np.asarray(X_train_ohe[:, numeric_cols]))
X_test_ohe[:, numeric_cols] = min_max_scaler.transform(np.asarray(X_test_ohe[:, numeric_cols]))

In [None]:
# Refit both models on the scaled one-hot-encoded training features.
logreg = LogisticRegression(penalty='l2')
rf = RandomForestClassifier(n_estimators=100)

_ = logreg.fit(X_train_ohe, y_train_ohe)
_ = rf.fit(X_train_ohe, y_train_ohe)

# Mean accuracy on the held-out set, as a whole percentage. The `.0%` format
# rounds to nearest; int(100 * x) would truncate values such as 56.99999 to 56.
print(f'Logistic Regression Accuracy: {logreg.score(X_test_ohe, y_test_ohe):.0%}')
print(f'      Random Forest Accuracy: {rf.score(X_test_ohe, y_test_ohe):.0%}')

In [None]:
# Continuous scores to threshold for the ROC curve: the decision-function value
# for logistic regression, the predicted probability of class 1 for the forest.
y_score_lr = logreg.decision_function(X_test_ohe)
y_score_rf = rf.predict_proba(X_test_ohe)[:, 1]

# False-positive rate, true-positive rate and area under the curve per model,
# keyed by a short model tag.
fpr, tpr, roc_auc = {}, {}, {}
for model_key, y_score in (('lr', y_score_lr), ('rf', y_score_rf)):
    fpr[model_key], tpr[model_key], _ = roc_curve(y_test_ohe, y_score)
    roc_auc[model_key] = auc(fpr[model_key], tpr[model_key])

fig, ax = plt.subplots(figsize=(6, 6))
lw = 2
for model_key, model_name, colour in (('lr', 'Logistic Regression', '#00aeef'),
                                      ('rf', 'Random Forest', '#93c83d')):
    ax.plot(fpr[model_key], tpr[model_key], color=colour, lw=lw, zorder=4,
            label='ROC Curve %s (area = %0.2f)' % (model_name, roc_auc[model_key]))
# Diagonal reference line: the ROC curve of a random classifier.
ax.plot([0, 1], [0, 1], color='#66788c', lw=lw, linestyle='--', zorder=3)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate (recall)')
ax.legend(loc="lower right")
ax.grid(linestyle='-.')

The results are very similar to before. But what would happen if we had another feature with much larger units? Let's say the distance was for some reason in centimeters instead of kilometers, and we didn't do any scaling.

In [None]:
# One-hot encode the three categorical features (each needs an (n, 1) input).
# NOTE(review): `.todense()` returns an `np.matrix`, whose column slices stay 2-D;
# check downstream indexing before switching to `.toarray()`.
dayofweek_ohe = OneHotEncoder().fit_transform(df_imputed['DayOfWeek'].values.reshape(-1, 1)).todense()
triptype_ohe = OneHotEncoder().fit_transform(df_imputed['TripType'].values.reshape(-1, 1)).todense()
hourofday_ohe = OneHotEncoder().fit_transform(df_imputed['HourOfDay'].values.reshape(-1, 1)).todense()

# Blow the distance up by 100,000 (kilometres -> centimetres) to create a feature
# whose magnitude dwarfs all the others; speed stays in its original units.
distance = df_imputed['Distance'].values.reshape(-1, 1)*100000
speed = df_imputed['AvgSpeed'].values.reshape(-1, 1)

# Distance and speed are deliberately the last two columns.
X_ohe  = np.concatenate([dayofweek_ohe, triptype_ohe, hourofday_ohe, distance, speed], axis=1)
# Binary target: 'Loaded' -> 0, 'Unloaded' -> 1, flattened to the 1-D shape the
# estimators expect.
y = label_binarize(df_imputed['Loaded'].values, classes=['Loaded', 'Unloaded']).ravel()

# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe = train_test_split(X_ohe, y, shuffle=True)

logreg = LogisticRegression(penalty='l2')
rf = RandomForestClassifier(n_estimators=100)

_ = logreg.fit(X_train_ohe, y_train_ohe)
_ = rf.fit(X_train_ohe, y_train_ohe)

# Mean accuracy on the held-out set, as a whole percentage. The `.0%` format
# rounds to nearest; int(100 * x) would truncate values such as 56.99999 to 56.
print(f'Logistic Regression Accuracy: {logreg.score(X_test_ohe, y_test_ohe):.0%}')
print(f'      Random Forest Accuracy: {rf.score(X_test_ohe, y_test_ohe):.0%}')

Our logistic regression accuracy has dropped down to 55%, while the random forest accuracy is relatively unchanged. Let's take a look at the ROC curve.

In [None]:
# Continuous scores to threshold for the ROC curve: the decision-function value
# for logistic regression, the predicted probability of class 1 for the forest.
y_score_lr = logreg.decision_function(X_test_ohe)
y_score_rf = rf.predict_proba(X_test_ohe)[:, 1]

# False-positive rate, true-positive rate and area under the curve per model,
# keyed by a short model tag.
fpr, tpr, roc_auc = {}, {}, {}
for model_key, y_score in (('lr', y_score_lr), ('rf', y_score_rf)):
    fpr[model_key], tpr[model_key], _ = roc_curve(y_test_ohe, y_score)
    roc_auc[model_key] = auc(fpr[model_key], tpr[model_key])

fig, ax = plt.subplots(figsize=(6, 6))
lw = 2
for model_key, model_name, colour in (('lr', 'Logistic Regression', '#00aeef'),
                                      ('rf', 'Random Forest', '#93c83d')):
    ax.plot(fpr[model_key], tpr[model_key], color=colour, lw=lw, zorder=4,
            label='ROC Curve %s (area = %0.2f)' % (model_name, roc_auc[model_key]))
# Diagonal reference line: the ROC curve of a random classifier.
ax.plot([0, 1], [0, 1], color='#66788c', lw=lw, linestyle='--', zorder=3)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate (recall)')
ax.legend(loc="lower right")
ax.grid(linestyle='-.')

We can see from the ROC curve that the logistic regression model is now performing worse than random chance, due to the inappropriate scaling. Let's scale our inputs and try again.

In [None]:
min_max_scaler = MinMaxScaler()

# Distance and AvgSpeed are the last two columns of the one-hot feature matrix
# (see the concatenation order). A slice keeps the selection 2-D, which the
# scaler requires, for both plain ndarrays and np.matrix.
numeric_cols = slice(-2, None)

# Learn the min/max from the training data only, then apply that same mapping to
# the test data. Fitting a separate scaler on the test set would put the two sets
# on different scales and leak test-set statistics into the evaluation.
# A prior standardization step is unnecessary: min-max scaling gives the same
# result on standardized and raw values.
X_train_ohe[:, numeric_cols] = min_max_scaler.fit_transform(np.asarray(X_train_ohe[:, numeric_cols]))
X_test_ohe[:, numeric_cols] = min_max_scaler.transform(np.asarray(X_test_ohe[:, numeric_cols]))

In [None]:
# Refit both models now that the numeric columns have been rescaled.
logreg = LogisticRegression(penalty='l2')
rf = RandomForestClassifier(n_estimators=100)

_ = logreg.fit(X_train_ohe, y_train_ohe)
_ = rf.fit(X_train_ohe, y_train_ohe)

# Mean accuracy on the held-out set, as a whole percentage. The `.0%` format
# rounds to nearest; int(100 * x) would truncate values such as 56.99999 to 56.
print(f'Logistic Regression Accuracy: {logreg.score(X_test_ohe, y_test_ohe):.0%}')
print(f'      Random Forest Accuracy: {rf.score(X_test_ohe, y_test_ohe):.0%}')

In [None]:
# Continuous scores to threshold for the ROC curve: the decision-function value
# for logistic regression, the predicted probability of class 1 for the forest.
y_score_lr = logreg.decision_function(X_test_ohe)
y_score_rf = rf.predict_proba(X_test_ohe)[:, 1]

# False-positive rate, true-positive rate and area under the curve per model,
# keyed by a short model tag.
fpr, tpr, roc_auc = {}, {}, {}
for model_key, y_score in (('lr', y_score_lr), ('rf', y_score_rf)):
    fpr[model_key], tpr[model_key], _ = roc_curve(y_test_ohe, y_score)
    roc_auc[model_key] = auc(fpr[model_key], tpr[model_key])

fig, ax = plt.subplots(figsize=(6, 6))
lw = 2
for model_key, model_name, colour in (('lr', 'Logistic Regression', '#00aeef'),
                                      ('rf', 'Random Forest', '#93c83d')):
    ax.plot(fpr[model_key], tpr[model_key], color=colour, lw=lw, zorder=4,
            label='ROC Curve %s (area = %0.2f)' % (model_name, roc_auc[model_key]))
# Diagonal reference line: the ROC curve of a random classifier.
ax.plot([0, 1], [0, 1], color='#66788c', lw=lw, linestyle='--', zorder=3)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate (recall)')
ax.legend(loc="lower right")
ax.grid(linestyle='-.')

We're back to normal again! You can see the importance of understanding the underlying algorithms and how they treat your features.