In [15]:
# packages
import pandas as pd
from mod02_build_bot_predictor import train_model, train_model_RandomCV, train_model_GridCV
from sklearn.metrics import accuracy_score
import math

### Define a function to extract predictions from the model

In [2]:
def predict_bot(df, model=None):
    """
    Label every row of ``df`` as bot (1) or human (0).

    Args:
        df: Feature table; handed unchanged to ``model.predict``.
        model: Object exposing ``predict``. When omitted, ``train_model()``
            is called with no arguments to obtain one.

    Returns:
        pd.Series of predictions that shares ``df``'s index.
    """
    classifier = train_model() if model is None else model
    return pd.Series(classifier.predict(df), index=df.index)

### Define a function to evaluate model error

In [3]:
def confusion_matrix_and_metrics(y_true, y_pred):
    """
    Compute the confusion matrix and common error rates for binary classification.

    Labels are compared with ``==`` against:
      0 = negative class
      1 = positive class

    Args:
        y_true: Iterable of actual labels.
        y_pred: Iterable of predicted labels, matched position-by-position
            with ``y_true``.

    Returns:
      dict with:
        tp, tn, fp, fn           -- counts of each outcome
        misclassification_rate   -- (fp + fn) / total
        false_positive_rate      -- fp / (fp + tn)
        false_negative_rate      -- fn / (fn + tp)
      A rate whose denominator is zero is reported as 0.0.

    Raises:
        ValueError: if the two inputs differ in length, or if any label is
            something other than 0 or 1.
    """
    # Materialize both inputs so their lengths can be compared; zip() alone
    # would silently stop at the shorter one and report misleading metrics.
    true_labels = list(y_true)
    pred_labels = list(y_pred)
    if len(true_labels) != len(pred_labels):
        raise ValueError(
            "y_true and y_pred must have the same length "
            f"(got {len(true_labels)} and {len(pred_labels)})"
        )

    tn = fp = fn = tp = 0

    for yt, yp in zip(true_labels, pred_labels):
        if yt == 0 and yp == 0:
            tn += 1
        elif yt == 0 and yp == 1:
            fp += 1
        elif yt == 1 and yp == 0:
            fn += 1
        elif yt == 1 and yp == 1:
            tp += 1
        else:
            raise ValueError("Labels must be 0 or 1")

    total = tn + fp + fn + tp
    actual_negatives = fp + tn
    actual_positives = fn + tp

    # Each rate falls back to 0.0 when its denominator is empty.
    misclassification_rate = (fp + fn) / total if total > 0 else 0.0
    false_positive_rate = fp / actual_negatives if actual_negatives > 0 else 0.0
    false_negative_rate = fn / actual_positives if actual_positives > 0 else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "misclassification_rate": misclassification_rate,
        "false_positive_rate": false_positive_rate,
        "false_negative_rate": false_negative_rate,
    }


### Load the data

In [4]:
# CSV locations for the two splits (relative paths).
TRAIN_PATH = "mod02_data/train.csv"
TEST_PATH = "mod02_data/test.csv"

# Read each split into its own DataFrame.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

### Format the data by independent vs. dependent variables

In [5]:
# Target is the "is_bot" column; features are every other column.
y_train = train["is_bot"]
X_train = train.drop(columns=["is_bot"])

y_test = test["is_bot"]
X_test = test.drop(columns=["is_bot"])

### Build the model on training data

In [6]:
# Fit on the training features/labels. The returned object is later handed to
# predict_bot, which calls its .predict method.
model = train_model(X_train, y_train)

### Get the model predictions on training and test data

In [7]:
# Score both splits with the same fitted model (train first, then test).
y_pred_train, y_pred_test = (
    predict_bot(features, model) for features in (X_train, X_test)
)

### Check results on the training set (data used to build the model)

In [8]:
# Confusion counts and error rates on the same data the model was fit to.
confusion_matrix_and_metrics(y_train, y_pred_train)

{'tp': 65,
 'tn': 2619,
 'fp': 18,
 'fn': 298,
 'misclassification_rate': 0.10533333333333333,
 'false_positive_rate': 0.006825938566552901,
 'false_negative_rate': 0.8209366391184573}

### Check results on the test set (new data not yet seen by the model)

In [None]:
# Confusion counts and error rates on the held-out test split.
confusion_matrix_and_metrics(y_test, y_pred_test)


{'tp': 21,
 'tn': 871,
 'fp': 3,
 'fn': 105,
 'misclassification_rate': 0.108,
 'false_positive_rate': 0.003432494279176201,
 'false_negative_rate': 0.8333333333333334}

In [None]:
#model.best_params_
#After grid search CV
'''
{'learning_rate': 0.01,
 'max_depth': 2,
 'min_samples_leaf': 1,
 'min_samples_split': 50,
 'n_estimators': 500,
 'subsample': 0.75}
 '''

#After doing a grid search I noticed the learning rate was at the minimum of the grid, so I tried lowering it further and increased the number of estimators to compensate.
# This got pretty solid results


#Also, to explain the RandomCV function: I tried that first but got awful results, so I tried grid search instead.
#Decided to leave it in anyway.

"\n{'learning_rate': 0.01,\n 'max_depth': 2,\n 'min_samples_leaf': 1,\n 'min_samples_split': 50,\n 'n_estimators': 500,\n 'subsample': 0.75}\n "

In [None]:
def confidence_interval(y_test, y_pred_test, z):
    '''Normal-approximation confidence interval for the true prediction accuracy.

    Args:
        y_test: Actual labels (any sized sequence accepted by accuracy_score).
        y_pred_test: Predicted labels, aligned with ``y_test``.
        z: Critical value of the standard normal for the desired confidence
            level (e.g. 1.96 for 95%).

    Returns:
        (lower, upper) tuple: acc -/+ z * sqrt(acc * (1 - acc) / n), where
        ``acc`` is the observed accuracy and ``n`` the number of labels.

    Raises:
        ValueError: if ``y_test`` is empty (the interval is undefined).
    '''
    # Take the sample size from the labels actually passed in, not from a
    # notebook-level variable, so the function works for any dataset.
    n = len(y_test)
    if n == 0:
        raise ValueError("y_test must contain at least one label")

    acc = accuracy_score(y_test, y_pred_test)
    margin = z * math.sqrt(acc * (1 - acc) / n)

    return (acc - margin, acc + margin)

# z = 1.96 is the two-sided 95% critical value of the standard normal.
confidence_interval(y_test, y_pred_test, 1.96)


(0.8727624227720848, 0.9112375772279152)

# Discussion Questions

### Based on the misclassification rate of your model, discuss your confidence in the ability to predict a bot. 

I can say with 95% confidence that the true prediction accuracy of my model lies within the interval (0.8727624227720848, 0.9112375772279152), i.e. roughly 87% to 91%.

In plain terms, the model correctly determines whether an account is a bot or a human roughly 90% of the time. However, it's really not that simple:
the false_negative_rate is 0.8333, which is really quite bad. I go into detail on this in the questions below.

### What are potential ramifications of false positives from the model?

The model has a false positive rate of about 0.34% (0.0034), which is extremely low, meaning it is quite unlikely for the model to falsely classify a real human as a bot. It's not perfect, though, so there is still risk: of the 874 humans in the test set, 3 were falsely flagged as bots. Scale that up to many thousands of people and that's quite a few.

### What are potential ramifications of false negatives from the model?

This is where the model really kinda sucks. You could honestly consider the model a failure based on this alone. Of the 126 bots in our test sample, 105 were falsely marked as negative (human). So if your goal for this model is to find out which accounts are bots -- which, I mean, why else would you make this model -- it performs pretty badly. This is a pretty tough fix, too. If I were to try something, I would try stratifying the train and test sets so there is a much higher representation of bots, but then you run the risk of the model not learning what a non-bot looks like as well, which would increase your false positive rate. Alternatively, more features could be collected to see whether some feature correlates more strongly with is_bot being true.