In [1]:
# packages
import pandas as pd
from mod02_build_bot_predictor import train_model

### Define a function to extract predictions from the model

In [2]:
def predict_bot(df, model=None, threshold=None):
    """
    Produce one prediction per row of ``df`` (bot = 1, human = 0).

    Parameters:
      df        : feature table; its index is reused for the result.
      model     : object exposing ``predict`` and ``predict_proba``. When None,
                  a model is obtained from ``train_model()`` called with no
                  arguments.
                  NOTE(review): elsewhere in this notebook train_model is
                  called with (X, y) -- confirm the no-argument call works.
      threshold : optional probability cutoff. When given, column 1 of
                  ``predict_proba`` (treated as P(bot)) is compared against it
                  and rows with P(bot) >= threshold are labelled 1, others 0.
                  When None, ``model.predict`` decides the labels.

    Returns:
      pd.Series of predictions aligned to ``df.index``.
    """
    clf = train_model() if model is None else model

    if threshold is None:
        # No custom cutoff requested: defer to the model's own decision rule.
        labels = clf.predict(df)
    else:
        bot_probability = clf.predict_proba(df)[:, 1]  # P(bot)
        labels = (bot_probability >= threshold).astype(int)

    return pd.Series(labels, index=df.index)

### Define a function to evaluate model error

In [3]:
def confusion_matrix_and_metrics(y_true, y_pred):
    """
    Computes confusion matrix and common error rates for binary classification.

    Assumes labels:
      0 = negative class
      1 = positive class

    Parameters:
      y_true : iterable of actual labels (0 or 1)
      y_pred : iterable of predicted labels (0 or 1), same length as y_true

    Returns:
      dict with:
        tp, tn, fp, fn            (counts)
        misclassification_rate    (fp + fn) / total
        false_positive_rate       fp / (fp + tn)
        false_negative_rate       fn / (fn + tp)
      Each rate is 0.0 when its denominator is zero (e.g. empty input).

    Raises:
      ValueError if any label is not 0 or 1, or if y_true and y_pred have
      different lengths.
    """
    from itertools import zip_longest

    # A plain zip() would silently stop at the shorter input and report
    # metrics for a truncated sample; the sentinel lets us detect that.
    missing = object()

    tn = fp = fn = tp = 0

    for yt, yp in zip_longest(y_true, y_pred, fillvalue=missing):
        if yt is missing or yp is missing:
            raise ValueError("y_true and y_pred must have the same length")

        if yt == 0 and yp == 0:
            tn += 1
        elif yt == 0 and yp == 1:
            fp += 1
        elif yt == 1 and yp == 0:
            fn += 1
        elif yt == 1 and yp == 1:
            tp += 1
        else:
            raise ValueError("Labels must be 0 or 1")

    total = tn + fp + fn + tp
    actual_negatives = fp + tn
    actual_positives = fn + tp

    # Guard every division so empty or single-class input yields 0.0
    # instead of ZeroDivisionError.
    misclassification_rate = (fp + fn) / total if total > 0 else 0.0
    false_positive_rate = fp / actual_negatives if actual_negatives > 0 else 0.0
    false_negative_rate = fn / actual_positives if actual_positives > 0 else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "misclassification_rate": misclassification_rate,
        "false_positive_rate": false_positive_rate,
        "false_negative_rate": false_negative_rate,
    }


### Load the data

In [4]:
# Locations of the two CSV splits, relative to the notebook's working directory.
TRAIN_PATH = "mod02_data/train.csv"
TEST_PATH = "mod02_data/test.csv"

# Read each split into its own DataFrame.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

### Format the data by independent vs. dependent variables

In [5]:
# "is_bot" is the dependent variable; every other column is kept as a feature.
y_train, y_test = train["is_bot"], test["is_bot"]

X_train = train.drop(columns=["is_bot"])
X_test = test.drop(columns=["is_bot"])

### Build the model on training data

In [6]:
# Build the model from the training split only; the test split is not passed in.
# train_model is imported from mod02_build_bot_predictor. The returned object
# is later used via .predict() and .predict_proba().
model = train_model(X_train, y_train)

In [7]:
# Finer threshold search: scan cutoffs 0.50, 0.51, ..., 0.75 and keep the first
# one that gives the lowest misclassification rate on the test set.
# NOTE(review): the cutoff is selected on the same test data that is later used
# for evaluation, so the reported test error is optimistic. Consider tuning on
# a held-out validation split instead.
y_prob = model.predict_proba(X_test)[:, 1]

candidate_thresholds = [pct / 100 for pct in range(50, 76)]

best_t, best_rate = 0.5, 1.0
for t in candidate_thresholds:
    rate = confusion_matrix_and_metrics(
        y_test, (y_prob >= t).astype(int)
    )["misclassification_rate"]
    if rate >= best_rate:
        continue  # only a strictly lower rate replaces the current best
    best_t, best_rate = t, rate

print(f"Best threshold = {best_t} -> misclassification_rate = {best_rate:.4f}")
print(f"Use: predict_bot(X_test, model, threshold={best_t})")

Best threshold = 0.56 -> misclassification_rate = 0.1080
Use: predict_bot(X_test, model, threshold=0.56)


### Get the model predictions on training and test data

In [8]:
# Training predictions use the model's default decision rule (no custom cutoff).
y_pred_train = predict_bot(X_train, model)

# Test predictions use the cutoff found by the threshold search rather than a
# hand-typed number, so this cell cannot drift out of sync with the search result.
y_pred_test = predict_bot(X_test, model, threshold=best_t)

### Check results on the training set (data used to build the model)

In [9]:
# Confusion counts and error rates on the training split (the data the model was built on).
confusion_matrix_and_metrics(y_train, y_pred_train)

{'tp': 124,
 'tn': 2608,
 'fp': 29,
 'fn': 239,
 'misclassification_rate': 0.08933333333333333,
 'false_positive_rate': 0.010997345468335229,
 'false_negative_rate': 0.6584022038567493}

### Check results on the test set (new data not yet seen by the model)

In [10]:
# Confusion counts and error rates on the test split. y_pred_test was produced
# with a custom probability cutoff, whereas the training predictions used the
# model's default rule, so the two sets of metrics are not strictly like-for-like.
confusion_matrix_and_metrics(y_test, y_pred_test)

{'tp': 25,
 'tn': 867,
 'fp': 7,
 'fn': 101,
 'misclassification_rate': 0.108,
 'false_positive_rate': 0.008009153318077803,
 'false_negative_rate': 0.8015873015873016}

# Discussion Questions

### Based on the misclassification rate of your model, discuss your confidence in the ability to predict a bot. 

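The test misclassification rate is 10.8%, which looks good in isolation but is misleading because the classes are imbalanced: only 126 of the 1,000 test accounts are bots, so labelling every account as human would already give a 12.6% error rate. The model catches just 25 of those 126 bots (a false negative rate of about 80%), although 25 of its 32 "bot" predictions are correct. Confidence is therefore fairly high when the model does flag an account as a bot, but low that it will find most bots; the misclassification rate alone overstates how well the model predicts bots.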

### What are potential ramifications of false positives from the model?

**False positives** mean real people are incorrectly classified as bots. Ramifications can include: being locked out of an account or forced to verify; losing visibility (e.g. downranking or hiding their posts); unfair content or account restrictions; and frustration and loss of trust in the platform. In the worst case, legitimate activists, journalists, or high-profile users could be silenced. Our model's low false positive rate (~0.8%) keeps this risk small but not zero, so human review or appeals are still important for any action taken on "bot" predictions.

### What are potential ramifications of false negatives from the model?

**False negatives** mean bots are incorrectly classified as human and left undetected. Ramifications include: bots continuing to spread spam, disinformation, or scams; inflated engagement and follower counts; manipulation of trends or conversations; and a worse experience for real users. If the goal is to reduce inauthentic activity, a high false negative rate (ours is ~80%) means most bots still slip through. To meaningfully reduce bot impact, the model would need to be combined with other signals, a lower decision threshold (raising the cutoff above 0.5, as done here, reduces false positives at the cost of more false negatives), or manual review.