# Ensemble Model
Creating a logistic regression model to find the best weighting of BERT and GPT inputs.

## Imports and Constants

In [102]:
from joblib import dump
import numpy as np
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import time

In [71]:
GPT_PRED = "gpt_preds.csv"
BERT_PRED = "bert_preds.csv"
TEST_FILE = "full_test.csv"

In [72]:
N_TEST_FOR_ENSEMBLE = 500 # the number of test cases used to create the ensemble predictions

## Mounting Google Drive

In [73]:
GOOGLE_DRIVE_MOUNT_PATH_PREFIX = '/content/drive'
MY_CS152_DATA_FILE_PATH = "drive/MyDrive/Senior/SenSpr/CS152/CS152 Group Project/Milestone 3/Code/Data/" # NOTE: you have to modify this to fit wherever the CS152 Group Project/Milestone 3/Code/Data is in your Google Drive

In [74]:
from google.colab import drive
drive.mount(GOOGLE_DRIVE_MOUNT_PATH_PREFIX)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
cd $MY_CS152_DATA_FILE_PATH

[Errno 2] No such file or directory: 'drive/MyDrive/Senior/SenSpr/CS152/CS152 Group Project/Milestone 3/Code/Data/'
/content/drive/MyDrive/Senior/SenSpr/CS152/CS152 Group Project/Milestone 3/Code/Data


## Reading in data
Note: gpt preds are set to -1 where no prediction could be made, 0.5 where an ambiguous response was given.

In [76]:
full_gpt_pred = pd.read_csv(GPT_PRED, header = None)
full_bert_pred = pd.read_csv(BERT_PRED, header = None)

In [77]:
def rename_single_col(new_name, df):
  column_names = df.columns.tolist()

  df = df.rename(columns={column_names[0]: new_name})
  return df

In [78]:
full_gpt_pred = rename_single_col("gpt", full_gpt_pred)
full_bert_pred = rename_single_col("bert", full_bert_pred)

In [79]:
test_data = pd.read_csv(TEST_FILE)

# only keep the "label" column
test_labels = test_data[["label"]]

In [80]:
print(len(gpt_pred))
print(len(bert_pred))
print(len(test_labels))

500
500
2133


In [81]:
gpt_pred = full_gpt_pred.head(N_TEST_FOR_ENSEMBLE)
bert_pred = full_bert_pred.head(N_TEST_FOR_ENSEMBLE)
train_labels = test_labels.head(N_TEST_FOR_ENSEMBLE)

In [82]:
train_labels.head()

Unnamed: 0,label
0,real
1,real
2,fake
3,fake
4,fake


In [83]:
# turn labels into binary class labels
train_labels["label"] = train_labels["label"].map({'real': 0, 'fake': 1})
test_labels["label"] = test_labels["label"].map({'real': 0, 'fake': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_labels["label"] = train_labels["label"].map({'real': 0, 'fake': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_labels["label"] = test_labels["label"].map({'real': 0, 'fake': 1})


In [84]:
train_labels["label"].head()

0    0
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [85]:
unique_values = train_labels.values.flatten().tolist()
unique_values = list(set(unique_values))

# Print the unique values
print(unique_values)

[0, 1]


In [86]:
# filter out rows for which the gpt_preds are -1
# test_labels = test_labels[gpt_pred != -1]
# bert_pred = bert_pred[gpt_pred != -1]
# gpt_pred = gpt_pred[gpt_pred != -1]

## Train Model

In [87]:
X = pd.concat([gpt_pred, bert_pred], axis=1)

In [88]:
model = LogisticRegression()
model.fit(X, train_labels)

  y = column_or_1d(y, warn=True)


In [89]:
X_test = pd.concat([full_gpt_pred, full_bert_pred], axis=1)

In [90]:
y_pred = model.predict(X_test)

In [95]:
y_scores = model.predict_proba(X_test)[:, 1]

In [96]:
print(len(y_pred))
print(len(y_scores))

2133
2133


In [91]:
# get only the samples not used to train the ensemble
holdout_y_pred = y_pred[N_TEST_FOR_ENSEMBLE:]
holdout_test_labels = test_labels.tail(len(test_labels) - N_TEST_FOR_ENSEMBLE)

In [92]:
accuracy = accuracy_score(test_labels, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9681200187529302


In [93]:
# Get the feature weights (coefficients)
feature_weights = model.coef_

# Print the feature weights
for feature, weight in zip(X.columns, feature_weights[0]):
    print(f"Feature: {feature}, Weight: {weight}")

Feature: gpt, Weight: 2.01009329182047
Feature: bert, Weight: 4.982761534978756


## Save Prediction Outputs

In [99]:
np.save("ensemble_preds", y_pred)
np.save("ensemble_scores", y_scores)

In [100]:
np.savetxt("ensemble_preds.csv", y_pred, delimiter=",")
np.savetxt("ensemble_scores.csv", y_scores, delimiter=",")

## Save Ensemble Model

In [103]:
dump(model, 'ensemble_model.joblib')

['ensemble_model.joblib']