# Generate Sample Predictions
Load the open-ai, gpt, and ensemble models to generate predictions over manual examples.

## Constants and Imports

In [4]:
!pip install openai
!pip install pytorch-transformers
!pip install transformers
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
!pip install tweet-preprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.7-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp (from openai)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Collecting multidict<7.0,>=4.5 (from aiohttp->openai)
  Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting async-timeout<5.0,>=4.0.0a3 (from aiohttp->openai)
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting yarl<2.0,>=1.0 (from aiohttp->openai)
  Downloadin

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [5]:
from joblib import load
import numpy as np
import openai
import os
import pandas as pd
from scipy.special import softmax
from sklearn.linear_model import LogisticRegression
import time
from typing import List

In [6]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import preprocessor as p

from transformers import XLMModel, BertTokenizer, BertForSequenceClassification, RobertaTokenizerFast, RobertaForSequenceClassification
from transformers import AdamW
import nltk
from nltk.stem import 	WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

In [7]:
BERT_CHECKPOINT_FILE = "BERT_base_uncased_best_model.ckpt"
ENSEMBLE_MODEL_FILE = "'ensemble_model.joblib'"

TRAIN_FILE = "full_train.csv" # contains examples that Chat-GPT uses to learn how to predict

MAX_LEN = 128 # used for BERT model

MAXIMUM_NUM_CHAT_GPT_MESSAGES = 2048 # maximum number of messages
NUM_REQUIRED_CHAT_GPT_MESSAGES = 2 # number of structuring messages we must include to Chat-GPT


MAX_TRAIN_ROWS = (MAXIMUM_NUM_CHAT_GPT_MESSAGES - NUM_REQUIRED_CHAT_GPT_MESSAGES) // 150


ZERO_LABEL_KEYWORD = "real"
ONE_LABEL_KEYWORD = "fake"

NO_GPT_PRED_NUM_LABEL = -1

In [8]:
# TODO: import these from token.json when we move this notebook to our github code

openai.organization = "REDACTED"
openai.api_key = "REDACTED"

## Mounting Google Drive

In [9]:
GOOGLE_DRIVE_MOUNT_PATH_PREFIX = '/content/drive'
MY_CS152_DATA_FILE_PATH = "drive/MyDrive/Senior/SenSpr/CS152/CS152 Group Project/Milestone 3/Code/Data/"  # NOTE: you have to modify this to fit wherever the CS152 Group Project/Milestone 3/Code/Data is in your Google Drive

In [10]:
from google.colab import drive
drive.mount(GOOGLE_DRIVE_MOUNT_PATH_PREFIX)

Mounted at /content/drive


In [11]:
cd $MY_CS152_DATA_FILE_PATH

/content/drive/MyDrive/Senior/SenSpr/CS152/CS152 Group Project/Milestone 3/Code/Data


## Loading in Each Model

### BERT

In [12]:
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [14]:
bert_model.load_state_dict(torch.load(BERT_CHECKPOINT_FILE, map_location=torch.device('cpu')))
bert_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### Chat-GPT

### Ensemble

In [29]:
ensemble_model = load('ensemble_model.joblib')

## Data Preprocessing and Model Application Helper Functions

### BERT

In [30]:
wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer  = PorterStemmer()

In [31]:
p.set_options(p.OPT.URL, p.OPT.EMOJI)

def text_preprocess(text, lemmatizer, stemmer):
    # text = text.strip('\xa0')
    text = p.clean(text)
    tokenization = nltk.word_tokenize(text)     
    tokenization = [w for w in tokenization if not w in stop_words]
    #   text = ' '.join([porter_stemmer.stem(w) for w in tokenization])
    #   text = ' '.join([lemmatizer.lemmatize(w) for w in tokenization])
    # text = re.sub(r'\([0-9]+\)', '', text).strip()    
    return text

In [32]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [33]:
def Encode_TextWithAttention(sentence,tokenizer,maxlen,padding_type='max_length',attention_mask_flag=True):
    encoded_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=maxlen, truncation=True, padding=padding_type, return_attention_mask=attention_mask_flag)
    return encoded_dict['input_ids'],encoded_dict['attention_mask']

def Encode_TextWithoutAttention(sentence,tokenizer,maxlen,padding_type='max_length',attention_mask_flag=False):
    encoded_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=maxlen, truncation=True, padding=padding_type, return_attention_mask=attention_mask_flag)
    return encoded_dict['input_ids']

def get_TokenizedTextWithAttentionMask(sentenceList, tokenizer):
    token_ids_list,attention_mask_list = [],[]
    for sentence in sentenceList:
        token_ids,attention_mask = Encode_TextWithAttention(sentence,tokenizer,MAX_LEN)
        token_ids_list.append(token_ids)
        attention_mask_list.append(attention_mask)
    return token_ids_list,attention_mask_list

def get_TokenizedText(sentenceList, tokenizer):
    token_ids_list = []
    for sentence in sentenceList:
        token_ids = Encode_TextWithoutAttention(sentence,tokenizer,MAX_LEN)
        token_ids_list.append(token_ids)
    return token_ids_list

In [34]:
def bert_preprocess(text_inputs: List, tokenizer = tokenizer, wordnet_lemmatizer = wordnet_lemmatizer, porter_stemmer = porter_stemmer):
  preprocessed_texts = []
  for text in text_inputs:
    preprocessed_texts.append(text_preprocess(text, wordnet_lemmatizer, porter_stemmer))
  
  token_ids, attention_masks = torch.tensor(get_TokenizedTextWithAttentionMask(preprocessed_texts, tokenizer))

  return token_ids, attention_masks

In [55]:
def generate_bert_predictions(text_inputs: List, bert_model = bert_model):
  # might need to shape into batches
  token_ids, attention_masks = bert_preprocess(text_inputs)

  output = bert_model(token_ids, token_type_ids=None, attention_mask=attention_masks)
  logits = output[0]
  
  logits = logits.detach().cpu().numpy()
  pred = np.argmax(logits, axis=1).flatten()

  # check the dimensions to make sure we're doing the right thing
  print(logits)
  score = torch.sigmoid(torch.tensor(logits)).numpy()[:,1]
  print(score)
  return pred, score

### Chat-GPT

In [36]:
train_df = pd.read_csv(TRAIN_FILE)

In [37]:
gpt_messages = [{"role": "system", "content": "You are a content moderation system. Classify input as either 'real' or 'fake'. Do not use more than one word."}]

In [38]:
for index, row in train_df.head(MAX_TRAIN_ROWS).iterrows():
  gpt_messages.append({"role": "user", "content": f"{row['text']}"})
  gpt_messages.append({"role": "assistant", "content": f"{row['label']}"})

In [39]:
def clean_pred(pred):
  if pred == None:
    return pred
  cleaned = pred.lower()
  cleaned = pred.strip()
  cleaned = ''.join([i for i in cleaned if i.isalpha()])
  return cleaned

def assign_label(pred):
  if pred == ZERO_LABEL_KEYWORD:
    return 0
  elif pred == ONE_LABEL_KEYWORD:
    return 1
  elif pred != None:
    return 0.5 
  else:  # prediciton was None (gpt response was not correctly produced)
    return NO_GPT_PRED_NUM_LABEL

In [40]:
def generate_gpt_predictions(text_inputs, prefix_messages = gpt_messages):
  preds = []
  for input in text_inputs:
    messages = prefix_messages[:]
    messages.append({"role": "user", "content": f"{row['text']}"})  

    try:
      response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=messages
      )
      
      preds.append(response['choices'][0]['message']['content'])

    except:
      preds.append(None)

  num_preds = [assign_label(clean_pred(pred)) for pred in preds]
  return num_preds

## Ensemble

In [41]:
def generate_ensemble_preds_and_scores(text_inputs, ensemble_model = ensemble_model):
  bert_preds, bert_scores = generate_bert_predictions(text_inputs)
  gpt_preds = generate_gpt_predictions(text_inputs)

  ensemble_preds = []
  ensemble_scores = []
  for idx in range(len(gpt_preds)):
    if gpt_preds[idx] == NO_GPT_PRED_NUM_LABEL:
      ensemble_preds.append(bert_preds[idx])
      ensemble_scores.append(bert_scores[idx])
    else:
      ensemble_input = np.array([gpt_preds[idx], bert_preds[idx]])
      ensemble_preds.append(ensemble_model.predict(X_test))
      ensemble_preds.append(ensemble_model.predict_proba(X_test)[:, 1])

  return ensemble_preds, ensemble_scores

## Feed in samples

In [65]:
text_samples_disinfo = [
  "You can cure Covid by injecting bleach, Ivermectin, or Hydroxychloroquine into your bloodstream",
  "The CDC states that we don’t need to mask after you’ve had COVID-19, as it is no longer contagious",
  "Some racial groups might be immune to COVID-19",
  "Joe Biden is lying to us about the severity of the vaccine – he’s just wants more control",
  "The government is lying to us – COVID-19 does not exist",
  "Bill Gates is using the vaccine as a vehicle for inserting tracking chips into our bodies",
  "COVID-19 was definitely made as a bioweapon – we should blame China"
]

In [43]:
non_examples = [
  "CDC data show disproportionate COVID-19 impact in American Indian/Alaska Native populations",
  "COVID-19 outbreak hits large Bay Area hospital, prompting new mask rules",
  "The CDC sucks",
  "Joe Biden should do more to address the pandemic",
  "It was life as usual for Brittany, who lives in Laurel Hollow, NY. She was taking her pups on a long walk around the block when she started to get an upset stomach. 'I just thought it was anxiety and nerves, since I had just heard the news about coronavirus in New York, so I paid it no mind.'",
]

In [66]:
ex_preds, ex_scores = generate_ensemble_preds_and_scores(text_samples_disinfo)

[[-4.809136   4.1480427]
 [-4.527791   3.9271705]
 [-3.565544   2.8630652]
 [-4.528927   4.000493 ]
 [-5.0487394  4.434396 ]
 [-4.909004   4.2238917]
 [-5.1625233  4.4584665]]
[0.9844503  0.98068124 0.9459901  0.9820225  0.9882768  0.9855698
 0.98855245]


In [57]:
print(ex_preds)

[1, 1, 1, 1, 1, 1, 1]


In [61]:
non_ex_preds, non_ex_scores = generate_ensemble_preds_and_scores(non_examples)

[[-1.1073209  0.9049733]
 [-5.057027   4.410523 ]
 [-4.1829567  3.2456396]
 [-4.5703564  3.9447515]
 [-3.4746425  3.440519 ]]
[0.71197045 0.987997   0.9625161  0.9810115  0.9689472 ]


In [62]:
print(non_ex_preds)

[1, 1, 1, 1, 1]
