# Data Collation
Taking our raw dataset(s) we want to use, and performing any preprocessing / feature engineering as well as train/test, X/y splits for ease of use in classifier training/evaluation.

## Imports

In [37]:
import numpy as np
import os
import pandas as pd

In [38]:
!pip install pytorch-transformers
!pip install transformers
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
!pip install tweet-preprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [40]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import preprocessor as p

from transformers import XLMModel, BertTokenizer, BertForSequenceClassification, RobertaTokenizerFast, RobertaForSequenceClassification
from transformers import AdamW
import nltk
from nltk.stem import 	WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

## Constants

In [41]:
# Dataset filenames

# constraint dataset (https://github.com/diptamath/covid_fake_news)
# CONSTRAINT_TRAIN_FILE = "Constraint_Train.csv"
# CONSTRAINT_VAL_FILE = "Constraint_Val.csv"
# CONSTRAINT_TEST_FILE = "Constraint_Test.csv"


TRAIN_FILE = "full_train.csv"
TEST_FILE = "full_test.csv"

TRAIN_FRAC_FOR_VAL = 0.2 # this fraction of the training data will become validation data

TEXT_COLUMN_LABEL = "text"

In [42]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(n_gpu)
# torch.cuda.get_device_name(0)

1


## Mounting Google Drive

In [43]:
GOOGLE_DRIVE_MOUNT_PATH_PREFIX = '/content/drive'
MY_CS152_DATA_FILE_PATH = "drive/MyDrive/Senior/SenSpr/CS152/CS152 Group Project/Milestone 3/Code/Data/"  # NOTE: you have to modify this to fit wherever the CS152 Group Project/Milestone 3/Code/Data is in your Google Drive

In [44]:
from google.colab import drive
drive.mount(GOOGLE_DRIVE_MOUNT_PATH_PREFIX)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
cd $MY_CS152_DATA_FILE_PATH

[Errno 2] No such file or directory: 'drive/MyDrive/Senior/SenSpr/CS152/CS152 Group Project/Milestone 3/Code/Data/'
/content/drive/MyDrive/Senior/SenSpr/CS152/CS152 Group Project/Milestone 3/Code/Data


# Loading in the dataset(s)


In [60]:
train_df = pd.read_csv(TRAIN_FILE)
test_df = pd.read_csv(TEST_FILE)

In [61]:
print(len(train_df))
print(len(test_df))

8531
2133


In [62]:
MAX_LEN = 128

# split the training data further into train subset and validation subset
df, val_df = train_test_split(train_df, train_size=(1-TRAIN_FRAC_FOR_VAL))

In [63]:
print(len(df))
print(len(val_df))

6824
1707


### Preprocessing

In [64]:
wordnet_lemmatizer = WordNetLemmatizer()
porter_stemmer  = PorterStemmer()

In [65]:
p.set_options(p.OPT.URL, p.OPT.EMOJI)

def preprocess(row, lemmatizer, stemmer):
    text = row[TEXT_COLUMN_LABEL]
    # text = text.strip('\xa0')
    text = p.clean(text)
    tokenization = nltk.word_tokenize(text)     
    tokenization = [w for w in tokenization if not w in stop_words]
    #   text = ' '.join([porter_stemmer.stem(w) for w in tokenization])
    #   text = ' '.join([lemmatizer.lemmatize(w) for w in tokenization])
    # text = re.sub(r'\([0-9]+\)', '', text).strip()    
    return text

In [66]:
df[TEXT_COLUMN_LABEL] = df.apply(lambda x: preprocess(x, wordnet_lemmatizer, porter_stemmer), 1)
val_df[TEXT_COLUMN_LABEL] = val_df.apply(lambda x: preprocess(x, wordnet_lemmatizer, porter_stemmer), 1)
test_df[TEXT_COLUMN_LABEL] = test_df.apply(lambda x: preprocess(x, wordnet_lemmatizer, porter_stemmer), 1)

In [67]:
def map_label(row):
    return 0 if row['label']=='real' else 1

df['label_encoded'] = df.apply(lambda x: map_label(x), 1)
val_df['label_encoded'] = val_df.apply(lambda x: map_label(x), 1)
# test_df['label_encoded'] = test_df.apply(lambda x: map_label(x), 1)

In [69]:
train_sentences = df[TEXT_COLUMN_LABEL].values
val_sentences = val_df[TEXT_COLUMN_LABEL].values
test_sentences = test_df[TEXT_COLUMN_LABEL].values

train_labels = df.label_encoded.values
val_labels = val_df.label_encoded.values

In [70]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


In [71]:
def Encode_TextWithAttention(sentence,tokenizer,maxlen,padding_type='max_length',attention_mask_flag=True):
    encoded_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=maxlen, truncation=True, padding=padding_type, return_attention_mask=attention_mask_flag)
    return encoded_dict['input_ids'],encoded_dict['attention_mask']

def Encode_TextWithoutAttention(sentence,tokenizer,maxlen,padding_type='max_length',attention_mask_flag=False):
    encoded_dict = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=maxlen, truncation=True, padding=padding_type, return_attention_mask=attention_mask_flag)
    return encoded_dict['input_ids']

def get_TokenizedTextWithAttentionMask(sentenceList, tokenizer):
    token_ids_list,attention_mask_list = [],[]
    for sentence in sentenceList:
        token_ids,attention_mask = Encode_TextWithAttention(sentence,tokenizer,MAX_LEN)
        token_ids_list.append(token_ids)
        attention_mask_list.append(attention_mask)
    return token_ids_list,attention_mask_list

def get_TokenizedText(sentenceList, tokenizer):
    token_ids_list = []
    for sentence in sentenceList:
        token_ids = Encode_TextWithoutAttention(sentence,tokenizer,MAX_LEN)
        token_ids_list.append(token_ids)
    return token_ids_list

In [72]:
train_token_ids,train_attention_masks = torch.tensor(get_TokenizedTextWithAttentionMask(train_sentences,tokenizer))
val_token_ids,val_attention_masks = torch.tensor(get_TokenizedTextWithAttentionMask(val_sentences,tokenizer))
test_token_ids,test_attention_masks = torch.tensor(get_TokenizedTextWithAttentionMask(test_sentences,tokenizer))

train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

### Creating Data Loaders

In [73]:
batch_size = 32

train_data = TensorDataset(train_token_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(val_token_ids, val_attention_masks, val_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

test_data = TensorDataset(test_token_ids, test_attention_masks)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Model

In [74]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).cuda()


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [75]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [76]:
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)




In [77]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training

In [79]:
train_loss_set = []
best_val_accuracy = 0.90
directory_path = ''
epochs = 15

for _ in trange(epochs, desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
  
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]
        train_loss_set.append(loss.item())    
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
          output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
          logits = output[0]

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    Validation_Accuracy = (eval_accuracy/nb_eval_steps)
    if(Validation_Accuracy >= best_val_accuracy):
        torch.save(model.state_dict(), "BERT_base_uncased_best_model.ckpt")
        best_val_accuracy = Validation_Accuracy
        print('Model Saved')

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Train loss: 0.07640239860749343
Validation Accuracy: 0.9722222222222222


Epoch:   7%|▋         | 1/15 [02:22<33:18, 142.76s/it]

Model Saved
Train loss: 0.026815256007585813


Epoch:  13%|█▎        | 2/15 [04:47<31:14, 144.20s/it]

Validation Accuracy: 0.9699074074074074
Train loss: 0.02119943873753345


Epoch:  20%|██        | 3/15 [07:13<28:56, 144.70s/it]

Validation Accuracy: 0.9670138888888888
Train loss: 0.012290876979719865


Epoch:  27%|██▋       | 4/15 [09:38<26:32, 144.78s/it]

Validation Accuracy: 0.9699074074074074
Train loss: 0.0071937723754944335


Epoch:  33%|███▎      | 5/15 [12:03<24:09, 144.98s/it]

Validation Accuracy: 0.96875
Train loss: 0.007950378175107893


Epoch:  40%|████      | 6/15 [14:28<21:45, 145.00s/it]

Validation Accuracy: 0.9658564814814815
Train loss: 0.009105750651925119
Validation Accuracy: 0.9728009259259259


Epoch:  47%|████▋     | 7/15 [16:59<19:34, 146.85s/it]

Model Saved
Train loss: 0.00574045061124554


Epoch:  53%|█████▎    | 8/15 [19:24<17:04, 146.36s/it]

Validation Accuracy: 0.9704861111111112
Train loss: 0.0011982201350318098


Epoch:  60%|██████    | 9/15 [21:49<14:35, 145.90s/it]

Validation Accuracy: 0.9629629629629629
Train loss: 0.003961907899521006


Epoch:  67%|██████▋   | 10/15 [24:14<12:08, 145.69s/it]

Validation Accuracy: 0.9641203703703703
Train loss: 0.0067810308567705785


Epoch:  73%|███████▎  | 11/15 [26:39<09:42, 145.53s/it]

Validation Accuracy: 0.9704861111111112
Train loss: 0.0005078717055945401


Epoch:  80%|████████  | 12/15 [29:04<07:16, 145.37s/it]

Validation Accuracy: 0.9704861111111112
Train loss: 0.00019982386485561425


Epoch:  87%|████████▋ | 13/15 [31:29<04:50, 145.27s/it]

Validation Accuracy: 0.9699074074074074
Train loss: 6.840639343863586e-05


Epoch:  93%|█████████▎| 14/15 [33:54<02:25, 145.20s/it]

Validation Accuracy: 0.9722222222222222
Train loss: 6.128657388837604e-05


Epoch: 100%|██████████| 15/15 [36:19<00:00, 145.33s/it]

Validation Accuracy: 0.9716435185185185





## Generating Test Predictions

In [80]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [102]:
import torch.nn.functional as F
from scipy.special import softmax

preds = []
scores = [] # for the one label

for input_ids, input_mask in test_data:
  # reshape to get size-1 batches
  b_input_ids = torch.reshape(input_ids, (1, -1)).to(device)
  b_input_mask = torch.reshape(input_mask, (1, -1)).to(device)

  with torch.no_grad():
    output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = output[0]
  
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  pred = np.argmax(logits, axis=1).flatten()
  preds.append(pred[0])
  scores.append(softmax(logits.flatten())[1])

In [111]:
print(len(test_data))

2133


In [112]:
print(len(scores))

2133


In [113]:
np_preds = np.array(preds)
np_scores = np.array(scores)

In [114]:
print(len(np_preds))

2133


In [115]:
print(len(np_scores))

2133


In [116]:
np.save("bert_preds", np_preds)
np.save("bert_scores", np_scores)

In [117]:
np.savetxt("bert_preds.csv", np_preds, delimiter=",")
np.savetxt("bert_scores.csv", np_scores, delimiter=",")