In [None]:
!pip install transformers
!pip install datasets
#!pip install src

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 30.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 70.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 66.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyY

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/Shareddrives/NLP/data_collection.py

In [None]:
import sys

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, classification_report

from tqdm import tqdm

sys.path.insert(0, '..')
#from data_collection import get_data

pd.set_option("display.max_colwidth", None)

In [None]:
import pandas as pd
import datasets
from collections import Counter


def get_data(dataset_name="ucberkeley-dlab/measuring-hate-speech", columns=["text", "hatespeech"]):
    """
    Fetch a Hugging Face dataset and collapse it to one label per text.

    The "train" split of the "binary" configuration is loaded, narrowed to
    ``columns``, the second column is converted to integers with the value 2
    folded into 1, and rows are grouped by the first column keeping the most
    frequent value of every remaining column.

    Parameters
    ----------
    dataset_name : str, optional
        Name of the dataset to be downloaded. For this project, the default
        value is "ucberkeley-dlab/measuring-hate-speech".
    columns : sequence of str, optional
        Columns to extract. ``columns[0]`` is the grouping (text) column and
        ``columns[1]`` is the label column. For this project, the default
        value is ["text", "hatespeech"].

    Returns
    -------
    data : pandas.DataFrame
        One row per distinct value of ``columns[0]``, with a fresh
        0..n-1 index.

    """
    # Private copy: never alias the shared default list, and accept tuples too
    # (a tuple used directly as a DataFrame key would not select columns).
    columns = list(columns)
    text_col, label_col = columns[0], columns[1]

    print("Fetching data...")
    dataset = datasets.load_dataset(dataset_name, "binary")
    # .copy() detaches the column subset from the full frame so the
    # assignments below modify an independent object instead of a slice.
    data = dataset["train"].to_pandas()[columns].copy()

    print("Processing...")
    data[label_col] = pd.to_numeric(data[label_col], downcast="integer")

    # Fold label value 2 into 1, leaving a 0/1 target.
    data.loc[data[label_col] == 2, label_col] = 1

    # Several rows can share the same text; keep the majority value per text.
    data = data.groupby(text_col).agg(
        lambda x: Counter(x).most_common(1)[0][0]
    ).reset_index()

    print("Done!")
    return data


def clean_text(text):
    """
    Lower-case ``text`` and mask a fixed list of offensive substrings.

    Matching is plain substring replacement on the lower-cased text, applied
    in the order listed below, so the patterns are also masked when they
    occur inside longer words (e.g. "class" becomes "cla**").

    Parameters
    ----------
    text : str
        Text to censor.

    Returns
    -------
    str
        The lower-cased text with every listed substring replaced by its
        starred form.
    """
    censor_pairs = (
        ("nigga", "n***a"),
        ("fuck", "f**k"),
        ("bitch", "b***h"),
        ("dick", "d**k"),
        ("cock", "c**k"),
        ("ass", "a**"),
        ("pussy", "p***y"),
        ("sex", "s**"),
        ("nigger", "n****r"),
        ("faggot", "f****t"),
        ("slut", "s**t"),
        ("shit", "s**t"),
        ("retard", "r****d"),
        ("killed", "k****d"),
        ("suck", "s**k"),
        ("hoe", "h**"),
        ("ugly", "u**y"),
        ("nazi", "n**i"),
        ("cunt", "c**t"),
        ("cum", "c**"),
    )

    masked = text.lower()
    for needle, starred in censor_pairs:
        if needle in masked:
            masked = masked.replace(needle, starred)
    return masked

## Data Preprocessing

In [None]:
dataset = get_data()

In [None]:
print(len(dataset))
print(dataset["text"])
print(dataset["hatespeech"])

In [None]:
type(dataset)

In [None]:
dataset["hatespeech"].value_counts()

In [None]:
# Always bind a torch.device object (previously the CPU fallback was the bare
# string "cpu", so `device` had a different type depending on the hardware).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [None]:
MODEL_NAME = "bert-base-uncased"  
BATCH_SIZE = 16
MAX_LEN = 128
EPOCHS = 10
LEARNING_RATE = 1e-05
TOKENIZER = BertTokenizer.from_pretrained(MODEL_NAME, truncation=True, do_lower_case=True)

In [None]:
class Dataset_Preprocess(Dataset):
    """
    Torch ``Dataset`` that tokenizes texts on access and serves one-hot labels.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide a ``text`` column and a ``hatespeech`` column.
    tokenizer : object
        Tokenizer exposing ``encode_plus`` (a ``BertTokenizer`` in this
        notebook).
    max_len : int
        Every encoded sequence is padded or truncated to exactly this length.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        labels = np.asarray(self.data["hatespeech"]).reshape(-1, 1)
        # Densify the encoder's default sparse output with .toarray() instead of
        # passing sparse=False, whose keyword was renamed in newer scikit-learn.
        # NOTE(review): categories are fitted per instance, so a split that
        # lacks one of the classes would get a narrower target matrix.
        self.targets = OneHotEncoder().fit_transform(labels).toarray()
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded sample at position ``index`` as a dict of tensors."""
        # Positional access: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the frame's index has been reset.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated pad_to_max_length flag
            # and for the truncation that was previously only implicit.
            padding="max_length",
            truncation=True,
            return_token_type_ids=True
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
# Dataloader creation for dataset

train_size = 0.8
val_size = 0.1
SPLIT_SEED = 220  # fixed seed so the three splits are reproducible across runs

# Draw the training rows, then split what is left into validation and test.
# Indices are reset only AFTER all drops: the drops are label-based and must
# see the original row labels, otherwise the wrong rows are removed and the
# validation and test sets overlap.
train_data = dataset.sample(frac=train_size, random_state=SPLIT_SEED)
holdout_data = dataset.drop(train_data.index)

val_data = holdout_data.sample(frac=val_size / (1 - train_size), random_state=SPLIT_SEED)
test_data = holdout_data.drop(val_data.index)

train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# The three splits must partition the full dataset exactly.
assert len(train_data) + len(val_data) + len(test_data) == len(dataset)

print(f"Full Dataset Size: {dataset.shape}")
print(f"Train Dataset Size: {train_data.shape}")
print(f"Validation Dataset Size: {val_data.shape}")
print(f"Test Dataset Size: {test_data.shape}")

training_set = Dataset_Preprocess(train_data, TOKENIZER, MAX_LEN)
validation_set = Dataset_Preprocess(val_data, TOKENIZER, MAX_LEN)
testing_set = Dataset_Preprocess(test_data, TOKENIZER, MAX_LEN)

In [None]:
train_data.dtypes
train_data.head()

In [None]:
train_params = {
    "batch_size": BATCH_SIZE,
    "shuffle": True,
    "num_workers": 0
}

val_params = {
    "batch_size": 1,
    "shuffle": False,
    "num_workers": 0
}

test_params = {
    "batch_size": 1,
    "shuffle": False,
    "num_workers": 0
}

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)
testing_loader = DataLoader(testing_set, **test_params)

## BERT Base Model

In [None]:
import gc
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoModel
from transformers import BertModel
import pandas as pd


In [None]:
class BERT_Base(nn.Module):
    """
    BERT encoder followed by a small classification head.

    The head takes the encoder's hidden state at sequence position 0 and
    applies Linear -> tanh -> Dropout(0.1) -> Linear, returning raw logits.

    Parameters
    ----------
    n_classes : int
        Number of output logits.
    model_name : str, optional
        Checkpoint passed to ``BertModel.from_pretrained``. Defaults to
        'bert-base-uncased'.
    """

    def __init__(self, n_classes, model_name='bert-base-uncased'):
        super().__init__()
        self.l1 = BertModel.from_pretrained(model_name)
        # Take the width from the loaded checkpoint rather than hard-coding
        # 768, so other BERT sizes work as well.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """
        Compute class logits for a batch.

        Parameters
        ----------
        input_ids, attention_mask : torch.Tensor
            Token ids and padding mask forwarded to the encoder.
        token_type_ids : torch.Tensor, optional
            Segment ids forwarded to the encoder (previously accepted but
            silently dropped).

        Returns
        -------
        torch.Tensor
            Unnormalised logits with ``n_classes`` values per sample.
        """
        encoder_out = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        hidden_state = encoder_out[0]
        # Representation of the first token of every sequence.
        first_token = hidden_state[:, 0]
        pooled = torch.tanh(self.pre_classifier(first_token))
        pooled = self.dropout(pooled)
        return self.classifier(pooled)

In [None]:
# class BERT_CNN(nn.Module):

#     def __init__(self):
#         super(BERT_CNN, self).__init__()
#         self.bert = BertModel.from_pretrained('bert-base-uncased')
#         self.conv = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=(3, 768), padding='valid')
#         self.relu = nn.ReLU()
#         self.pool = nn.MaxPool2d(kernel_size=(3,1), stride=1)
#         self.dropout = nn.Dropout(0.1)
#         self.fc = nn.Linear(416, 3)
#         self.flat = nn.Flatten()
#         self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, sent_id, mask, token_type_ids):
#         _, all_layers = self.bert(input_ids = sent_id, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
#         # all_layers  = [13, 32, 64, 768]
#         print('all layers', all_layers)
#         print('all layers', all_layers.shape)
#         x = torch.transpose(torch.cat(tuple([t.unsqueeze(0) for t in all_layers]), 0), 0, 1)
#         del all_layers
#         gc.collect()
#         torch.cuda.empty_cache()
#         print('Before dropout',x.shape)
#         x = self.dropout(x)
#         print('After dropout', x.shape)
#         x = self.conv(x)
#         print('After Convolutional layer', x.shape)
#         x = self.pool(self.dropout(self.relu(x)))
#         x = self.fc(self.dropout(self.flat(self.dropout(x))))
#         return self.softmax(x)

In [None]:
num_classes = dataset["hatespeech"].nunique()
model = BERT_Base(n_classes = num_classes)
model.to(device)
#model.summary()

## Model Training

In [None]:
def loss_fn(outputs, targets):
    """
    Mean binary cross-entropy between raw logits and their targets.

    Parameters
    ----------
    outputs : torch.Tensor
        Unnormalised logits.
    targets : torch.Tensor
        Float targets with the same shape as ``outputs``.

    Returns
    -------
    torch.Tensor
        Scalar loss averaged over all elements.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [None]:
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    """
    Run one optimisation pass over ``training_loader``.

    Uses the notebook-level ``model``, ``training_loader``, ``optimizer``,
    ``loss_fn`` and ``device``. The batch loss is printed every 1000 steps.

    Parameters
    ----------
    epoch : int
        Epoch number, used only in the progress message.

    Returns
    -------
    float
        Mean batch loss over the epoch (``nan`` if the loader is empty).
    """
    model.train()
    loss_sum = 0.0
    n_batches = 0
    # Wrap the loader itself (not the enumerate) so tqdm can read its length
    # and show a real progress bar with a total.
    for step, batch in enumerate(tqdm(training_loader)):
        ids = batch["ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 1000 == 0:
            print(f"Epoch: {epoch}, Loss: {loss.item()}")
        loss.backward()
        optimizer.step()

        # Accumulate a detached tensor so no host/device sync is forced per step.
        loss_sum = loss_sum + loss.detach()
        n_batches += 1

    return float(loss_sum) / n_batches if n_batches else float("nan")

In [None]:
for epoch in range(EPOCHS):
    train(epoch)

## Model Evaluation

In [None]:
def validation(model, loader):
    """
    Run ``model`` over ``loader`` without gradients and collect predictions.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(ids, mask, token_type_ids)``; switched to eval mode.
    loader : iterable
        Yields dict batches with "ids", "mask", "token_type_ids" and
        "targets" tensors.

    Returns
    -------
    fin_outputs : list of list of float
        Sigmoid of the model logits, one inner list per sample.
    fin_targets : list of list of float
        The corresponding target rows.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        # Iterate the loader directly so tqdm can display a total.
        for batch in tqdm(loader):
            ids = batch["ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            # Nothing tracks gradients inside no_grad, so no detach is needed.
            fin_targets.extend(targets.cpu().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().tolist())
    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation(model, validation_loader)

final_outputs = np.argmax(outputs, axis=1)
targets = np.argmax(targets, axis=1)

In [None]:
print(f"Got {sum(final_outputs == targets)} / {len(final_outputs)} correct")

In [None]:
micro_f1 = f1_score(targets, final_outputs, average="micro")
macro_f1 = f1_score(targets, final_outputs, average="macro")
weighted_f1 = f1_score(targets, final_outputs, average="weighted")

print(f"Micro F1 score:\t\t{round(micro_f1, 3)}")
print(f"Macro F1 score:\t\t{round(macro_f1, 3)}")
print(f"Weighted F1 score:\t{round(weighted_f1, 3)}")

In [None]:
print(classification_report(targets, final_outputs))

In [None]:
# Destination files for the trained model and the tokenizer vocabulary.
# Both paths are relative, so they resolve against the current working directory.
# NOTE(review): the file names say "bert_cnn", while the cells visible in this
# notebook bind `model` to a BERT_Base instance — confirm the intended naming.
output_model_file = "../content/drive/Shareddrives/NLP/pytorch_bert_cnn.bin"

output_vocab_file = "../content/drive/Shareddrives/NLP/vocab_bert_cnn.bin"

# torch.save(model, ...) serialises the whole module object rather than only its
# weights. NOTE(review): loading such a file needs the model's class definition
# to be importable; saving model.state_dict() is the usual portable alternative.
torch.save(model, output_model_file)
TOKENIZER.save_vocabulary(output_vocab_file)

print("Model Saved")

### Application using YouTube videos

In [None]:
import pandas as pd
import numpy as np

In [None]:
import json
from csv import writer
from apiclient.discovery import build
import pickle
import urllib.request
import urllib

In [None]:
import os

# Credentials must not live in the notebook: the YouTube Data API key is read
# from the YOUTUBE_API_KEY environment variable instead of being hard-coded.
# NOTE: any key that was previously committed here should be revoked and rotated.
key = os.environ.get("YOUTUBE_API_KEY", "")
if not key:
    print("YOUTUBE_API_KEY is not set; YouTube Data API calls are expected to fail.")
#videoId = 'eYndEoy5Vr8' # This is embedded in the URL after "v="

videoId = 'ubKmjE3lEHI'
# channelId = 'UC2UXDak6o7rBm23k3Vv5dww' 

In [None]:
def build_service():
    """
    Create a client for version "v3" of the "youtube" API.

    The notebook-level ``key`` is passed as the developer key.

    Returns
    -------
    The service object returned by ``build``.
    """
    service_name, api_version = "youtube", "v3"
    return build(service_name, api_version, developerKey=key)

In [None]:
def get_comments(part='snippet',
                 maxResults=100,
                 textFormat='plainText',
                 order='time',
                 videoId=videoId,
                 csv_filename="google2021search"):
    """
    Download every top-level comment thread of a video.

    Pages through ``commentThreads().list`` until no ``nextPageToken`` is
    returned. Each comment is appended as a row to ``<csv_filename>.csv`` and
    also collected in memory.

    Parameters
    ----------
    part, maxResults, textFormat, order :
        Forwarded unchanged to ``commentThreads().list``.
    videoId : str
        Video to query. The default is the value the notebook-level
        ``videoId`` had when this function was defined.
    csv_filename : str
        Output path without the ".csv" suffix; rows are appended.

    Returns
    -------
    dict
        Keys 'Comments', 'Comment ID', 'Reply Count' and 'Like Count', each
        mapping to a list with one entry per comment.
    """
    comments, commentsId, repliesCount, likesCount = [], [], [], []

    service = build_service()

    # Arguments shared by the first request and every follow-up page request.
    request_kwargs = dict(
        part=part,
        maxResults=maxResults,
        textFormat=textFormat,
        order=order,
        videoId=videoId,
    )
    response = service.commentThreads().list(**request_kwargs).execute()

    # Open the CSV once instead of once per comment. newline='' is what the
    # csv module expects, and UTF-8 keeps emoji/non-ASCII comments writable
    # regardless of the platform's default encoding.
    with open(f'{csv_filename}.csv', 'a', newline='', encoding='utf-8') as f:
        csv_writer = writer(f)

        while response:  # runs until the last page (or until the API call raises)
            for item in response['items']:
                top_level = item['snippet']['topLevelComment']
                comment = top_level['snippet']['textDisplay']
                comment_id = top_level['id']
                reply_count = item['snippet']['totalReplyCount']
                like_count = top_level['snippet']['likeCount']

                comments.append(comment)
                commentsId.append(comment_id)
                repliesCount.append(reply_count)
                likesCount.append(like_count)

                csv_writer.writerow([comment, comment_id, reply_count, like_count])

            # No token means this was the last page.
            if 'nextPageToken' not in response:
                break
            response = service.commentThreads().list(
                pageToken=response['nextPageToken'],
                **request_kwargs
            ).execute()

    return {
        'Comments': comments,
        'Comment ID': commentsId,
        'Reply Count': repliesCount,
        'Like Count': likesCount
    }


In [None]:
if __name__ == '__main__':
    google2021 = get_comments()
    #df = pd.DataFrame(google2021)
   # print(df.shape)
    #print(df.head())
    #df['date'] = pd.to_datetime(df['date'], errors='coerce')
    #df['just_date'] = df['date'].dt.date
   # df.to_csv('./googlecomment.csv')


In [None]:
with open("googlereview.json", "w") as fh:
    json.dump(google2021, fh)

In [None]:
comments = pd.read_json('googlereview.json')
comments.head()

In [None]:
len(comments)

In [None]:
pd.set_option('display.max_colwidth', None)
comments.columns = comments.columns.str.replace(' ','_')

In [None]:
comments['text'] = comments['Comments']

In [None]:
class Dataset_Preprocess_youtube(Dataset):
    """
    Torch ``Dataset`` that tokenizes unlabeled texts on access.

    Unlike the training dataset, the samples carry no "targets" entry.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide a ``text`` column.
    tokenizer : object
        Tokenizer exposing ``encode_plus`` (a ``BertTokenizer`` in this
        notebook).
    max_len : int
        Every encoded sequence is padded or truncated to exactly this length.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded sample at position ``index`` as a dict of tensors."""
        # Positional access: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the frame's index has been reset.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated pad_to_max_length flag
            # and for the truncation that was previously only implicit.
            padding="max_length",
            truncation=True,
            return_token_type_ids=True
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
        }

In [None]:
test = Dataset_Preprocess_youtube(comments, TOKENIZER, MAX_LEN)

In [None]:
def prediction(model, loader):
    """
    Run ``model`` over an unlabeled ``loader`` and collect its probabilities.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(ids, mask, token_type_ids)``; switched to eval mode.
    loader : iterable
        Yields dict batches with "ids", "mask" and "token_type_ids" tensors.

    Returns
    -------
    list of list of float
        Sigmoid of the model logits, one inner list per sample.
    """
    model.eval()
    fin_outputs = []
    with torch.no_grad():
        # Iterate the loader directly so tqdm can display a total.
        for batch in tqdm(loader):
            ids = batch["ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            # Nothing tracks gradients inside no_grad, so no detach is needed.
            fin_outputs.extend(torch.sigmoid(outputs).cpu().tolist())
    return fin_outputs

In [None]:
test_loader = DataLoader(test, **val_params)
predictions = prediction(model, test_loader)

In [None]:
final_result = pd.DataFrame()
final_predictions = np.argmax(predictions, axis=1)
final_result['comments'] = comments['Comments']
final_result['predictions'] = final_predictions

In [None]:
final_result.tail(40)