In [1]:
import gensim
import re
import torch
import pandas as pd 
import numpy as np
import nlpaug.augmenter.word as naw
from tqdm import tqdm

The following resources were used as references for code and methodology: 
https://www.auroria.io/nlp-disaster-tweet-text-classification-roberta-pytorch/
https://medium.com/@khang.pham.exxact/text-classification-with-bert-7afaacc5e49b 

In [2]:
import os 

os.environ["CUDA_VISIBLE_DEVICES"] = "1" 

In [3]:
sent_df = pd.read_csv("twitter_sentiment_data.csv")


In [4]:
sent_df.head()

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


In [5]:
len(sent_df)

43943

In [6]:
sent_df_three = sent_df[sent_df["sentiment"] != 2]

In [7]:
sent_df = sent_df_three
sent_df = sent_df.reset_index()

In [8]:
len(sent_df)

34667

In [9]:
tweets = sent_df["message"].to_list()
no_link_tweets = []
for tweet in tweets: 
    no_link_tweets.append(re.sub(r'http\S+', '', tweet))

no_at_tweets = []
for tweet in no_link_tweets:
    sen = []
    for word in tweet.split():
        if"@" not in word:
            sen.append(word)
    no_at_tweets.append(" ".join(sen))

sent_df["no_at_tweet"] = no_at_tweets

sent_df["clean_tweet"] = sent_df["no_at_tweet"].apply(lambda x: gensim.utils.simple_preprocess(x))

clean_text_list = sent_df["clean_tweet"].to_list()
clean_join = []
for text in clean_text_list:
    clean_join.append(" ".join(text))
clean_join[0]

features = clean_join
labels = sent_df["sentiment"]
labels = labels.replace(-1, 3)

labels = labels.replace(0, 'Neutral')
labels = labels.replace(1, 'Pro')
labels = labels.replace(3, 'Anti')

df = pd.DataFrame()
df["features"] = features
df["labels"] = labels

In [10]:
set(df["labels"].to_list())

{'Anti', 'Neutral', 'Pro'}

In [11]:
df["features"].to_list()[0], sent_df["message"].to_list()[0]

('climate change is an interesting hustle as it was global warming but the planet stopped warming for yes while the suv boom',
 '@tiniebeany climate change is an interesting hustle as it was global warming but the planet stopped warming for 15 yes while the suv boom')

In [12]:
from transformers import RobertaTokenizer
from transformers import RobertaModel

In [13]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [14]:
from torch.utils.data import dataset

In [15]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df["labels"]]
        self.texts = [tokenizer(features, 
                            padding = "max_length", max_length = 64,
                            truncation = True, return_tensors = "pt") for features in df["features"]]
    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)
  
    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])
  
    def get_batch_texts(self, idx):
        return self.texts[idx]
  
    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

np.random.seed(112)



In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df[["features"]], 
                                                    df[["labels"]], 
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify = df["labels"].to_list())

In [17]:
labels = {
    'Neutral': 0,
    'Pro': 1,
    'Anti': 2
}

In [18]:
labels_cat = {
    0: 'Neutral',
    1: 'Pro',
    2: 'Anti'
}

In [19]:
df_train = pd.DataFrame({"features": X_train["features"].to_list(),
                         "labels": y_train["labels"].to_list()})
df_val = pd.DataFrame({"features": X_test["features"].to_list(),
                       "labels": y_test["labels"].to_list()})

In [20]:
df_train["labels"].value_counts()

labels
Pro        18369
Neutral     6172
Anti        3192
Name: count, dtype: int64

In [21]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = RobertaModel.from_pretrained('roberta-base')
        self.dropout = nn.Dropout(dropout)
        self.linear_output = nn.Linear(768, 3)
        self.relu = nn.ReLU()
        
    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        
        linear_output = self.linear_output(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [22]:
y_pred = []

In [23]:
batch_size = 2

In [24]:
from torch.optim import Adam
from tqdm import tqdm
def train(model, train_data, learning_rate, epochs):

    train = Dataset(train_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()
    
    for epoch_num in range(epochs):
            model.train()
            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):
                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                y_pred.append(output)
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
                
            print(f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \| Train Accuracy: {total_acc_train / len(train_data): .3f}')
                    
EPOCHS = 1
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, LR, EPOCHS)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|███████████████████████████

Epochs: 4 | Train Loss:  0.342 \| Train Accuracy:  0.715





In [25]:
torch.cuda.mem_get_info()

(4553441280, 25388515328)

In [26]:
df_val = df_val.replace(labels_cat)

In [27]:
model.eval()
total_acc_val = 0
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
val = Dataset(df_val)
val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)
val_pred = []
with torch.no_grad():
    for val_input, val_label in val_dataloader:
        val_label = val_label.to(device)
        mask = val_input['attention_mask'].to(device)
        input_id = val_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)   
        val_pred.append(output.argmax(dim =1))

        acc = (output.argmax(dim=1) == val_label).sum().item()
        total_acc_val += acc
    
print(total_acc_val / len(df_val))    

0.7575713873665993


In [28]:
val_predictions = []
for batch in val_pred:
    for pred in batch:
        val_predictions.append(pred)

In [29]:
assert(len(val_predictions) == len(df_val))

In [30]:
df_val = df_val.replace(labels)
val_true = df_val["labels"].to_list()

In [31]:
len(val_predictions)

6934

In [32]:
from torcheval.metrics.functional import multiclass_f1_score
val_true_tensor = torch.tensor(val_true)
val_pred_tensor = torch.tensor(val_predictions)
multiclass_f1_score(val_pred_tensor, val_true_tensor, num_classes=3, average=None)

tensor([0.5368, 0.8553, 0.4726])

In [33]:
comments_df = pd.read_csv("comments.csv")

In [34]:
comments_df.head()

Unnamed: 0,type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,body,sentiment,score
0,comment,imlddn9,2qh3l,news,False,1661990368,https://old.reddit.com/r/news/comments/x2cszk/...,Yeah but what the above commenter is saying is...,0.5719,2
1,comment,imldbeh,2qn7b,ohio,False,1661990340,https://old.reddit.com/r/Ohio/comments/x2awnp/...,Any comparison of efficiency between solar and...,-0.9877,2
2,comment,imldado,2qhma,newzealand,False,1661990327,https://old.reddit.com/r/newzealand/comments/x...,I'm honestly waiting for climate change and th...,-0.1143,1
3,comment,imld6cb,2qi09,sacramento,False,1661990278,https://old.reddit.com/r/Sacramento/comments/x...,Not just Sacramento. It's actually happening a...,0.0,4
4,comment,imld0kj,2qh1i,askreddit,False,1661990206,https://old.reddit.com/r/AskReddit/comments/x2...,I think climate change tends to get some peopl...,0.6634,1


In [35]:
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec22283/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec22283/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
nltk_tokenizer = RegexpTokenizer(r'\w+')
stop_words = stopwords.words('english')
lem = WordNetLemmatizer()
custom_stop = ["www", "https", "com", "http", "people", "like", "gt", "___ ", "amp", "org", "ve", "en", "httml", "np", "pdf"]

def process_text(document):

    document = document.lower()
    no_punc = nltk_tokenizer.tokenize(document)

    no_num = [word for word in no_punc if word.isdigit() == False]
    no_stop = [word for word in no_num if word not in stop_words]
    custom_no_stop = [word for word in no_stop if word not in custom_stop]
    lemmed = [lem.lemmatize(word) for word in custom_no_stop]

    return lemmed



In [37]:
working_text = comments_df["body"].to_list()

In [38]:
processed_text = [process_text(doc) for doc in tqdm(working_text)]

100%|█████████████████████████████████████████████████| 4600698/4600698 [30:49<00:00, 2487.42it/s]


In [39]:
processed_join = [" ".join(doc) for doc in processed_text]

In [40]:
from collections import Counter

In [41]:
counter = Counter()
for doc in tqdm(processed_text):
    counter.update(set(doc))
ordered_counter = counter.most_common()
under_list = [word for (word, count) in ordered_counter if count < 100]
over_list = [word for (word, count) in ordered_counter if count > (len(processed_text)*0.4)]

100%|████████████████████████████████████████████████| 4600698/4600698 [00:53<00:00, 85506.04it/s]


In [42]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS

In [43]:
custom_stop = ["www", "https", "com", "http", "people", "like", "gt", "___ "]
all_stopwords = gensim.parsing.preprocessing.STOPWORDS
all_stopwords_gensim = STOPWORDS.union(set(under_list + over_list + custom_stop))

In [44]:
process = [remove_stopwords(doc, stopwords = all_stopwords_gensim) for doc in tqdm(processed_join)]


100%|████████████████████████████████████████████████| 4600698/4600698 [00:53<00:00, 85557.86it/s]


In [45]:
all_joined = ["".join(doc) for doc in tqdm(process)]

100%|███████████████████████████████████████████████| 4600698/4600698 [00:23<00:00, 193977.97it/s]


In [46]:
processed_reddit_df = pd.DataFrame({"features" : processed_join})

In [47]:
#from datasets import load_dataset, Dataset

In [48]:
#processed_reddit_df = pd.DataFrame(import_data["train"])

In [53]:
processed_reddit_df = pd.DataFrame(all_joined, columns = ["features"])

In [54]:
from torch.utils.data import dataset

In [55]:
class Dataset_reddit(torch.utils.data.Dataset):
  def __init__(self, df):
    # maybe add tqdm to df["features"]
    self.texts = [tokenizer(features, 
                            padding = "max_length", max_length = 64,
                            truncation = True, return_tensors = "pt") for features in tqdm(df["features"])]
  def __len__(self):
    return len(self.texts)

  def get_batch_texts(self, idx):
    return self.texts[idx]
  
  def __getitem__(self, idx):
    batch_texts = self.get_batch_texts(idx)

    return batch_texts

In [56]:
reddit_dataset = Dataset_reddit(processed_reddit_df)

100%|█████████████████████████████████████████████████| 4600698/4600698 [38:43<00:00, 1980.40it/s]


In [57]:
model.eval()
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
rd_dataloader = torch.utils.data.DataLoader(reddit_dataset, batch_size=248)
rd_pred = []
rd_conf = []
with torch.no_grad():
    for rd_input in tqdm(rd_dataloader):
        mask = rd_input['attention_mask'].to(device)
        input_id = rd_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask) 
        rd_pred.append(output.argmax(dim =1))
        
    

100%|█████████████████████████████████████████████████████| 18552/18552 [2:25:48<00:00,  2.12it/s]


In [58]:
prediction_list = []
for batch in tqdm(rd_pred):
    for i in batch:
        prediction_list.append(int(i))

100%|██████████████████████████████████████████████████████| 18552/18552 [00:44<00:00, 413.66it/s]


In [59]:
prediction_df = pd.DataFrame({"predictions":prediction_list}).replace(labels_cat)

In [60]:
from datasets import load_dataset, Dataset

In [61]:
#processed_dataset = Dataset.from_pandas(prediction_df)
#processed_dataset.push_to_hub("bartoszmaj/stance_predictions")
