In [1]:
import torch
import random
import numpy as np

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
seed_val = 0
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [4]:
import pandas as pd
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [5]:
train_data = pd.read_csv("/content/gdrive/MyDrive/Stance Detection with Bidirectional Conditional Encoding/dataset/train.csv", engine = "python", encoding = "latin")
test_data = pd.read_csv("/content/gdrive/MyDrive/Stance Detection with Bidirectional Conditional Encoding/dataset/test.csv", engine = "python", encoding = "latin")

In [6]:
train_data

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
1,Hillary is our best choice if we truly want to...,Hillary Clinton,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
2,@TheView I think our country is ready for a fe...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,NONE,3. The tweet is not explicitly expressing opi...,pos
...,...,...,...,...,...
2909,"There's a law protecting unborn eagles, but no...",Legalization of Abortion,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
2910,I am 1 in 3... I have had an abortion #Abortio...,Legalization of Abortion,AGAINST,2. The tweet does NOT expresses opinion about ...,other
2911,How dare you say my sexual preference is a cho...,Legalization of Abortion,AGAINST,2. The tweet does NOT expresses opinion about ...,neg
2912,"Equal rights for those 'born that way', no rig...",Legalization of Abortion,AGAINST,2. The tweet does NOT expresses opinion about ...,neg


In [7]:
test_data

Unnamed: 0,Tweet,Target,Stance,Opinion Towards,Sentiment
0,He who exalts himself shall be humbled; a...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,pos
1,RT @prayerbullets: I remove Nehushtan -previou...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,other
2,@Brainman365 @heidtjj @BenjaminLives I have so...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,pos
3,#God is utterly powerless without Human interv...,Atheism,AGAINST,1. The tweet explicitly expresses opinion abo...,neg
4,@David_Cameron Miracles of #Multiculturalism...,Atheism,AGAINST,2. The tweet does NOT expresses opinion about ...,neg
...,...,...,...,...,...
1951,@realDonaldTrump we all want you as the next p...,Donald Trump,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
1952,@RSherman_25 Join Twitter Trump brigade #oneth...,Donald Trump,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
1953,@JoeyBats19 Join Twitter Trump brigade #onetho...,Donald Trump,FAVOR,1. The tweet explicitly expresses opinion abo...,pos
1954,Trump's outlandish statements is political str...,Donald Trump,AGAINST,1. The tweet explicitly expresses opinion abo...,neg


In [8]:
train_data.drop(columns = ["Opinion Towards", "Sentiment"], inplace = True)
test_data.drop(columns = ["Opinion Towards", "Sentiment"], inplace = True)

In [9]:
train_data

Unnamed: 0,Tweet,Target,Stance
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,AGAINST
1,Hillary is our best choice if we truly want to...,Hillary Clinton,FAVOR
2,@TheView I think our country is ready for a fe...,Hillary Clinton,AGAINST
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,AGAINST
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,NONE
...,...,...,...
2909,"There's a law protecting unborn eagles, but no...",Legalization of Abortion,AGAINST
2910,I am 1 in 3... I have had an abortion #Abortio...,Legalization of Abortion,AGAINST
2911,How dare you say my sexual preference is a cho...,Legalization of Abortion,AGAINST
2912,"Equal rights for those 'born that way', no rig...",Legalization of Abortion,AGAINST


In [10]:
test_data

Unnamed: 0,Tweet,Target,Stance
0,He who exalts himself shall be humbled; a...,Atheism,AGAINST
1,RT @prayerbullets: I remove Nehushtan -previou...,Atheism,AGAINST
2,@Brainman365 @heidtjj @BenjaminLives I have so...,Atheism,AGAINST
3,#God is utterly powerless without Human interv...,Atheism,AGAINST
4,@David_Cameron Miracles of #Multiculturalism...,Atheism,AGAINST
...,...,...,...
1951,@realDonaldTrump we all want you as the next p...,Donald Trump,FAVOR
1952,@RSherman_25 Join Twitter Trump brigade #oneth...,Donald Trump,FAVOR
1953,@JoeyBats19 Join Twitter Trump brigade #onetho...,Donald Trump,FAVOR
1954,Trump's outlandish statements is political str...,Donald Trump,AGAINST


In [11]:
train_data.isna().sum()

Tweet     0
Target    0
Stance    0
dtype: int64

In [12]:
test_data.isna().sum()

Tweet     0
Target    0
Stance    0
dtype: int64

In [13]:
train_data["Stance"].value_counts()

AGAINST    1395
NONE        766
FAVOR       753
Name: Stance, dtype: int64

In [14]:
test_data["Stance"].value_counts()

AGAINST    1014
NONE        490
FAVOR       452
Name: Stance, dtype: int64

In [15]:
mapping = {"AGAINST": 0, "FAVOR": 1, "NONE": 2}
train_data["Stance"] = train_data["Stance"].replace(mapping)
test_data["Stance"] = test_data["Stance"].replace(mapping)

In [16]:
train_data

Unnamed: 0,Tweet,Target,Stance
0,"@tedcruz And, #HandOverTheServer she wiped cle...",Hillary Clinton,0
1,Hillary is our best choice if we truly want to...,Hillary Clinton,1
2,@TheView I think our country is ready for a fe...,Hillary Clinton,0
3,I just gave an unhealthy amount of my hard-ear...,Hillary Clinton,0
4,@PortiaABoulger Thank you for adding me to you...,Hillary Clinton,2
...,...,...,...
2909,"There's a law protecting unborn eagles, but no...",Legalization of Abortion,0
2910,I am 1 in 3... I have had an abortion #Abortio...,Legalization of Abortion,0
2911,How dare you say my sexual preference is a cho...,Legalization of Abortion,0
2912,"Equal rights for those 'born that way', no rig...",Legalization of Abortion,0


In [17]:
test_data

Unnamed: 0,Tweet,Target,Stance
0,He who exalts himself shall be humbled; a...,Atheism,0
1,RT @prayerbullets: I remove Nehushtan -previou...,Atheism,0
2,@Brainman365 @heidtjj @BenjaminLives I have so...,Atheism,0
3,#God is utterly powerless without Human interv...,Atheism,0
4,@David_Cameron Miracles of #Multiculturalism...,Atheism,0
...,...,...,...
1951,@realDonaldTrump we all want you as the next p...,Donald Trump,1
1952,@RSherman_25 Join Twitter Trump brigade #oneth...,Donald Trump,1
1953,@JoeyBats19 Join Twitter Trump brigade #onetho...,Donald Trump,1
1954,Trump's outlandish statements is political str...,Donald Trump,0


In [18]:
val_data = train_data[train_data["Target"] == "Hillary Clinton"]
train_data = train_data[train_data["Target"] != "Hillary Clinton"]
test_data = test_data[test_data["Target"] == "Donald Trump"]

In [19]:
print(train_data.shape, test_data.shape, val_data.shape)

(2225, 3) (707, 3) (689, 3)


In [20]:
import string
import nltk
import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [21]:
def remove_whitespaces(text):
  cleaned_string = text.strip()
  cleaned_string = re.sub(r"\s+", " ", cleaned_string)
  return cleaned_string

def convert_to_lowercase(text):
  return text.lower()

def remove_numbers(text):
  return re.sub(r'\d+', '', text)

def remove_punctuations(text):
  return text.translate(str.maketrans("", "", string.punctuation))

def remove_url_html(text):
  text = re.sub(r"http\S+", "", text)
  text = BeautifulSoup(text, "html.parser").get_text()
  return text

def remove_stopwords(text):
  arr = []
  w = list(stopwords.words("english"))
  tokens = word_tokenize(text)
  for i in range(len(tokens)):
    if tokens[i] not in w:
      arr.append(tokens[i])
  return " ".join(arr)

def lemmatization(text):
  lemmatizer = WordNetLemmatizer()
  tokens = word_tokenize(text)
  for i in range(len(tokens)):
    w = lemmatizer.lemmatize(tokens[i])
    tokens[i] = w
  return " ".join(tokens)

In [22]:
train_data["Tweet"] = train_data["Tweet"].apply(remove_whitespaces)
train_data["Tweet"] = train_data["Tweet"].apply(convert_to_lowercase)
train_data["Tweet"] = train_data["Tweet"].apply(remove_numbers)
train_data["Tweet"] = train_data["Tweet"].apply(remove_punctuations)
train_data["Tweet"] = train_data["Tweet"].apply(remove_url_html)



test_data["Tweet"] = test_data["Tweet"].apply(remove_whitespaces)
test_data["Tweet"] = test_data["Tweet"].apply(convert_to_lowercase)
test_data["Tweet"] = test_data["Tweet"].apply(remove_numbers)
test_data["Tweet"] = test_data["Tweet"].apply(remove_punctuations)
test_data["Tweet"] = test_data["Tweet"].apply(remove_url_html)



val_data["Tweet"] = val_data["Tweet"].apply(remove_whitespaces)
val_data["Tweet"] = val_data["Tweet"].apply(convert_to_lowercase)
val_data["Tweet"] = val_data["Tweet"].apply(remove_numbers)
val_data["Tweet"] = val_data["Tweet"].apply(remove_punctuations)
val_data["Tweet"] = val_data["Tweet"].apply(remove_url_html)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["Tweet"] = train_data["Tweet"].apply(remove_whitespaces)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["Tweet"] = train_data["Tweet"].apply(convert_to_lowercase)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["Tweet"] = train_data["Tweet"].apply(remove_numbers)
A v

In [23]:
train_data["Stance"].value_counts()

0    1002
1     635
2     588
Name: Stance, dtype: int64

In [24]:
train_data.groupby(["Target", "Stance"]).size().reset_index(name = "counts")

Unnamed: 0,Target,Stance,counts
0,Atheism,0,304
1,Atheism,1,92
2,Atheism,2,117
3,Climate Change is a Real Concern,0,15
4,Climate Change is a Real Concern,1,212
5,Climate Change is a Real Concern,2,168
6,Feminist Movement,0,328
7,Feminist Movement,1,210
8,Feminist Movement,2,126
9,Legalization of Abortion,0,355


In [25]:
# !pip install nlpaug
# !pip install transformers
# import nlpaug.augmenter.word.context_word_embs as aug

In [26]:
# augmenter = aug.ContextualWordEmbsAug(model_path = "bert-base-uncased", action = "insert")

In [27]:
# from tqdm.auto import tqdm
# from sklearn.utils import shuffle
# import numpy as np

In [28]:
# def f(train_data, augmenter, repetitions = 1):
#   augmented_texts = []
#   class00 = train_data[(train_data["Target"] == "Atheism") & (train_data["Stance"] == 0)].reset_index(drop = True)
#   for i in tqdm(np.random.randint(0, len(class00), 196)):
#     for _ in range(repetitions):
#       augmented_text = augmenter.augment(class00["Tweet"].iloc[i])
#       augmented_texts.append(augmented_text)
#   data = {
#       "Tweet" : augmented_texts,
#       "Target": "Atheism",
#       "Stance": 0
#   }
#   aug_df = pd.DataFrame(data)
#   d1 = pd.concat([train_data, aug_df], axis = 0).reset_index(drop = True)



#   augmented_texts = []
#   class01 = train_data[(train_data["Target"] == "Atheism") & (train_data["Stance"] == 1)].reset_index(drop = True)
#   for i in tqdm(np.random.randint(0, len(class01), 408)):
#     for _ in range(repetitions):
#       augmented_text = augmenter.augment(class01["Tweet"].iloc[i])
#       augmented_texts.append(augmented_text)
#   data = {
#       "Tweet" : augmented_texts,
#       "Target": "Atheism",
#       "Stance": 1
#   }
#   aug_df = pd.DataFrame(data)
#   d2 = pd.concat([d1, aug_df], axis = 0).reset_index(drop = True)



#   augmented_texts = []
#   class02 = train_data[(train_data["Target"] == "Atheism") & (train_data["Stance"] == 2)].reset_index(drop = True)
#   for i in tqdm(np.random.randint(0, len(class02), 383)):
#     for _ in range(repetitions):
#       augmented_text = augmenter.augment(class02["Tweet"].iloc[i])
#       augmented_texts.append(augmented_text)
#   data = {
#       "Tweet" : augmented_texts,
#       "Target": "Atheism",
#       "Stance": 2
#   }
#   aug_df = pd.DataFrame(data)
#   d3 = pd.concat([d2, aug_df], axis = 0).reset_index(drop = True)



#   augmented_texts = []
#   class10 = train_data[(train_data["Target"] == "Climate Change is a Real Concern") & (train_data["Stance"] == 0)].reset_index(drop = True)
#   for i in tqdm(np.random.randint(0, len(class10), 485)):
#     for _ in range(repetitions):
#       augmented_text = augmenter.augment(class10["Tweet"].iloc[i])
#       augmented_texts.append(augmented_text)
#   data = {
#       "Tweet" : augmented_texts,
#       "Target": "Climate Change is a Real Concern",
#       "Stance": 0
#   }
#   aug_df = pd.DataFrame(data)
#   d4 = pd.concat([d3, aug_df], axis = 0).reset_index(drop = True)



#   augmented_texts = []
#   class11 = train_data[(train_data["Target"] == "Climate Change is a Real Concern") & (train_data["Stance"] == 1)].reset_index(drop = True)
#   for i in tqdm(np.random.randint(0, len(class11), 288)):
#     for _ in range(repetitions):
#       augmented_text = augmenter.augment(class11["Tweet"].iloc[i])
#       augmented_texts.append(augmented_text)
#   data = {
#       "Tweet" : augmented_texts,
#       "Target": "Climate Change is a Real Concern",
#       "Stance": 1
#   }
#   aug_df = pd.DataFrame(data)
#   d5 = pd.concat([d4, aug_df], axis = 0).reset_index(drop = True)



#   augmented_texts = []
#   class12 = train_data[(train_data["Target"] == "Climate Change is a Real Concern") & (train_data["Stance"] == 2)].reset_index(drop = True)
#   for i in tqdm(np.random.randint(0, len(class12), 332)):
#     for _ in range(repetitions):
#       augmented_text = augmenter.augment(class12["Tweet"].iloc[i])
#       augmented_texts.append(augmented_text)
#   data = {
#       "Tweet" : augmented_texts,
#       "Target": "Climate Change is a Real Concern",
#       "Stance": 2
#   }
#   aug_df = pd.DataFrame(data)
#   d6 = pd.concat([d5, aug_df], axis = 0).reset_index(drop = True)



#   augmented_texts = []
#   class20 = train_data[(train_data["Target"] == "Feminist Movement") & (train_data["Stance"] == 0)].reset_index(drop = True)
#   for i in tqdm(np.random.randint(0, len(class20), 172)):
#     for _ in range(repetitions):
#       augmented_text = augmenter.augment(class20["Tweet"].iloc[i])
#       augmented_texts.append(augmented_text)
#   data = {
#       "Tweet" : augmented_texts,
#       "Target": "Feminist Movement",
#       "Stance": 0
#   }
#   aug_df = pd.DataFrame(data)
#   d7 = pd.concat([d6, aug_df], axis = 0).reset_index(drop = True)



#   augmented_texts = []
#   class21 = train_data[(train_data["Target"] == "Feminist Movement") & (train_data["Stance"] == 1)].reset_index(drop = True)
#   for i in tqdm(np.random.randint(0, len(class21), 290)):
#     for _ in range(repetitions):
#       augmented_text = augmenter.augment(class21["Tweet"].iloc[i])
#       augmented_texts.append(augmented_text)
#   data = {
#       "Tweet" : augmented_texts,
#       "Target": "Feminist Movement",
#       "Stance": 1
#   }
#   aug_df = pd.DataFrame(data)
#   d8 = pd.concat([d7, aug_df], axis = 0).reset_index(drop = True)



#   augmented_texts = []
#   class22 = train_data[(train_data["Target"] == "Feminist Movement") & (train_data["Stance"] == 2)].reset_index(drop = True)
#   for i in tqdm(np.random.randint(0, len(class22), 374)):
#     for _ in range(repetitions):
#       augmented_text = augmenter.augment(class22["Tweet"].iloc[i])
#       augmented_texts.append(augmented_text)
#   data = {
#       "Tweet" : augmented_texts,
#       "Target": "Feminist Movement",
#       "Stance": 2
#   }
#   aug_df = pd.DataFrame(data)
#   d9 = pd.concat([d8, aug_df], axis = 0).reset_index(drop = True)



#   augmented_texts = []
#   class30 = train_data[(train_data["Target"] == "Legalization of Abortion") & (train_data["Stance"] == 0)].reset_index(drop = True)
#   for i in tqdm(np.random.randint(0, len(class30), 145)):
#     for _ in range(repetitions):
#       augmented_text = augmenter.augment(class30["Tweet"].iloc[i])
#       augmented_texts.append(augmented_text)
#   data = {
#       "Tweet" : augmented_texts,
#       "Target": "Legalization of Abortion",
#       "Stance": 0
#   }
#   aug_df = pd.DataFrame(data)
#   d10 = pd.concat([d9, aug_df], axis = 0).reset_index(drop = True)



#   augmented_texts = []
#   class31 = train_data[(train_data["Target"] == "Legalization of Abortion") & (train_data["Stance"] == 1)].reset_index(drop = True)
#   for i in tqdm(np.random.randint(0, len(class31), 379)):
#     for _ in range(repetitions):
#       augmented_text = augmenter.augment(class31["Tweet"].iloc[i])
#       augmented_texts.append(augmented_text)
#   data = {
#       "Tweet" : augmented_texts,
#       "Target": "Legalization of Abortion",
#       "Stance": 1
#   }
#   aug_df = pd.DataFrame(data)
#   d11 = pd.concat([d10, aug_df], axis = 0).reset_index(drop = True)



#   augmented_texts = []
#   class32 = train_data[(train_data["Target"] == "Legalization of Abortion") & (train_data["Stance"] == 2)].reset_index(drop = True)
#   for i in tqdm(np.random.randint(0, len(class32), 323)):
#     for _ in range(repetitions):
#       augmented_text = augmenter.augment(class32["Tweet"].iloc[i])
#       augmented_texts.append(augmented_text)
#   data = {
#       "Tweet" : augmented_texts,
#       "Target": "Legalization of Abortion",
#       "Stance": 2
#   }
#   aug_df = pd.DataFrame(data)
#   d12 = pd.concat([d11, aug_df], axis = 0).reset_index(drop = True)


#   return d12

In [29]:
# augmented_train = f(train_data, augmenter)

In [30]:
# file_name = "/content/gdrive/MyDrive/Stance Detection with Bidirectional Conditional Encoding/augmented_train.csv"
# augmented_train.to_csv(file_name, index = False)

In [31]:
train_df = pd.read_csv("/content/gdrive/MyDrive/Stance Detection with Bidirectional Conditional Encoding/augmented_train.csv")

In [32]:
from sklearn.utils import shuffle
train_data = shuffle(train_df, random_state = 0)

In [33]:
train_data["Stance"].value_counts()

0    2000
1    2000
2    2000
Name: Stance, dtype: int64

In [34]:
import ast

def extract_string_from_list(txt):
  try:
    txt_list = ast.literal_eval(txt)
    if isinstance(txt_list, list) and len(txt_list) > 0:
      return txt_list[0]
  except (ValueError, SyntaxError, MemoryError):
    pass
  return txt

train_data["Tweet"] = train_data["Tweet"].apply(extract_string_from_list)

In [35]:
train_data["Tweet"] = train_data["Tweet"].apply(remove_whitespaces)
train_data["Tweet"] = train_data["Tweet"].apply(convert_to_lowercase)
train_data["Tweet"] = train_data["Tweet"].apply(remove_numbers)
train_data["Tweet"] = train_data["Tweet"].apply(remove_punctuations)
train_data["Tweet"] = train_data["Tweet"].apply(remove_url_html)
train_data["Target"] = train_data["Target"].apply(convert_to_lowercase)

test_data["Target"] = test_data["Target"].apply(convert_to_lowercase)

val_data["Target"] = val_data["Target"].apply(convert_to_lowercase)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data["Target"] = test_data["Target"].apply(convert_to_lowercase)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data["Target"] = val_data["Target"].apply(convert_to_lowercase)


In [36]:
train_data["Tweet"].apply(lambda x: len(str(x).split())).max()

41

In [38]:
test_data["Tweet"].apply(lambda x: len(str(x).split())).max()

27

In [39]:
val_data["Tweet"].apply(lambda x: len(str(x).split())).max()

26

In [40]:
train_data["Target"].apply(lambda x: len(str(x).split())).max()

6

In [41]:
train_data = train_data.drop_duplicates(subset = ["Tweet"])

In [42]:
train_data.reset_index(drop = True, inplace = True)
test_data.reset_index(drop = True, inplace = True)
val_data.reset_index(drop = True, inplace = True)

In [43]:
word2vec_data = pd.read_csv("/content/gdrive/MyDrive/Stance Detection with Bidirectional Conditional Encoding/word2vec training data.csv", encoding = "ISO-8859-1", names = ["sentiment", "ids", "date", "flag", "user", "Tweet"])
word2vec_data

Unnamed: 0,sentiment,ids,date,flag,user,Tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [44]:
word2vec_data = word2vec_data.drop(columns = ["sentiment", "ids", "date", "flag", "user"])
word2vec_data

Unnamed: 0,Tweet
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,is upset that he can't update his Facebook by ...
2,@Kenichan I dived many times for the ball. Man...
3,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all...."
...,...
1599995,Just woke up. Having no school is the best fee...
1599996,TheWDB.com - Very cool to hear old Walt interv...
1599997,Are you ready for your MoJo Makeover? Ask me f...
1599998,Happy 38th Birthday to my boo of alll time!!! ...


In [45]:
word2vec_data["Tweet"] = word2vec_data["Tweet"].apply(remove_whitespaces)
word2vec_data["Tweet"] = word2vec_data["Tweet"].apply(convert_to_lowercase)
word2vec_data["Tweet"] = word2vec_data["Tweet"].apply(remove_numbers)
word2vec_data["Tweet"] = word2vec_data["Tweet"].apply(remove_punctuations)
word2vec_data["Tweet"] = word2vec_data["Tweet"].apply(remove_url_html)

In [46]:
word2vec_data = pd.concat([word2vec_data, pd.DataFrame(train_data["Tweet"])], axis = 0)

In [47]:
import gensim
tokenized_tweet = word2vec_data["Tweet"].apply(lambda x: x.split())
model_w2v = gensim.models.Word2Vec(
            tokenized_tweet,
            vector_size = 100,
            window = 5,
            min_count = 5,
            sg = 1,
            seed = 34
)
model_w2v.train(tokenized_tweet, total_examples = len(word2vec_data["Tweet"]), epochs = 20)



(311987049, 414676860)

In [48]:
def get_word_embeddings(word_list, model):
  word_embeddings = []
  for word in word_list:
    if word in model.wv:
      word_embeddings.append(list(model.wv[word]))

  return word_embeddings

In [49]:
train_data["Tweet"] = train_data["Tweet"].apply(lambda x: x.split()).apply(lambda tokens: get_word_embeddings(tokens, model_w2v))
test_data["Tweet"] = test_data["Tweet"].apply(lambda x: x.split()).apply(lambda tokens: get_word_embeddings(tokens, model_w2v))
val_data["Tweet"] = val_data["Tweet"].apply(lambda x: x.split()).apply(lambda tokens: get_word_embeddings(tokens, model_w2v))
train_data["Target"] = train_data["Target"].apply(lambda x: x.split()).apply(lambda tokens: get_word_embeddings(tokens, model_w2v))
test_data["Target"] = test_data["Target"].apply(lambda x: x.split()).apply(lambda tokens: get_word_embeddings(tokens, model_w2v))
val_data["Target"] = val_data["Target"].apply(lambda x: x.split()).apply(lambda tokens: get_word_embeddings(tokens, model_w2v))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data["Tweet"] = test_data["Tweet"].apply(lambda x: x.split()).apply(lambda tokens: get_word_embeddings(tokens, model_w2v))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data["Tweet"] = val_data["Tweet"].apply(lambda x: x.split()).apply(lambda tokens: get_word_embeddings(tokens, model_w2v))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guid

In [50]:
# train_data.to_csv("/content/gdrive/MyDrive/Stance Detection with Bidirectional Conditional Encoding/train_data.csv", index = False)
# test_data.to_csv("/content/gdrive/MyDrive/Stance Detection with Bidirectional Conditional Encoding/test_data.csv", index = False)
# val_data.to_csv("/content/gdrive/MyDrive/Stance Detection with Bidirectional Conditional Encoding/val_data.csv", index = False)

In [1]:
import pandas as pd
import torch
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
train_data = pd.read_csv("/content/gdrive/MyDrive/Stance Detection with Bidirectional Conditional Encoding/train_data.csv")
test_data = pd.read_csv("/content/gdrive/MyDrive/Stance Detection with Bidirectional Conditional Encoding/test_data.csv")
val_data = pd.read_csv("/content/gdrive/MyDrive/Stance Detection with Bidirectional Conditional Encoding/val_data.csv")

In [4]:
def extract_list_from_string(s):
  try:
    extracted_list = eval(s)
    if isinstance(extracted_list, list):
      return extracted_list
    else:
      return None
  except SyntaxError:
    return None

train_data["Tweet"] = train_data["Tweet"].apply(extract_list_from_string)
test_data["Tweet"] = test_data["Tweet"].apply(extract_list_from_string)
val_data["Tweet"] = val_data["Tweet"].apply(extract_list_from_string)
train_data["Target"] = train_data["Target"].apply(extract_list_from_string)
test_data["Target"] = test_data["Target"].apply(extract_list_from_string)
val_data["Target"] = val_data["Target"].apply(extract_list_from_string)

In [5]:
train_data["Tweet"].apply(lambda x: len(x)).max()

41

In [6]:
def f(lst):
  if len(lst) < 41:
    d = 41 - len(lst)
    for _ in range(d):
      lst.append([0] * 100)
  return lst

def g(lst):
  if len(lst) < 6:
    d = 6 - len(lst)
    for _ in range(d):
      lst.append([0] * 100)
  return lst

In [7]:
train_data["Tweet"] = train_data["Tweet"].apply(f)
test_data["Tweet"] = test_data["Tweet"].apply(f)
val_data["Tweet"] = val_data["Tweet"].apply(f)
train_data["Target"] = train_data["Target"].apply(g)
test_data["Target"] = test_data["Target"].apply(g)
val_data["Target"] = val_data["Target"].apply(g)

In [8]:
import numpy as np

In [9]:
train_tweet_arr = np.array(train_data.Tweet.tolist())
train_target_arr = np.array(train_data.Target.tolist())
test_tweet_arr = np.array(test_data.Tweet.tolist())
test_target_arr = np.array(test_data.Target.tolist())
val_tweet_arr = np.array(val_data.Tweet.tolist())
val_target_arr = np.array(val_data.Target.tolist())

train_tweet_encodings = torch.tensor(train_tweet_arr, dtype = torch.float32).to(device)
train_target_encodings = torch.tensor(train_target_arr, dtype = torch.float32).to(device)
test_tweet_encodings = torch.tensor(test_tweet_arr, dtype = torch.float32).to(device)
test_target_encodings = torch.tensor(test_target_arr, dtype = torch.float32).to(device)
val_tweet_encodings = torch.tensor(val_tweet_arr, dtype = torch.float32).to(device)
val_target_encodings = torch.tensor(val_target_arr, dtype = torch.float32).to(device)

train_stance = torch.from_numpy(train_data.Stance.to_numpy()).type(torch.LongTensor).to(device)
test_stance = torch.from_numpy(test_data.Stance.to_numpy()).type(torch.LongTensor).to(device)
val_stance = torch.from_numpy(val_data.Stance.to_numpy()).type(torch.LongTensor).to(device)

In [10]:
from torch.utils.data import DataLoader, TensorDataset
import random
seed_val = 0
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


train_ds = TensorDataset(train_tweet_encodings, train_target_encodings, train_stance)
test_ds = TensorDataset(test_tweet_encodings, test_target_encodings, test_stance)
val_ds = TensorDataset(val_tweet_encodings, val_target_encodings, val_stance)

batch_size = 22
train_dl = DataLoader(train_ds, shuffle = True, batch_size = batch_size, drop_last = True)
test_dl = DataLoader(test_ds, shuffle = True, batch_size = batch_size, drop_last = True)
val_dl = DataLoader(val_ds, shuffle = True, batch_size = batch_size, drop_last = True)

In [11]:
train_data.Stance.value_counts()

1    2000
2    1999
0    1997
Name: Stance, dtype: int64

In [12]:
import torch.nn as nn

class BiLSTM(nn.Module):
  def __init__(self, input_size, hidden_size, num_classes):
    super(BiLSTM, self).__init__()
    self.lstm_target = nn.LSTM(input_size, hidden_size, batch_first = True, bidirectional = True)
    self.lstm_twitter = nn.LSTM(input_size, hidden_size, batch_first = True, bidirectional = True)
    self.W = nn.Linear(2*hidden_size, num_classes, bias = False)


  def forward(self, target, tweet):
    outputs_target, (h_n_target, c_n_target) = self.lstm_target(target)
    outputs_twitter, (h_n_twitter, c_n_twitter) = self.lstm_twitter(tweet, (torch.zeros_like(c_n_target), c_n_target))
    flattened_output = h_n_twitter.reshape(batch_size, -1)
    c = self.W(flattened_output)

    return c

In [13]:
input_size, hidden_size, num_classes = 100, 10, 3
lstm = BiLSTM(input_size, hidden_size, num_classes).to(device)

In [14]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr = 1e-5)

In [15]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.2.0-py3-none-any.whl (805 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/805.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/805.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m737.3/805.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.9.0-py3-none-any.whl (23 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.9.0 torchmetrics-1.2.0


In [16]:
from sklearn.metrics import f1_score, classification_report
from torchmetrics import Accuracy, ConfusionMatrix

seed_val = 0
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

accuracy = Accuracy(task = "multiclass", num_classes = 3).to(device)

num_epochs = 11
for epoch in range(num_epochs):
  lstm.train()
  loss1 = 0
  loss2 = 0
  t = torch.tensor([])
  t = t.to(device)
  a = torch.tensor([])
  a = a.to(device)
  for batch in train_dl:
    tweet, target, stance = batch
    a = torch.cat((a, stance), dim = 0)
    c = lstm(target, tweet)
    y_preds = torch.softmax(c, dim = 1).argmax(dim = 1)
    t = torch.cat((t, y_preds), dim = 0)
    loss = criterion(c, stance)
    loss1 += loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  train_accuracy = accuracy(t, a)

  lstm.eval()
  with torch.inference_mode():
    t2 =  torch.tensor([])
    t2 = t2.to(device)
    a2 = torch.tensor([])
    a2 = a2.to(device)
    loss2 = 0
    for batch in val_dl:
      tweet, target, stance = batch
      a2 = torch.cat((a2, stance), dim = 0)
      c = lstm(target, tweet)
      val_preds = torch.softmax(c, dim = 1).argmax(dim = 1)
      t2 = torch.cat((t2, val_preds), dim = 0)
      loss = criterion(c, stance)
      loss2 += loss
    val_accuracy = accuracy(t2, a2)
  print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {loss1/len(train_dl):.5f} | Train Accuracy: {train_accuracy :.5f} | Validation Loss: {loss2/len(val_dl):.5f} | Validation Accuracy: {val_accuracy:.5f}")
  confusionmatrix = ConfusionMatrix(task = "multiclass", num_classes = 3).to(device)
  confusion_matrix = confusionmatrix(t2, a2)
  print(confusion_matrix)
  print(classification_report(a2.to("cpu").numpy(), t2.to("cpu").numpy()))

Epoch 1/11 | Train Loss: 1.10460 | Train Accuracy: 0.33506 | Validation Loss: 1.10941 | Validation Accuracy: 0.36950
tensor([[193, 156,  41],
        [ 68,  31,  17],
        [ 88,  60,  28]], device='cuda:0')
              precision    recall  f1-score   support

         0.0       0.55      0.49      0.52       390
         1.0       0.13      0.27      0.17       116
         2.0       0.33      0.16      0.21       176

    accuracy                           0.37       682
   macro avg       0.33      0.31      0.30       682
weighted avg       0.42      0.37      0.38       682

Epoch 2/11 | Train Loss: 1.10379 | Train Accuracy: 0.33422 | Validation Loss: 1.10361 | Validation Accuracy: 0.39589
tensor([[202, 130,  59],
        [ 66,  40,  11],
        [ 84,  62,  28]], device='cuda:0')
              precision    recall  f1-score   support

         0.0       0.57      0.52      0.54       391
         1.0       0.17      0.34      0.23       117
         2.0       0.29      0.16   

In [17]:
from torchmetrics import Precision, Recall, F1Score, ConfusionMatrix
from sklearn.metrics import classification_report

lstm.eval()
with torch.inference_mode():
  t3 =  torch.tensor([])
  t3 = t3.to(device)
  a3 = torch.tensor([])
  a3 = a3.to(device)
  for batch in test_dl:
    tweet, target, stance = batch
    a3 = torch.cat((a3, stance), dim = 0)
    c = lstm(target, tweet)
    test_preds = torch.softmax(c, dim = 1).argmax(dim = 1)
    t3 = torch.cat((t3, test_preds), dim = 0)
  accuracy = Accuracy(task = "multiclass", num_classes = 3).to(device)
  accuracy = accuracy(t3, a3)
  precision = Precision(task = "multiclass", average = "macro", num_classes = 3).to(device)
  precision_scores = precision(t3, a3)
  f1score = F1Score(task = "multiclass", num_classes = 3).to(device)
  f1_scores = f1score(t3, a3)
  recall = Recall(task = "multiclass", average = "macro", num_classes = 3).to(device)
  recall_scores = recall(t3, a3)
  confusionmatrix = ConfusionMatrix(task = "multiclass", num_classes = 3).to(device)
  confusion_matrix = confusionmatrix(t3, a3)
  print(f"Accuracy: {accuracy:.5f}\nPrecision: {precision_scores:.5f}\nRecall: {recall_scores:.5f}\nF1 Score: {f1_scores:.5f}\nConfusion Matrix:\n\t{confusion_matrix}")

Accuracy: 0.35227
Precision: 0.33070
Recall: 0.34107
F1 Score: 0.35227
Confusion Matrix:
	tensor([[166,  89,  42],
        [ 77,  51,  20],
        [132,  96,  31]], device='cuda:0')


In [18]:
print(classification_report(a3.to("cpu").numpy(), t3.to("cpu").numpy()))

              precision    recall  f1-score   support

         0.0       0.44      0.56      0.49       297
         1.0       0.22      0.34      0.27       148
         2.0       0.33      0.12      0.18       259

    accuracy                           0.35       704
   macro avg       0.33      0.34      0.31       704
weighted avg       0.35      0.35      0.33       704

