# Pre Preparation

In [None]:
# Import thing
from google.colab import files
import pandas as pd
import torch
import re

# nltk
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# sklearn for SVM & Tfidf Vectorization
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Data visulization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# check versions
!python --version
!nvidia-smi
print(torch.cuda.is_available())

In [None]:
# Upload files from the local machine into the Colab runtime.
uploaded = files.upload()
for fn in uploaded:
  # Format the message before printing: print() returns None, so calling
  # .format() on its result raises AttributeError.
  print('User uploaded file"{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Data Preparation

In [None]:
# Load the training set, the test set and the sample submission, each indexed by 'id'.
dt_train, dt_test, dt_submit = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/cs579data/train.csv',
        '/content/drive/MyDrive/Colab Notebooks/cs579data/test.csv',
        '/content/drive/MyDrive/Colab Notebooks/cs579data/sample_submission.csv',
    )
)
# del dt_test
# del dt_train

In [None]:
def preprocess(file_name='/content/drive/MyDrive/Colab Notebooks/cs579data/train.csv', index_col='id',
               output_dir='/content/drive/MyDrive/Colab Notebooks/cs579data'):
    """Split the raw training CSV into per-class title files.

    Rows where tid1 == tid2 but the label is not 'agreed' are treated as
    mislabeled: they are written to error.csv and removed before splitting.

    Files written into `output_dir`:
      * error.csv   - the removed, mislabeled rows (all columns)
      * real.csv    - 'title2_en' of rows labeled 'disagreed'
      * neutral.csv - 'title2_en' of rows labeled 'unrelated'
      * fake.csv    - 'title1_en' of every row with tid1 != tid2, followed by
                      'title2_en' of the 'agreed' rows among them; the column
                      is named 'title1_en' because later cells read it by
                      that name

    Args:
        file_name: path of the CSV to read.
        index_col: column used as the DataFrame index.
        output_dir: directory that receives the four CSV files.

    Returns:
        None.
    """
    import os  # local import: only this function needs it

    data = pd.read_csv(file_name, index_col=index_col)

    # remove error rows: when tid1 == tid2 the label must be 'agreed',
    # anything else means the label is incorrect.
    error_data = data[(data['tid1'] == data['tid2']) & (data['label'] != 'agreed')]
    data = data.drop(error_data.index)

    # get all real news
    real_news = data[data['label'] == 'disagreed']['title2_en']

    # get news with unknown label, set it to neutral_news
    neutral_news = data[data['label'] == 'unrelated']['title2_en']

    # get fake news from two parts.
    different_titles = data['tid1'] != data['tid2']
    # part 1: every first title
    fake_news_1 = data[different_titles]['title1_en']
    # part 2: second titles that agree with a fake first title
    fake_news_2 = data[different_titles & (data['label'] == 'agreed')]['title2_en']
    # Series.append returned a new object (and no longer exists in pandas 2),
    # so the combined result has to be built and kept explicitly.
    fake_news = pd.concat([fake_news_1, fake_news_2]).rename('title1_en')

    # save data
    error_data.to_csv(os.path.join(output_dir, 'error.csv'))
    real_news.to_csv(os.path.join(output_dir, 'real.csv'))
    fake_news.to_csv(os.path.join(output_dir, 'fake.csv'))
    neutral_news.to_csv(os.path.join(output_dir, 'neutral.csv'))
# Run the split with its default arguments so the per-class CSV files are written.
preprocess()

## This part reloads the data saved by the previous step.

In [None]:
# Reload the per-class title files, each indexed by 'id'.
dt_real, dt_fake, dt_neutral = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/cs579data/real.csv',
        '/content/drive/MyDrive/Colab Notebooks/cs579data/fake.csv',
        '/content/drive/MyDrive/Colab Notebooks/cs579data/neutral.csv',
    )
)

# NLTK Text Preprocessing

In [None]:
def preprocess_text(text):
  """
    Preprocess the text of a single row.

    Tokenizes the text, lowercases every token, drops English stopwords and
    tokens shorter than three characters, lemmatizes what is left and joins
    the result with single spaces.

    Non-string input (e.g. the NaN pandas produces for an empty CSV cell)
    returns an empty string instead of raising inside the tokenizer.
  """
  if not isinstance(text, str):
    return ''
  # Tokenization. Lowercase right away: the NLTK stopword list is lowercase,
  # so filtering before lowercasing lets capitalised stopwords ("The") through.
  tokens = [word.lower() for sent in sent_tokenize(text) for word in word_tokenize(sent)]
  # A set gives O(1) membership tests instead of scanning a list per token.
  stop = set(stopwords.words('english'))
  # Removing stopwords and very short tokens
  tokens = [token for token in tokens if len(token) >= 3 and token not in stop]
  # Lemma
  lmtzr = WordNetLemmatizer()
  preprocessed_text = ' '.join(lmtzr.lemmatize(token) for token in tokens)
  return preprocessed_text

In [None]:
# Build the 'preprocessed' column of each frame from its title column.
# Series.apply runs the function element-wise; it replaces the one-column
# DataFrame.applymap call, which is deprecated in recent pandas releases.
dt_real['preprocessed'] = dt_real['title2_en'].apply(preprocess_text)
dt_neutral['preprocessed'] = dt_neutral['title2_en'].apply(preprocess_text)
dt_fake['preprocessed'] = dt_fake['title1_en'].apply(preprocess_text)

In [None]:
# Save the results, because preprocessing takes a long time.
# Write into the Drive folder: the next section reads these three files back
# from exactly these paths, so saving to the working directory would leave
# that read without its input (or with a stale copy).
dt_real.to_csv('/content/drive/MyDrive/Colab Notebooks/cs579data/real_processed.csv')
dt_fake.to_csv('/content/drive/MyDrive/Colab Notebooks/cs579data/fake_processed.csv')
dt_neutral.to_csv('/content/drive/MyDrive/Colab Notebooks/cs579data/neutral_processed.csv')

# Vectorization & split training dataset

In [None]:
# Reload the preprocessed frames, each indexed by 'id'.
dt_real, dt_fake, dt_neutral = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/cs579data/real_processed.csv',
        '/content/drive/MyDrive/Colab Notebooks/cs579data/fake_processed.csv',
        '/content/drive/MyDrive/Colab Notebooks/cs579data/neutral_processed.csv',
    )
)

In [None]:
# Fit a TF-IDF vectorizer on the preprocessed real-news titles only.
# .astype('U') casts every value to a unicode string before vectorizing
# (presumably so that empty cells read back as NaN do not break the fit).
# NOTE(review): real_vector is not used anywhere else in the visible notebook.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, min_df=2, norm=None)
real_vector = tfidf_vectorizer.fit_transform(dt_real['preprocessed'].values.astype('U'))

# Data Preview

In [None]:
# Tag every row of each frame with a string class id:
# '0' for real, '1' for fake, '2' for neutral.
dt_real['label'] = '0'
dt_fake['label'] = '1'
dt_neutral['label'] = '2'

In [None]:
# Show the (rows, columns) shape of each class frame, one per line.
print(dt_real.shape, dt_fake.shape, dt_neutral.shape, sep='\n')

In [None]:
# Word cloud of the real-news titles.
real_titles = dt_real.title2_en
# Drop missing titles and coerce to str: str.join raises TypeError as soon as
# it meets a NaN float (which is what an empty CSV cell is read back as).
real_all_words = ' '.join(real_titles.dropna().astype(str))
wordcloud_real = WordCloud(background_color='white',
    width=800, height=500,
    max_font_size=180,
    collocations=False).generate(real_all_words)

plt.figure(figsize=(10,7))
plt.imshow(wordcloud_real, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the fake-news titles.
fake_titles = dt_fake.title1_en
# Drop missing titles and coerce to str: str.join raises TypeError as soon as
# it meets a NaN float (which is what an empty CSV cell is read back as).
fake_all_words = ' '.join(fake_titles.dropna().astype(str))
wordcloud_fake = WordCloud(background_color='white',
    width=800, height=500,
    max_font_size=180,
    collocations=False).generate(fake_all_words)

plt.figure(figsize=(10,7))
plt.imshow(wordcloud_fake, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the neutral-news titles.
neutral_titles = dt_neutral.title2_en
# Drop missing titles and coerce to str: str.join raises TypeError as soon as
# it meets a NaN float (which is what an empty CSV cell is read back as).
neutral_all_words = ' '.join(neutral_titles.dropna().astype(str))
wordcloud_neutral = WordCloud(background_color='white',
    width=800, height=500,
    max_font_size=180,
    collocations=False).generate(neutral_all_words)

plt.figure(figsize=(10,7))
plt.imshow(wordcloud_neutral, interpolation='bilinear')
plt.axis("off")
plt.show()

# YiYi's Code

## Import

In [None]:
!pip install transformers
import time
import torch
import math
import numpy
from transformers import BertTokenizer
from transformers import logging
from IPython.display import clear_output
from transformers import BertForMaskedLM
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from transformers import BertForSequenceClassification
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse.construct import rand

# Name of the pretrained checkpoint; the tokenizer is loaded from it here and
# the classification model is loaded from the same name further down.
PRETRAINED_MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

## Data

### Prepare Data

In [None]:
from google.colab import drive
# Mount Google Drive so the /content/drive/MyDrive paths used below exist in
# this runtime; `drive` was imported but never used to mount anything.
drive.mount('/content/drive')
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/cs579data/train.csv')

In [None]:
def get_split(text, max_length=150):
  """Return at most the first `max_length` characters of `text`.

  The limit counts characters, not words. Non-string input (e.g. the NaN
  pandas produces for an empty CSV cell) returns an empty string instead of
  raising TypeError on the slice.
  """
  if not isinstance(text, str):
    return ''
  return text[:max_length]

In [None]:
# Drop rows whose titles are missing or longer than MAX_LENGTH characters
# (presumably to keep the tokenized title pairs short - TODO confirm).
MAX_LENGTH = 150
# Missing titles would make len() fail here and the tokenizer fail later.
df_train = df_train.dropna(subset=['title1_en', 'title2_en'])
df_train = df_train[df_train.title1_en.str.len() <= MAX_LENGTH]
df_train = df_train[df_train.title2_en.str.len() <= MAX_LENGTH]

# The full training set is large and slow to train on. SAMPLE_FRAC controls
# how much of it is kept: 1.0 keeps every row (and only shuffles them);
# lower it, e.g. to 0.7, to train on a subset.
SAMPLE_FRAC = 1.0
df_train = df_train.sample(frac=SAMPLE_FRAC, random_state=9527)

# Renumber the shuffled rows and keep only the columns the model needs.
df_train = df_train.reset_index(drop=True)
df_train = df_train.loc[:, ['title1_en', 'title2_en', 'label']]

# save processed training data to csv file
df_train.to_csv("train.csv", sep=",", index=False)

In [None]:
import random
import pandas
df_len = len(df_train)
#print(df_len)
split = 0.7
inde = math.floor(df_len * split)
# split our data into train/validation sets in 70%/30%:
# rows [0, inde) go to training and rows [inde, end) to validation.
# The validation slice starts at inde (not inde + 1) so no row is dropped.
df_train_train = df_train.iloc[:inde, :]
df_train_val = df_train.iloc[inde:, :]
print(len(df_train_train))
print(len(df_train_val))
df_train_train.to_csv("df_train_train.csv", sep=",", index=False)
df_train_val.to_csv("df_train_val.csv", sep=",", index=False)

In [None]:
# Summarize column dtypes and non-null counts of the training split.
# (The bare type(...) expression that used to precede this displayed nothing,
# because only the last expression of a cell is echoed.)
df_train_train.info()

### Visualize Dataset

In [None]:
def vis_train_val_distribution(df_train_train, df_train_val):
  """Plot the label distribution of the training and validation frames.

  Draws a grouped bar chart of the per-label row counts, then a second
  figure with one pie chart per frame. Both frames need a 'label' column.
  """
  labels = ['unrelated', 'agreed', 'disagreed']
  train_set = [df_train_train[df_train_train['label'] == name]['label'].count() for name in labels]
  valid_set = [df_train_val[df_train_val['label'] == name]['label'].count() for name in labels]

  positions = np.arange(len(labels))  # one bar group per label
  bar_width = 0.35
  offset = bar_width / 2

  bar_fig, bar_ax = plt.subplots()
  bar_ax.bar(positions - offset, train_set, bar_width, label='TrainSet Distribution')
  bar_ax.bar(positions + offset, valid_set, bar_width, label='ValidSet Distribution')

  bar_ax.set_ylabel('Count')
  bar_ax.set_title('Train Valid Dataset Distribution')
  bar_ax.set_xticks(positions, labels)
  bar_ax.legend()

  plt.show()

  pie_fig, pie_axes = plt.subplots(1, 2)
  panels = ((train_set, 'Training Set'), (valid_set, 'Validation Set'))
  for pie_ax, (counts, title) in zip(pie_axes, panels):
    pie_ax.pie(counts, labels=labels, autopct='%1.1f%%',
               shadow=True, startangle=90)
    pie_ax.axis('equal')  # equal aspect ratio keeps the pie circular
    pie_ax.set_title(title)
  plt.show()

In [None]:
def generate_balance_data(df, max_num):
  """Create a balanced data set with at most `max_num` rows per label.

  For each of the three project labels ('unrelated', 'agreed', 'disagreed',
  in that order) the first `max_num` matching rows of `df` are kept. The
  original row index is preserved.

  Args:
    df: DataFrame with a 'label' column.
    max_num: maximum number of rows to keep for each label.

  Returns:
    A new DataFrame made of the per-label slices stacked in label order.
  """
  labels = ['unrelated', 'agreed', 'disagreed']  # as our project only have three labels
  # DataFrame.append no longer exists in pandas 2 and re-copied the frame on
  # every iteration; collect the slices and concatenate them once instead.
  per_label = [df[df['label'] == label].iloc[:max_num] for label in labels]
  return pd.concat(per_label)

# df_train_train = generate_balance_data(df_train_train, 4000)
# Cap the validation split at 1000 rows per label; the training split is left as is.
df_train_val = generate_balance_data(df_train_val, 1000)
# Re-write both CSV files: the fake_news dataset reads "<mode>.csv" from disk,
# so the files have to match the frames above.
df_train_train.to_csv("df_train_train.csv", sep=",", index=False)
df_train_val.to_csv("df_train_val.csv", sep=",", index=False)

In [None]:
# Plot the label distribution of the training split and the balanced validation split.
vis_train_val_distribution(df_train_train, df_train_val)

## Model

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device('cpu')
else:
    # Tell PyTorch to use the GPU.
    device = torch.device('cuda')
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

### Model Class

In [None]:
class fake_news(Dataset):
    """Title-pair dataset for BERT sequence classification.

    Reads "<mode>.csv" from the working directory. The first two columns are
    taken as the two titles; for every mode except "test" the third column is
    the label string, which is mapped to an integer id through `label_map`.

    Each item is a tuple (tokens_tensor, segments_tensor, label_tensor), where
    label_tensor is None in "test" mode.
    """

    def __init__(self, mode, tokenizer):
        """Load the CSV for `mode` and keep the tokenizer used by __getitem__."""
        assert mode in ["df_train_train", "test","df_train_val"] 
        self.mode = mode
        # generate train or test csv file
        self.df = pd.read_csv(mode + ".csv")
        self.len = len(self.df)
        self.label_map = {'agreed': 0, 'disagreed': 1, 'unrelated': 2}
        self.tokenizer = tokenizer  

    @staticmethod
    def _as_text(value):
        """Coerce a CSV cell to text: NaN/None becomes '', anything else str."""
        if isinstance(value, str):
            return value
        if pd.isna(value):
            return ""
        return str(value)

    def __getitem__(self, idx):
        """Return (token ids, segment ids, label id or None) for row `idx`."""
        row = self.df.iloc[idx]
        # An empty title is read back from CSV as a NaN float and a purely
        # numeric one as a number; the tokenizer only accepts strings.
        title1_en = self._as_text(row.iloc[0])
        title2_en = self._as_text(row.iloc[1])
        if self.mode == "test":
            label_tensor = None
        else:
            label_id = self.label_map[row.iloc[2]]
            label_tensor = torch.tensor(label_id)

        # First sentence ==> transfer to BERT tokens and add [SEP] to separate two sentences
        word_pieces = ["[CLS]"]

        tokens_1 = self.tokenizer.tokenize(title1_en)
        word_pieces += tokens_1 + ["[SEP]"]
        len_1 = len(word_pieces)

        # Second sentence ==> transfer to BERT tokens and add [SEP]
        tokens_2 = self.tokenizer.tokenize(title2_en)
        word_pieces += tokens_2 + ["[SEP]"]
        len_2 = len(word_pieces) - len_1

        # transfer token to index
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # Segment ids: 0 for [CLS] + first sentence + its [SEP],
        # 1 for the second sentence + its [SEP]
        segments_tensor = torch.tensor([0] * len_1 + [1] * len_2,dtype=torch.long)     

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        """Number of rows loaded from the CSV."""
        return self.len

In [None]:
def collate_fn(samples):
    """Batch a list of (tokens, segments, label) samples into padded tensors.

    Token and segment sequences are zero-padded to the longest sequence in
    the batch. The attention mask is 1 wherever the padded token id is
    non-zero and 0 elsewhere. Labels are stacked when the first sample has
    one (training / validation data); otherwise label_ids is None (test data).

    Returns:
        (tokens_tensors, segments_tensors, masks_tensors, label_ids)
    """
    token_seqs = []
    segment_seqs = []
    for sample in samples:
        token_seqs.append(sample[0])
        segment_seqs.append(sample[1])

    # Only the first sample decides whether this batch carries labels.
    has_labels = samples[0][2] is not None
    label_ids = torch.stack([sample[2] for sample in samples]) if has_labels else None

    # Pad so every sequence in the batch has the same length.
    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # 1 for real tokens, 0 for padding positions.
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

### Prediction Method

In [None]:
def get_predictions(model, dataloader, compute_acc=False):
    """Run `model` over `dataloader` and collect the predicted class ids.

    The model is switched to eval mode for the duration of the call (so
    dropout does not perturb the predictions) and its previous train/eval
    mode is restored afterwards. Batches are moved to the device the model's
    parameters live on.

    Args:
        model: module called with input_ids / token_type_ids / attention_mask
            whose first output holds the per-class scores.
        dataloader: iterable of batches
            (tokens, segments, masks, labels-or-None).
        compute_acc: when True, also compare predictions against the labels
            found in the fourth position of each batch.

    Returns:
        The 1-D tensor of predictions (None if the dataloader is empty), or
        the tuple (predictions, accuracy) when compute_acc is True. Accuracy
        is 0.0 when no labeled sample was seen.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()

    batch_predictions = []
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for data in dataloader:
                # Drop the missing label slot (test data) and move the rest
                # to the model's device.
                data = [t.to(device) for t in data if t is not None]
                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(input_ids=tokens_tensors,token_type_ids=segments_tensors,attention_mask=masks_tensors)
                logits = outputs[0]
                _, pred = torch.max(logits, 1)

                # compute accuracy
                if compute_acc:
                    labels = data[3]
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                # store the prediction of this batch
                batch_predictions.append(pred)
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    predictions = torch.cat(batch_predictions) if batch_predictions else None
    # We use this in training mode to know the accuracy of our model
    if compute_acc:
        acc = correct / total if total else 0.0
        return predictions, acc
    return predictions

### Initial Training and Validation data set

In [None]:
# Build the training and validation datasets from their CSV files.
trainset, validset = (
    fake_news(split_name, tokenizer=tokenizer)
    for split_name in ("df_train_train", "df_train_val")
)

In [None]:
# Show how many validation rows each label has.
df_train_val['label'].value_counts()

In [None]:
# Batch both splits with the padding collate function.
# trainloader has to be defined here: the training loop iterates over it and
# raises NameError on a fresh kernel while this line is commented out.
trainloader = DataLoader(trainset, batch_size=64,collate_fn=collate_fn)
validloader = DataLoader(validset, batch_size=64,collate_fn=collate_fn)

In [None]:
# Checkpoint name of the pretrained weights and the number of output classes
# (three, matching the three entries of the dataset's label_map).
PRETRAINED_MODEL_NAME = "bert-base-cased"
NUM_LABELS = 3

# Load the pretrained model configured for NUM_LABELS output labels.
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

# Clear the cell output produced while loading.
clear_output()

### Training Model

In [None]:
# move our model over to the selected device
model = model.to(device)
print("device:", device)
# To know the accuracy before training
#_, acc = get_predictions(model, validloader, compute_acc=True)
#print("classification acc:", acc)

start = time.time()

# initialize optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

NUM_EPOCHS = 5
# Training stops early once validation accuracy falls this far below the
# accuracy of the previous epoch.
ACC_DROP_TOLERANCE = 0.05

pre_acc = 0.0
for epoch in range(NUM_EPOCHS):
    # (re)activate training mode: validation below switches the model to eval
    model.train()
    running_loss = 0.0
    for data in trainloader:
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]
        optimizer.zero_grad()

        # forward pass
        outputs = model(input_ids=tokens_tensors,
                token_type_ids=segments_tensors,
                attention_mask=masks_tensors,
                labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()

        # store the batch loss
        running_loss += loss.item()

    # Use validation dataset to evaluate our model; eval mode turns dropout
    # off so the accuracy is not perturbed by it.
    model.eval()
    _, acc = get_predictions(model, validloader, compute_acc=True)
    # stop training when accuracy drops by the tolerance or more
    if acc <= pre_acc - ACC_DROP_TOLERANCE:
        # Report the tolerance that is actually applied (the old message said 0.2).
        print("accuracy = %.4f is less than previous accuracy = %.4f - %.2f" % (acc, pre_acc, ACC_DROP_TOLERANCE))
        break
    # record previous accuracy
    pre_acc = acc
    # running_loss is the sum of the batch losses of this epoch
    print('epoch %d ==> loss: %.3f, accuracy: %.4f' %(epoch + 1, running_loss, acc))

print("The time used to execute this is given below")

end = time.time()

print(end - start)
# NOTE: this pickles the whole model object, so loading it back needs the
# same class definitions to be importable.
torch.save(model, '/content/drive/MyDrive/Colab Notebooks/cs579data/9000train.pt')

In [None]:
# model = torch.load('/content/drive/MyDrive/Colab Notebooks/cs579data/full_train.pt')

In [None]:
# List every entry of the model's state_dict together with its tensor size.
print("Model's state_dict:")
for param_tensor, param_value in model.state_dict().items():
    print(param_tensor, "\t", param_value.size())

### Prediction

#### Prepare test data

In [None]:
# Load the raw test set and truncate both titles with get_split.
# (A MAX_LENGTH = 200 constant used to be set here, but nothing read it:
# the truncation length is whatever get_split applies.)
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/cs579data/test.csv')
print(len(df_test))
df_test['title1_en'] = df_test['title1_en'].apply(get_split)
df_test['title2_en'] = df_test['title2_en'].apply(get_split)
# Keep the two titles first and 'id' last: the dataset class reads the titles
# by position and the submission step reads 'id' by name.
df_test = df_test.loc[:, ['title1_en', 'title2_en','id']]
print(len(df_test))
df_test.to_csv("test.csv", sep=",", index=False)

In [None]:
# Keep a copy of the current df_train_val frame so every experiment can be
# re-validated on the same rows (valid_model reads this file).
# NOTE(review): df_train_val was already reduced by generate_balance_data
# earlier in the notebook, so this is a copy of the balanced subset, not of
# the original 30% split.
df_train_val.to_csv("df_train_val_test.csv", sep=",", index=False)

In [None]:
def valid_model():
  """Evaluate the current global `model` on the stored validation copy.

  Reads "df_train_val_test.csv", caps it at 1000 rows per label, overwrites
  "df_train_val.csv" with the result, prints the label counts and finally
  prints the classification accuracy.
  """
  balanced_val = generate_balance_data(pd.read_csv("df_train_val_test.csv"), 1000)
  balanced_val.to_csv("df_train_val.csv", sep=",", index=False)
  print(balanced_val['label'].value_counts())

  # The dataset reads "df_train_val.csv" back from disk.
  loader = DataLoader(
      fake_news("df_train_val", tokenizer=tokenizer),
      batch_size=64,
      collate_fn=collate_fn,
  )
  accuracy = get_predictions(model, loader, compute_acc=True)[1]
  print("classification acc:", accuracy)
valid_model()

#### Predict

In [None]:
# Build the test dataset/loader and predict a class id for every test row.
testset = fake_news("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=50,collate_fn=collate_fn)

predictions = get_predictions(model, testloader)

# transform the label ids back to the label words
index_map = {v: k for k, v in testset.label_map.items()}

# produce the result file: one row per test id with its predicted label
df = pd.DataFrame({"label": predictions.tolist()})
df['label'] = df.label.apply(lambda x: index_map[x])
df_pred = pd.concat([testset.df.loc[:, ["id"]], df.loc[:, 'label']], axis=1)
# The file name previously started with a stray curly quote character.
df_pred.to_csv('/content/drive/MyDrive/Colab Notebooks/cs579data/submission.csv', index=False)