**LUO YIFENG**

# Data

In [60]:
import re

def clean_text(text):
    """Normalize one raw tweet string.

    Steps, applied in this order:
    1. Replace every http/https link with the literal token ``URL``.
    2. Remove the HTML entities ``&amp;`` and ``&gt;``, double quotes, and the
       placeholder markers ``$MENTION$``, ``$ URL $``, ``$URL$`` and ``<end>``.
    3. Collapse every run of whitespace into a single space and trim the ends.
    """
    url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    text = re.sub(url_pattern, "URL", text)
    # The removals run sequentially, so the order of this tuple matters.
    for junk in ("&amp;", "&gt;", "\"", "$MENTION$", "$ URL $", "$URL$", "<end>"):
        text = text.replace(junk, "")
    # split()/join also strips leading and trailing whitespace.
    return " ".join(text.split())

# word level
def tokenize(lines):
    """Split every text line into a list of whitespace-separated tokens."""
    token_lists = []
    for sentence in lines:
        token_lists.append(sentence.split())
    return token_lists

def clean(data):
    """Run ``clean_text`` over every entry of ``data`` in place and return it."""
    for position, raw in enumerate(data):
        data[position] = clean_text(raw)
    return data

In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled data; the 'Text' column is used as the model input and
# the 'Label' column as the target.
df = pd.read_csv('data.csv')
X = df.Text.to_list()

# Encode the string labels as integers. An unexpected label is rejected
# explicitly: silently skipping it would leave y shorter than X and shift
# every following label onto the wrong text.
label_to_id = {'non-rumors': 0, 'rumors': 1}
unknown_labels = sorted(set(df.Label) - set(label_to_id), key=str)
if unknown_labels:
    raise ValueError(f"Unexpected labels in data.csv: {unknown_labels}")
y = [label_to_id[label] for label in df.Label]

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

# clean() rewrites the lists in place and returns the same list objects.
X_train = clean(X_train)
X_test = clean(X_test)

In [62]:
# Word-level tokenization: each cleaned text becomes a list of
# whitespace-separated tokens.
X_train_tokenize = tokenize(X_train)
X_test_tokenize = tokenize(X_test)

### Build Vocab

In [63]:
import collections

class Vocab:
    """Two-way mapping between text tokens and integer indices."""

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary from a token corpus.

        Index 0 is always ``'<unk>'``, followed by ``reserved_tokens`` and then
        the corpus tokens in descending frequency order. Tokens that occur
        fewer than ``min_freq`` times are left out.

        Defined in :numref:`sec_text_preprocessing`
        """
        tokens = [] if tokens is None else tokens
        reserved_tokens = [] if reserved_tokens is None else reserved_tokens

        # Most frequent tokens first.
        frequencies = count_corpus(tokens)
        self._token_freqs = sorted(
            frequencies.items(), key=lambda pair: pair[1], reverse=True)

        # Slot 0 is reserved for the unknown token.
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {}
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx

        for token, freq in self._token_freqs:
            if freq < min_freq:
                # The list is sorted, so every remaining token is rarer still.
                break
            if token in self.token_to_idx:
                continue
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

    def __len__(self):
        """Return the number of entries, including ``'<unk>'``."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to the unknown index.
        """
        if isinstance(tokens, (list, tuple)):
            return [self.__getitem__(token) for token in tokens]
        return self.token_to_idx.get(tokens, self.unk)

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to tokens."""
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[index] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):
        """Index of the unknown token."""
        return 0

    @property
    def token_freqs(self):
        """``(token, count)`` pairs sorted by descending count."""
        return self._token_freqs
    
    
def count_corpus(tokens):
    """Count token frequencies in a flat token list or a list of token lists."""
    is_nested = len(tokens) == 0 or isinstance(tokens[0], list)
    if is_nested:
        # Flatten the per-line token lists into one sequence.
        flattened = []
        for line in tokens:
            flattened.extend(line)
        tokens = flattened
    return collections.Counter(tokens)

In [64]:
# Build the vocabulary from the training tokens only; tokens seen fewer than
# 5 times are left out and therefore map to '<unk>' (index 0) on lookup.
# NOTE(review): no reserved_tokens are passed, so '<pad>' is not a vocabulary
# entry and vocab['<pad>'] falls back to the '<unk>' index — confirm intended.
vocab = Vocab(X_train_tokenize, min_freq=5)

In [65]:
def truncate_pad(line, num_steps, padding_token):
    """Return ``line`` cut or right-padded to exactly ``num_steps`` items."""
    shortfall = num_steps - len(line)
    if shortfall < 0:
        # Too long: keep only the leading num_steps items.
        return line[:num_steps]
    return line + [padding_token] * shortfall

In [66]:
import torch

# Convert each tokenized text to vocabulary indices and force every sequence
# to exactly 500 entries so the rows stack into one 2-D tensor.
# vocab['<pad>'] is looked up like any other token, so it yields the unknown
# index when '<pad>' is not in the vocabulary.
train_features = torch.tensor([truncate_pad(
        vocab[line], 500, vocab['<pad>']) for line in X_train_tokenize])
test_features = torch.tensor([truncate_pad(
    vocab[line], 500, vocab['<pad>']) for line in X_test_tokenize])

In [69]:
# Display the padded index tensor built from the training texts.
train_features

tensor([[  0,   9,   0,  ...,   0,   0,   0],
        [139,  24, 721,  ...,   0,   0,   0],
        [  0,  28,  26,  ...,   0,   0,   0],
        ...,
        [ 80, 137,  39,  ...,   0,   0,   0],
        [  0,  87,  25,  ...,   0,   0,   0],
        [526,   0,   0,  ...,   0,   0,   0]])

In [72]:
# NOTE(review): `train_data` is not defined in any cell shown here; it looks
# like a leftover from an earlier kernel session — confirm before re-running.
train_data[1]

'BREAKING: French media reporting two suspects of #CharlieHebdo attack are killed | More at: http://t.co/jKrAfvH9sT http://t.co/oaZH2sz6fO'

In [114]:
from torch.utils import data

def load_array(data_arrays, batch_size, is_train=True):
    """Wrap tensors in a ``DataLoader``; batches are shuffled only when training."""
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(
        tensor_dataset, batch_size=batch_size, shuffle=is_train)
    return loader

# Mini-batch iterators with batch size 32: training batches are shuffled,
# test batches keep their original order.
train_iter = load_array((train_features, torch.tensor(y_train)), 32)
test_iter = load_array((test_features, torch.tensor(y_test)), 32, is_train=False)

 # Model

In [76]:
import torch.nn as nn

class BiLstm(nn.Module):
    """Bidirectional LSTM classifier over sequences of token indices."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.LSTM(
            embed_size, num_hiddens, num_layers=num_layers, bidirectional=True)
        # Each time step yields 2 * num_hiddens features (both directions) and
        # the classifier sees the first and last steps concatenated: 4x.
        self.decoder = nn.Linear(4 * num_hiddens, num_classes)

    def forward(self, x):
        """Return class scores for a batch of index sequences.

        ``x`` is transposed before the embedding lookup because the LSTM is
        built with the default time-major layout (``batch_first=False``).
        """
        time_major = self.embedding(x.T)
        self.encoder.flatten_parameters()
        hidden_seq, _ = self.encoder(time_major)
        first_step, last_step = hidden_seq[0], hidden_seq[-1]
        return self.decoder(torch.cat((first_step, last_step), dim=1))

In [78]:
# Model hyperparameters: embedding width, LSTM hidden size, number of stacked
# LSTM layers, and number of output classes.
embed_size, num_hiddens, num_layers, num_classes = 100, 100, 2, 2

def try_all_gpus():
    """Return every visible CUDA device, or ``[cpu]`` when there is none.

    Defined in :numref:`sec_use_gpu`"""
    gpu_count = torch.cuda.device_count()
    if gpu_count == 0:
        return [torch.device('cpu')]
    return [torch.device(f'cuda:{index}') for index in range(gpu_count)]

def init_weights(m):
    """Xavier-initialize the weight matrices of ``nn.Linear`` and ``nn.LSTM``.

    Meant to be passed to ``Module.apply``. Biases and every other module
    type are left untouched.
    """
    module_type = type(m)
    if module_type is nn.Linear:
        nn.init.xavier_uniform_(m.weight)
    elif module_type is nn.LSTM:
        for name in m._flat_weights_names:
            if "weight" not in name:
                continue  # skip bias vectors
            nn.init.xavier_uniform_(m._parameters[name])
                
# Build the classifier sized to the vocabulary, then apply init_weights to
# every submodule. The trailing semicolon suppresses the notebook's echo of
# the value returned by apply().
net = BiLstm(len(vocab), embed_size, num_hiddens, num_layers, num_classes)
net.apply(init_weights);

## Word Embeddings

In [101]:
import hashlib
import os
import tarfile
import zipfile

import requests

In [102]:
# Registry of downloadable datasets: name -> (download URL, expected SHA-1).
DATA_HUB = dict()
# Base URL that the registered archive names are appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
DATA_HUB['glove.6b.100d'] = (DATA_URL + 'glove.6B.100d.zip',
                                 'cd43bfb07e44e6f27cbcc7bc9ae3d80284fdaf5a')
def _sha1_of_file(path, chunk_size=1048576):
    """Return the hex SHA-1 digest of the file at ``path``, read in chunks."""
    digest = hashlib.sha1()
    with open(path, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def download(name, cache_dir=os.path.join('..', 'data')):
    """Download a file registered in DATA_HUB and return the local filename.

    A file already present in ``cache_dir`` is reused when its SHA-1 matches
    the registered hash; otherwise it is downloaded (again).

    Args:
        name: Key of the dataset in ``DATA_HUB``.
        cache_dir: Directory the file is stored in; created if missing.

    Returns:
        Path of the local file.

    Raises:
        AssertionError: If ``name`` is not registered in ``DATA_HUB``.
        requests.HTTPError: If the server answers with an error status.

    Defined in :numref:`sec_kaggle_house`"""
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}."
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname) and _sha1_of_file(fname) == sha1_hash:
        return fname  # Hit cache
    print(f'Downloading {fname} from {url}...')
    # Stream the body to disk chunk by chunk instead of buffering the whole
    # archive in memory. The timeout bounds each connect/read wait, not the
    # total transfer time.
    with requests.get(url, stream=True, verify=True, timeout=60) as r:
        # Fail before touching the cache file so an HTTP error page is never
        # saved under the archive's name.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname

In [103]:
def download_extract(name, folder=None):
    """Download and extract a zip/tar file.

    Args:
        name: Key of the dataset in ``DATA_HUB``.
        folder: Optional sub-directory name; when given, the returned path is
            ``<archive directory>/<folder>``.

    Returns:
        ``<archive directory>/<folder>`` if ``folder`` is given, otherwise the
        archive path with its last extension removed.

    Raises:
        AssertionError: If the archive is neither a zip nor a tar file.

    Defined in :numref:`sec_kaggle_house`"""
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        open_archive = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        open_archive = tarfile.open
    else:
        # Raised explicitly (not via `assert False`) so the check still runs
        # under `python -O`; the exception type is unchanged.
        raise AssertionError('Only zip/tar files can be extracted.')
    # The context manager closes the archive handle even if extraction fails.
    with open_archive(fname, 'r') as fp:
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

In [104]:
class TokenEmbedding:
    """Pretrained token embedding table loaded from a text vector file."""

    def __init__(self, embedding_name):
        """Download (if needed) and load the embedding registered under
        ``embedding_name``.

        Defined in :numref:`sec_synonyms`"""
        self.idx_to_token, self.idx_to_vec = self._load_embedding(
            embedding_name)
        # Row 0 is the all-zero vector used for unknown tokens.
        self.unknown_idx = 0
        self.token_to_idx = {token: idx for idx, token in
                             enumerate(self.idx_to_token)}

    @staticmethod
    def _parse_vectors(lines):
        """Parse ``token v1 v2 ...`` lines into parallel token/vector lists.

        Lines with fewer than two numeric fields are skipped; this drops
        header rows such as the first line of fastText files.
        """
        tokens, vectors = [], []
        for line in lines:
            fields = line.rstrip().split(' ')
            token, values = fields[0], [float(field) for field in fields[1:]]
            if len(values) > 1:
                tokens.append(token)
                vectors.append(values)
        return tokens, vectors

    def _load_embedding(self, embedding_name):
        """Return ``(idx_to_token, idx_to_vec)`` with ``'<unk>'`` at index 0."""
        data_dir = download_extract(embedding_name)
        # GloVe website: https://nlp.stanford.edu/projects/glove/
        # fastText website: https://fasttext.cc/
        # Read as UTF-8 explicitly: the vocabulary contains non-ASCII tokens
        # and the platform default encoding is not UTF-8 everywhere.
        with open(os.path.join(data_dir, 'vec.txt'), 'r',
                  encoding='utf-8') as f:
            tokens, vectors = self._parse_vectors(f)
        idx_to_token = ['<unk>'] + tokens
        # Prepend a zero vector so unknown tokens have a row of their own.
        idx_to_vec = [[0] * len(vectors[0])] + vectors
        return idx_to_token, torch.tensor(idx_to_vec)

    def __getitem__(self, tokens):
        """Return the stacked vectors for an iterable of tokens.

        Tokens missing from the table get the unknown (all-zero) vector.
        """
        indices = [self.token_to_idx.get(token, self.unknown_idx)
                   for token in tokens]
        vecs = self.idx_to_vec[torch.tensor(indices)]
        return vecs

    def __len__(self):
        """Return the number of rows, including the ``'<unk>'`` row."""
        return len(self.idx_to_token)

In [105]:
# Load the embedding registered under 'glove.6b.100d' in DATA_HUB; the
# archive is downloaded on first use and served from the local cache after.
glove_embedding = TokenEmbedding('glove.6b.100d')

In [106]:
# Look up one pretrained vector per vocabulary token, in vocabulary index
# order. Tokens absent from the embedding table get its all-zero unknown row.
embeds = glove_embedding[vocab.idx_to_token]
embeds.shape

torch.Size([1030, 100])

In [108]:
# Overwrite the embedding layer with the pretrained vectors and freeze it so
# it receives no gradient updates during training.
net.embedding.weight.data.copy_(embeds)
net.embedding.weight.requires_grad = False

In [112]:
# Training configuration: Adam with learning rate 0.01 for 5 epochs.
lr, num_epochs = 0.01, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
# NOTE(review): reduction="none" makes the criterion return one loss value
# per sample rather than a scalar, so the result has to be reduced
# (e.g. .mean() or .sum()) before backward() can be called on it.
loss_function = nn.CrossEntropyLoss(reduction="none")

In [115]:
# Number of mini-batches in one pass over the training iterator.
len(train_iter)

71

In [None]:
for epoch in range(num_epochs):
    print('Epoch: [{}/{}]'.format(epoch + 1, num_epochs))
    for i, (X, y) in enumerate(train_iter):
        # Forward pass: one row of class scores per sample in the batch.
        y_hat = net(X)
        # The criterion is configured with reduction="none" and so returns a
        # per-sample loss vector; backward() needs a scalar, so average it.
        # (.mean() is a no-op if the criterion already returns a scalar.)
        loss = loss_function(y_hat, y).mean()

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # Back-propagate.
        loss.backward()
        # Update the parameters.
        optimizer.step()

        if (i + 1) % 10 == 0:
            total_steps = len(train_iter)
            # Call cpu() to get the label tensor; previously the bound
            # method itself was stored.
            # NOTE(review): the periodic logging is unfinished — neither
            # value is used yet.
            true = y.data.cpu()
            
            
        
        