# NLP Project

### Loading data

In [1]:
import json
import pandas as pd


df = pd.read_json('train.data.jsonl', orient='values', lines=True, encoding='utf-8')
dev = pd.read_json('dev.data.jsonl', orient='values', lines=True, encoding='utf-8')

y_train = pd.read_json('train.label.json',orient='index', convert_dates = False, encoding='utf-8',convert_axes=False)
y_dev = pd.read_json('dev.label.json',orient='index', convert_dates = False, encoding='utf-8',convert_axes=False)

test = pd.read_json('test.data.jsonl', orient='values', lines=True, encoding='utf-8')

In [3]:
#check if first tweet ID match label
sum([0 if y==x else 1 for x,y in zip(df.iloc[:,0].apply(lambda x: x['id_str']), y_train.index)])

0

In [4]:
df.iloc[0,0]

{'contributors': None,
 'truncated': False,
 'text': 'How to respond to the murderous attack on Charlie Hebdo? Every newspaper in the free world should print this. http://t.co/sC2ot63F6j',
 'in_reply_to_status_id': None,
 'id': 552800070199148544,
 'favorite_count': 77,
 'source': '<a href="http://www.apple.com" rel="nofollow">iOS</a>',
 'retweeted': False,
 'coordinates': None,
 'entities': {'symbols': [],
  'user_mentions': [],
  'hashtags': [],
  'urls': [],
  'media': [{'expanded_url': 'http://twitter.com/Heresy_Corner/status/552800070199148544/photo/1',
    'display_url': 'pic.twitter.com/sC2ot63F6j',
    'url': 'http://t.co/sC2ot63F6j',
    'media_url_https': 'https://pbs.twimg.com/media/B6vwvCVIQAASBJx.jpg',
    'id_str': '552800070153027584',
    'sizes': {'small': {'h': 408, 'resize': 'fit', 'w': 340},
     'large': {'h': 472, 'resize': 'fit', 'w': 393},
     'medium': {'h': 472, 'resize': 'fit', 'w': 393},
     'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
    'indices': 

In [2]:
y_train.columns = ['label']
y_train['index'] = y_train.index
y_train['lab'] = [1 if i=='rumour' else 0 for i in y_train.label]
y_train

Unnamed: 0,label,index,lab
552800070199148544,non-rumour,552800070199148544,0
544388259359387648,non-rumour,544388259359387648,0
552805970536333314,non-rumour,552805970536333314,0
525071376084791297,rumour,525071376084791297,1
498355319979143168,non-rumour,498355319979143168,0
...,...,...,...
524959027516932096,rumour,524959027516932096,1
524940940721418240,non-rumour,524940940721418240,0
580331453889708032,rumour,580331453889708032,1
552820384039706624,non-rumour,552820384039706624,0


In [3]:
y_dev.columns = ['label']
y_dev['index'] = y_dev.index
y_dev['lab'] = [1 if i=='rumour' else 0 for i in y_dev.label]
y_dev

Unnamed: 0,label,index,lab
553588913747808256,rumour,553588913747808256,1
524949003834634240,non-rumour,524949003834634240,0
553221281181859841,non-rumour,553221281181859841,0
580322346508124160,rumour,580322346508124160,1
544307417677189121,rumour,544307417677189121,1
...,...,...,...
525025279803424768,rumour,525025279803424768,1
552784600502915072,non-rumour,552784600502915072,0
499696525808001024,non-rumour,499696525808001024,0
580320612155060224,rumour,580320612155060224,1


#### Graph building function

In [4]:
class node(object):
    """
    Tree data structure.
    """
    def __init__(self, value, datas, children = []):
        """
        Constructor. Contains:
            value as id_str
            datas as tweet dictionary
            children as tweet responses
        """
        self.value = value
        self.children = children
        self.datas = datas

    def printed(self, level=0):
        """
        Function to visualize the tree
        """
        print("\t"*level+self.value+"\n")
        if len(self.children)>0:
            for child in self.children:
                child.printed(level+1)

                
def bfs(value, root):
    """
    Breadth first search to look for a node in a tree given its id_str. 
    Start from the root of a given tree.
    """
    queue = []
    queue.append(root)
    while len(queue) > 0:
        current = queue.pop(0)
        if current.value == value:
            return current
        for c in current.children:
            queue.append(c)
    return None

def preOrder(node, by_deep):
    """
    Flatten a tree using pre-orden algorithm.
    """
    text=node.datas['text']
    #node.children.sort(key= lambda x: datetime.strftime(datetime.strptime(x.datas['created_at'],'%a %b %d %H:%M:%S +0000 %Y'), '%Y-%m-%d %H:%M:%S'), reverse= True)
    if by_deep:
        node.children.sort(key=lambda x: len(x.children), reverse=True)
    else:
        node.children.sort(key= lambda x: x.datas['favorite_count'], reverse= True)
    
    for child in node.children:
        text += preOrder(child, by_deep)
    return ' ' + text

def getGraph(df,k):  
    """
    Build a tree given a list of id_str and replay to id_str. 
    Its iterates through the tree until all nodes in the list have been added.
    """
    root = node(df.iloc[k,0]['id_str'], datas = df.iloc[k,0])
    queue = [i for i in df.iloc[k,1:] if i is not None]

    while queue:
        current = queue.pop(0)
        parent = bfs(current['in_reply_to_status_id_str'], root)
        if parent is not None:
            parent.children = parent.children + [node(current['id_str'], datas = current)]
        elif current['in_reply_to_status_id_str'] in [i['id_str'] for i in queue]:
            queue.append(current)
        else:
            #creates an empty node and assumes its connects directly to root
            root.children = root.children + [node(current['in_reply_to_status_id_str'], datas = {'favorite_count':0, 'text':' ', 'id_str': current['in_reply_to_status_id_str']})]
            queue.append(current)
    
    return root

def simpleConcat(df,k, n=None):
    """
    Concatenate event tweets text in the order of the original dataset
    Takes as arguments de dataframe, event row and the maximum number of tweet to concatenate.
    """
    text = [i['text'] for i in df.iloc[k,:] if i is not None]
    return ' '.join(text[:n])

def sortByDate(df,k,n=None):
    """
    Concatenate event tweets text ordered by time stamp.
    Takes as arguments de dataframe, event row and the maximum number of tweet to concatenate.
    """
    from datetime import datetime
    twt = pd.DataFrame([t for t in df.iloc[k,:] if t is not None])
    twt['created_at'] = twt.created_at.apply(lambda x: datetime.strftime(datetime.strptime(x,'%a %b %d %H:%M:%S +0000 %Y'), '%Y-%m-%d %H:%M:%S')) 
    twt = twt.sort_values(by='created_at')
    
    return ' '.join(twt[:n].text)

def deepMetric(node):
    """
    Takes as argument a root node of a tree data structure.
    Returns the maximum depth or the height of a tree and the number of nodes.
    """
    maxDeep = 0
    n_nodes = 1
    for c in node.children:
        current, n_n = deepMetric(c)
        n_nodes += n_n
        if maxDeep < current:
            maxDeep = current
    return maxDeep + 1, n_nodes

def ave_deep(node):
    """
    Takes as argument a root node of a tree data structure.
    Takes the deepMetric outputs and calculate the depth normalized by the total number of nodes.
    """
    max_deep, n_nodes = deepMetric(node)
    return max_deep/n_nodes
    
def low2high(node):
    """
    Takes as argument a root node of a tree data structure.
    Returns the number of directed edges were user source is less popular than the target and the number of edges.
    """
    n_edges = 1
    n_low2high = 0
    for c in node.children:
        n_e, n_l2h = low2high(c)
        n_edges += n_e
        n_low2high += n_l2h
        if 'user' in node.datas and 'user' in c.datas:
            if node.datas['user']['followers_count'] < c.datas['user']['followers_count']:
                n_low2high += 1
            
    return n_edges, n_low2high

def lowDifussion(node):
    """
    Takes as argument a root node of a tree data structure.
    Calculate the ratio went a tweet goes from a low to high difussion node versus the total tweets.
    It uses output from low2high function.
    """
    edges, low = low2high(node)
    return low/edges

def leaf_count(node):
    """
    Takes as argument a root node of a tree data structure.
    Counts the number of leafs in a tree and the total number of nodes.
    """
    count = 0
    total_c = 1
    for c in node.children:
        part_count, total = leaf_count(c)
        count += part_count
        total_c += total
    if len(node.children) < 1:
        count +=1
    return count, total_c

def isolated_node(node):
    """
    Takes as argument a root node of a tree data structure.
    Calcultes the isolation ratio, the number of tweets without a replay versus the totla number of tweets.
    """
    leaf, total = leaf_count(node)
    return leaf/total

def nltk_sentiment(sentence):
    """
    Calculates the sentiment of a text using Vader sentiment analyzer from NLTK.
    """
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    
    nltk_sentiment = SentimentIntensityAnalyzer()
    score = nltk_sentiment.polarity_scores(sentence)
    return score['compound']

def sentimentSeq(df,k):
    """
    Calculates the sentiment of each text in a event.
    """
    return [nltk_sentiment(t['text']) for t in df.iloc[k,:] if t is not None]

def sentDate(df, k):
    """
    It creates a list with the time stamp of each tweet in an event.
    """
    from datetime import datetime
    twt = pd.DataFrame([t for t in df.iloc[k,:] if t is not None])
    twt['created_at'] = twt.created_at.apply(lambda x: datetime.strftime(datetime.strptime(x,'%a %b %d %H:%M:%S +0000 %Y'), '%Y-%m-%d %H:%M:%S')) 
    #twt = twt.sort_values(by='created_at')
    return twt.created_at

def favCalculation(df,k):
    """
    Calculates the proportion of likes of the source tweet versus the average likes of the rest of the responses.
    """
    SMOOTHING = 0.00001
    fav = [i['favorite_count'] for i in df.iloc[k,1:] if i is not None]
    mean = np.mean(fav) if len(fav) > 0 else 0
    return np.log(1+df.iloc[k,0]['favorite_count']/(SMOOTHING + mean)) 


### Extract and format data to pytorch consumible

In [9]:
from torchtext.data.utils import get_tokenizer
from transformers import BertTokenizer

token = get_tokenizer('spacy', 'en_core_web_sm')


def transformDF(df, label, tokenize=True, concatenation = True, concat_type = 'simple', n_tweets = None, order_by_deep = False):
    """
    Transform and extract tweets data to a pandas dataframe, ready to export to json.
    Takes the original Dataframe and the list of labels as arguments.
    
    Arguments: df, label, tokenize=True, concat_type = 'simple', n_tweets = -1, order_by_deep = False
    """
    
    index = []
    string = []
    source = []
    has_tag = []
    has_link = []
    fav=[]
    sent = []
    sent_date = []
    sent_total = []
    deep = []
    low=[]
    isolation =[]
    frifo_ratio = [] #friends to followers ratio

    for k in range(df.shape[0]):
        
        index.append(df.iloc[k,0]['id_str'])
        
        if concatenation:
            # text feature
            if concat_type == 'simple':
                concat = simpleConcat(df,k,n_tweets)
            elif concat_type == 'date':
                concat = sortByDate(df,k,n_tweets)
            else:
                concat = preOrder(getGraph(df,k), by_deep=order_by_deep)
            if tokenize:
                concat = token(concat)
                concat = [i.lower() for i in concat]
            string.append(concat)
        else:
            #Create sequenses
            string.append([i['text'] for i in df.iloc[k,:] if i is not None])
        
        #graph features
        tree = getGraph(df,k)
        deep.append(ave_deep(tree))
        low.append(lowDifussion(tree))
        isolation.append(isolated_node(tree))
        
        #user features
        source.append(df.iloc[k,0]['user']['name'])
        frifo_ratio.append(df.iloc[k,0]['user']['friends_count']/(df.iloc[k,0]['user']['followers_count']+0.000001))
        fav.append(favCalculation(df,k))
        
        #Special content features
        has_tag.append(1 if '#' in df.iloc[k,0]['text'] else 0)
        has_link.append(1 if 'http' in df.iloc[k,0]['text'] else 0)
        
        # Sentiment features
        #sent.append(sentimentSeq(df,k))
        #sent_date.append(sentDate(df,k))
        #if tokenize:
        #    sent_total.append(nltk_sentiment(" ".join(concat)))
        #else:
            #sent_total.append(nltk_sentiment(concat))
          
        
    return pd.DataFrame({'text': string,'source': source, 'ratio':frifo_ratio, 'tag': has_tag, 'isolation':isolation,
                             #'sentiment': sent,'sent_date': sent_date,
                         'link': has_link, 'fav': fav, 'deep': deep, 'low': low,
                             'label': label, 'index': index})
        

In [17]:
train1 = transformDF(df, y_train.lab)
devel = transformDF(dev, y_dev.lab)
test1 = transformDF(test, test.index) #use index to fill label

#Export to json pytorchtext readable format

#train1.to_json('train.json', orient = 'records', lines = True)
#devel.to_json('dev.json', orient = 'records', lines = True)
#test1.to_json('test.json', orient = 'records', lines = True)

### Measure features MI

In [18]:
from sklearn.feature_selection import mutual_info_classif

def mutualInfoScores(df, y):
    res = zip(df.columns,
                   mutual_info_classif(df, y, discrete_features=False)
                   )
    sorted_scores = sorted(res, key= lambda k: k[1], reverse=True)
    return sorted_scores


#Get scores for each feature
#ignore trackID, title and tags fetures
scores = mutualInfoScores(train1.iloc[:,2:9], train1.label)

In [19]:
scores

[('ratio', 0.07323665624978948),
 ('fav', 0.04196261288922831),
 ('low', 0.012337884923472764),
 ('link', 0.008326876136909744),
 ('tag', 0.004329119186485153),
 ('deep', 0.002757646357453458),
 ('isolation', 0.0017935966347604193)]

In [22]:
{v: train1.label.tolist().count(v) for v in train1.label}

{0: 3058, 1: 1583}

### Develop multiple datasets for ssystematic exp

In [45]:
transformDF?

[1;31mSignature:[0m
[0mtransformDF[0m[1;33m([0m[1;33m
[0m    [0mdf[0m[1;33m,[0m[1;33m
[0m    [0mlabel[0m[1;33m,[0m[1;33m
[0m    [0mtokenize[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mconcatenation[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mconcat_type[0m[1;33m=[0m[1;34m'simple'[0m[1;33m,[0m[1;33m
[0m    [0mn_tweets[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0morder_by_deep[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Transform and extract tweets data to a pandas dataframe, ready to export to json.
Takes the original Dataframe and the list of labels as arguments.

Arguments: df, label, tokenize=True, concat_type = 'simple', n_tweets = -1, order_by_deep = False
[1;31mFile:[0m      c:\users\framo\google drive\0 mit\0sem 1 2021\nlp\project\<ipython-input-44-549f3189ed36>
[1;31mType:[0m      function


In [1]:
train1.text.apply(lambda x: len(x))

In [57]:
### NO TOKEN

##simpleconcat
train1 = transformDF(df, y_train.lab, tokenize=False)
dev1 = transformDF(dev, y_dev.lab, tokenize=False)
train1.to_json('trainSimpleNT.json', orient = 'records', lines = True)
dev1.to_json('devSimpleNT.json', orient = 'records', lines = True)

##sort_by_date
train2 = transformDF(df, y_train.lab, concat_type = 'date', tokenize=False)
dev2 = transformDF(dev, y_dev.lab, concat_type = 'date', tokenize=False)
train2.to_json('trainDateNT.json', orient = 'records', lines = True)
dev2.to_json('devDateNT.json', orient = 'records', lines = True)

In [60]:
##tree by replies
train3 = transformDF(df, y_train.lab, concat_type = 'tree', order_by_deep = True, tokenize=False)
dev3 = transformDF(dev, y_dev.lab, concat_type = 'tree', order_by_deep = True, tokenize=False)
train3.to_json('trainDepthNT.json', orient = 'records', lines = True)
dev3.to_json('devDepthNT.json', orient = 'records', lines = True)

##tree by popularity
train4 = transformDF(df, y_train.lab, concat_type = 'tree', order_by_deep = False, tokenize=False)
dev4 = transformDF(dev, y_dev.lab, concat_type = 'tree', order_by_deep = False, tokenize=False)
train4.to_json('trainLikeNT.json', orient = 'records', lines = True)
dev4.to_json('devLikeNT.json', orient = 'records', lines = True)

In [None]:
# TOKENIZED DATA

##simpleconcat
train1 = transformDF(df, y_train.lab)
dev1 = transformDF(dev, y_dev.lab)
train1.to_json('trainSimple.json', orient = 'records', lines = True)
dev1.to_json('devSimple.json', orient = 'records', lines = True)

##sort_by_date
train2 = transformDF(df, y_train.lab, concat_type = 'date')
dev2 = transformDF(dev, y_dev.lab, concat_type = 'date')
train2.to_json('trainDate.json', orient = 'records', lines = True)
dev2.to_json('devDate.json', orient = 'records', lines = True)

##tree by replies
train3 = transformDF(df, y_train.lab, concat_type = 'tree', order_by_deep = True)
dev3 = transformDF(dev, y_dev.lab, concat_type = 'tree', order_by_deep = True)
train3.to_json('trainDepth.json', orient = 'records', lines = True)
dev3.to_json('devDepth.json', orient = 'records', lines = True)

##tree by popularity
train4 = transformDF(df, y_train.lab, concat_type = 'tree', order_by_deep = False)
dev4 = transformDF(dev, y_dev.lab, concat_type = 'tree', order_by_deep = False)
train4.to_json('trainLike.json', orient = 'records', lines = True)
dev4.to_json('devLike.json', orient = 'records', lines = True)

In [None]:
#sequense DATA

##simple sequence
train5 = transformDF(df, y_train.lab, concatenation = False)
dev5 = transformDF(dev, y_dev.lab, concatenation = False)
train5.to_json('trainSeq.json', orient = 'records', lines = True)
dev5.to_json('devSeq.json', orient = 'records', lines = True)

## sequence by date
train5 = transformDF(df, y_train.lab, concatenation = False)
dev5 = transformDF(dev, y_dev.lab, concatenation = False)
train5.to_json('trainSeq.json', orient = 'records', lines = True)
dev5.to_json('devSeq.json', orient = 'records', lines = True)

## sequence by tree
train5 = transformDF(df, y_train.lab, concatenation = False)
dev5 = transformDF(dev, y_dev.lab, concatenation = False)
train5.to_json('trainSeq.json', orient = 'records', lines = True)
dev5.to_json('devSeq.json', orient = 'records', lines = True)

## BI LSTM RNN + simple concat + Additional features

In [14]:
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets

#TXT = data.Field(sequential=True,include_lengths=True, batch_first=True)#,dtype = torch.float)#tokenize = 'spacy',tokenizer_language = 'en_core_web_sm',include_lengths = True)
#SOURCE = data.Field()
#LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)#,dtype = torch.float)
TXT = data.Field(sequential=True, include_lengths=True)
LABEL = data.Field(sequential=False, use_vocab=False)
RATIO = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
INDEX = data.Field(sequential=False,use_vocab=False)
ISO = data.Field(sequential=False, use_vocab=False,dtype=torch.float)
LOW = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
DEEP = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
train_data, dev_data, test_data = data.TabularDataset.splits(
                            path ='./',
                            train = 'train.json',
                            validation = 'dev.json',
                            test = 'test.json',
                            format = 'json',
                            fields = {'text': ('t', TXT), 'label': ('l', LABEL), 'index': ('i', INDEX), 'ratio': ('r',RATIO),
                                      'isolation':('iso', ISO), 'low':('low', LOW), 'deep': ('deep', DEEP)}
)

In [15]:
TXT.build_vocab(train_data, vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_)

INDEX.build_vocab(train_data)
RATIO.build_vocab(train_data)
ISO.build_vocab(train_data)
LOW.build_vocab(train_data)
DEEP.build_vocab(train_data)
LABEL.build_vocab(train_data)

In [16]:
import torch

BATCH_SIZE = 16

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train_iterator, dev_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, dev_data,test_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.t),
    #sort = False,
    #sort_within_batch = False,
    device = device)

#### RNN model

In [17]:
import torch.nn as nn
import torch.nn.functional as F

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, mix_layer, extra_layer,pad_idx):   
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.dropout = nn.Dropout(dropout)
        #self.fc = nn.Linear((hidden_dim * 2), output_dim)
        self.fc = nn.Linear((hidden_dim * 2)+3, mix_layer)
        self.fc2 = nn.Linear(mix_layer, extra_layer)
        self.fc3 = nn.Linear(extra_layer, output_dim)
        
    def forward(self, text, text_lengths, extra):
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), enforce_sorted=False) 
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        #final = self.fc(hidden)#.squeeze(-1))
        #print(hidden.squeeze(1).shape, extra.unsqueeze(1).shape)
        extras = self.fc(torch.cat((hidden.squeeze(1),extra),dim=1))
        line = F.relu(self.fc2(extras))
        final = self.fc3(line)
            
        return final


model = RNN(vocab_size = len(TXT.vocab), 
            embedding_dim = 100, 
            hidden_dim = 256, 
            output_dim = 1,
            mix_layer = 256,
            extra_layer = 128, 
            n_layers = 2, 
            bidirectional = True, 
            dropout = 0.5, 
            pad_idx = TXT.vocab.stoi[TXT.pad_token])

In [18]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [19]:
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

def f1_scoring(preds, y):
    from sklearn.metrics import f1_score
    import numpy as np
    rounded_preds = torch.round(torch.sigmoid(preds))
    score = f1_score(y.tolist(), rounded_preds.tolist())
    return torch.from_numpy(np.array(score))
    

def train(model, iterator, optimizer, criterion): 
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        text, text_lengths = batch.t
        extra_feat = torch.cat((batch.low.unsqueeze(1), batch.deep.unsqueeze(1), batch.iso.unsqueeze(1)),dim=1)
        predictions = model(text, text_lengths, extra_feat).squeeze(1)
        loss = criterion(predictions, batch.l.float())
        #acc = binary_accuracy(predictions, batch.l)
        acc = f1_scoring(predictions, batch.l)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0 
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.t
            extra_feat = torch.cat((batch.low.unsqueeze(1), batch.deep.unsqueeze(1), batch.iso.unsqueeze(1)),dim=1)
            predictions = model(text, text_lengths, extra_feat).squeeze(1)
            loss = criterion(predictions, batch.l.float())
            #acc = binary_accuracy(predictions, batch.l)
            acc = f1_scoring(predictions, batch.l)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [20]:
"""
model.eval()
with torch.no_grad():
    for b in train_iterator:
        #r = b.r
        #tag= b.tag
        #link = b.link
        #fav = b.fav
        txt, len_txt = b.t
        #ex = torch.cat((b.r, b.tag, b.link),-1)
        #text, text_lengths = b.t
        print(len_txt < 0)
        #predictions = model(text, text_lengths, b.r)
        #print(fav)
"""

'\nmodel.eval()\nwith torch.no_grad():\n    for b in train_iterator:\n        #r = b.r\n        #tag= b.tag\n        #link = b.link\n        #fav = b.fav\n        txt, len_txt = b.t\n        #ex = torch.cat((b.r, b.tag, b.link),-1)\n        #text, text_lengths = b.t\n        print(len_txt < 0)\n        #predictions = model(text, text_lengths, b.r)\n        #print(fav)\n'

In [24]:
N_EPOCHS = 10

best_test_f1 = float('-inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_f1 = train(model, train_iterator, optimizer, criterion)
    test_loss, test_f1 = evaluate(model, dev_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if test_f1 > best_test_f1:
        best_test_f1 = test_f1
        torch.save(model.state_dict(), 'model_LSTM_BI2_pool.pt')
    
    print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_f1*100:.2f}%')
    print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_f1*100:.2f}%')

Epoch:  1 | Epoch Time: 0m 15s
	Train Loss: 0.062 | Train Acc: 95.52%
	 Val. Loss: 0.626 |  Val. Acc: 75.77%
Epoch:  2 | Epoch Time: 0m 15s
	Train Loss: 0.055 | Train Acc: 97.01%
	 Val. Loss: 0.664 |  Val. Acc: 79.07%
Epoch:  3 | Epoch Time: 0m 16s
	Train Loss: 0.052 | Train Acc: 96.40%
	 Val. Loss: 0.631 |  Val. Acc: 77.74%
Epoch:  4 | Epoch Time: 0m 15s
	Train Loss: 0.044 | Train Acc: 97.69%
	 Val. Loss: 1.118 |  Val. Acc: 71.09%
Epoch:  5 | Epoch Time: 0m 15s
	Train Loss: 0.041 | Train Acc: 97.08%
	 Val. Loss: 0.980 |  Val. Acc: 72.77%
Epoch:  6 | Epoch Time: 0m 16s
	Train Loss: 0.030 | Train Acc: 98.50%
	 Val. Loss: 0.927 |  Val. Acc: 76.87%
Epoch:  7 | Epoch Time: 0m 16s
	Train Loss: 0.038 | Train Acc: 97.70%
	 Val. Loss: 0.708 |  Val. Acc: 76.77%
Epoch:  8 | Epoch Time: 0m 15s
	Train Loss: 0.045 | Train Acc: 97.13%
	 Val. Loss: 0.856 |  Val. Acc: 78.19%
Epoch:  9 | Epoch Time: 0m 15s
	Train Loss: 0.032 | Train Acc: 98.10%
	 Val. Loss: 0.778 |  Val. Acc: 75.27%
Epoch: 10 | Epoch T

## BERT text representation

In [24]:
transformDF?

[1;31mSignature:[0m
[0mtransformDF[0m[1;33m([0m[1;33m
[0m    [0mdf[0m[1;33m,[0m[1;33m
[0m    [0mlabel[0m[1;33m,[0m[1;33m
[0m    [0mtokenize[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mconcatenation[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mconcat_type[0m[1;33m=[0m[1;34m'simple'[0m[1;33m,[0m[1;33m
[0m    [0mn_tweets[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0morder_by_deep[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Transform and extract tweets data to a pandas dataframe, ready to export to json.
Takes the original Dataframe and the list of labels as arguments.

Arguments: df, label, tokenize=True, concat_type = 'simple', n_tweets = -1, order_by_deep = False
[1;31mFile:[0m      c:\users\framo\google drive\0 mit\0sem 1 2021\nlp\project\<ipython-input-16-8a5e97cd9cf5>
[1;31mType:[0m      function


In [10]:
train1 = transformDF(df, y_train.lab, tokenize=False, concatenation=False)
devel = transformDF(dev, y_dev.lab,tokenize= False, concatenation=False)
test1 = transformDF(test, test.index, tokenize=False, concatenation=False)

train1.index = [i for i in range(train1.shape[0])]
devel.index = [i for i in range(devel.shape[0])]
test1.index = [i for i in range(test1.shape[0])]

train1.to_json('trainC.json', orient = 'records', lines = True)
devel.to_json('devC.json', orient = 'records', lines = True)
test1.to_json('testC.json', orient = 'records', lines = True)

In [33]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer

class SSTDataset(Dataset):

    def __init__(self, df, maxlen=256):

        #Store the contents of the file in a pandas dataframe
        self.df = df

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):

        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'text']
        label = self.df.loc[index, 'label']
        indice = self.df.loc[index, 'index']

        #Preprocessing the text to be suitable for BERT
        tokens = self.tokenizer.tokenize(sentence) #Tokenize the sentence
        tokens = ['[CLS]'] + tokens + ['[SEP]'] #Insering the CLS and SEP token in the beginning and end of the sentence
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))] #Padding sentences
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Prunning the list to be of specified max length

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens_ids) #Converting the list to a pytorch tensor

        #Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask, label, indice

In [34]:
from torch.utils.data import DataLoader

#Creating instances of training and development set
#maxlen sets the maximum length a sentence can have
#any sentence longer than this length is truncated to the maxlen size
train_set = SSTDataset(train1)
dev_set = SSTDataset(devel)
test_set = SSTDataset(test1)
#Creating intsances of training and development dataloaders
train_loader = DataLoader(train_set, batch_size = 10)
dev_loader = DataLoader(dev_set, batch_size = 10)
test_loader = DataLoader(test_set, batch_size = 10)

print("Done preprocessing training and development data.")

Done preprocessing training and development data.


### model 1

In [None]:
## Bert tutorial model

In [None]:
from transformers import BertModel

class SentimentClassifier(nn.Module):

    def __init__(self):
        super(SentimentClassifier, self).__init__()
        #Instantiating BERT model object 
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        
        #Classification layer
        #input dimension is 768 because [CLS] embedding has a dimension of 768
        #output dimension is 1 because we're working with a binary classification problem
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        #Feeding the input to BERT model to obtain contextualized representations
        outputs = self.bert_layer(seq, attention_mask = attn_masks)
        cont_reps = outputs.last_hidden_state

        #Obtaining the representation of [CLS] head (the first token)
        cls_rep = cont_reps[:, 0]

        #Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits

#### model 2

In [17]:
import torch.nn as nn

class BERTGRU(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):
        
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        embedding_dim = bert.config.to_dict()['hidden_size']
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        
        self.out = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, attn_masks):
        
        with torch.no_grad():
            embedded = self.bert(text, attn_masks)[0]
        _, hidden = self.rnn(embedded)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        output = self.out(hidden)
        
        return output

In [19]:
from transformers import BertModel

bert = BertModel.from_pretrained('bert-base-uncased')

In [20]:
model = BERTGRU(bert,
             hidden_dim = 256,
             output_dim = 1,
             n_layers = 2,
             bidirectional= True,
             dropout = 0.25)


In [24]:
import torch.optim as optim
device = 'cuda:0'

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

def train(model, iterator, optimizer, criterion): 
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        seq, attn_masks, labels = batch
        labels = labels.to(device)
        predictions = model(seq.to(device), attn_masks.to(device)).squeeze(1)
        loss = criterion(predictions, labels.float())
        #acc = binary_accuracy(predictions, batch.l)
        acc = f1_scoring(predictions, labels)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0 
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            seq, attn_masks, labels = batch
            labels = labels.to(device)
            predictions = model(seq.to(device), attn_masks.to(device)).squeeze(1)
            loss = criterion(predictions, labels.float())
            #acc = binary_accuracy(predictions, batch.l)
            acc = f1_scoring(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [27]:
N_EPOCHS = 5

best_test_f1 = float('-inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_f1 = train(model, train_loader, optimizer, criterion)
    test_loss, test_f1 = evaluate(model, dev_loader, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if test_f1 > best_test_f1:
        best_test_f1 = test_f1
        torch.save(model.state_dict(), 'model_BERTGRU.pt')
    
    print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_f1*100:.2f}%')
    print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_f1*100:.2f}%')

Epoch:  1 | Epoch Time: 5m 37s
	Train Loss: 0.075 | Train Acc: 93.64%
	 Val. Loss: 0.474 |  Val. Acc: 79.64%
Epoch:  2 | Epoch Time: 5m 36s
	Train Loss: 0.059 | Train Acc: 94.77%
	 Val. Loss: 0.583 |  Val. Acc: 79.57%
Epoch:  3 | Epoch Time: 5m 37s
	Train Loss: 0.050 | Train Acc: 95.03%
	 Val. Loss: 0.620 |  Val. Acc: 76.77%
Epoch:  4 | Epoch Time: 5m 36s
	Train Loss: 0.046 | Train Acc: 95.14%
	 Val. Loss: 0.572 |  Val. Acc: 74.93%
Epoch:  5 | Epoch Time: 5m 37s
	Train Loss: 0.046 | Train Acc: 95.17%
	 Val. Loss: 0.690 |  Val. Acc: 78.05%


## OUTPUT

In [551]:
def predict(model, iterator):
    model.eval()
    predict = []
    index = []
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.t
            extra_feat = torch.cat((batch.low.unsqueeze(1), batch.deep.unsqueeze(1), batch.iso.unsqueeze(1)),dim=1)
            preds = model(text, text_lengths, extra_feat).squeeze(1)
            rounded_preds = torch.round(torch.sigmoid(preds))
            predict = predict + rounded_preds.tolist()
            index = index + batch.i.tolist()
    
    result = {x:y for x,y in zip(index, predict)}
    return result   

In [28]:
#model.load_state_dict(torch.load('model_LSTM_BI2.pt'))

#dict_p = predict(model, test_iterator)

<All keys matched successfully>

In [23]:
dict_p = predict(model, test_iterator)
out = {k:'rumour' if v>0 else 'non-rumour' for k,v in dict_p.items()}
import json
with open('test-output.json', 'w') as fp:
    json.dump(out, fp)

In [35]:
import json
out = {k:'non-rumour' for k in devel['index']}
with open('dev.result.json', 'w') as fp:
    json.dump(out, fp)

In [552]:
import json
dict_p1 = predict(model, dev_iterator)
out1 = {k:'rumour' if v>0 else 'non-rumour' for k,v in dict_p1.items()}
with open('dev.result.json', 'w') as fp:
    json.dump(out1, fp)

In [37]:
with torch.no_grad():
    for batch in dev_loader:
        seq, attn_masks, labels, indice = batch
print(indice)

('544458445764968448', '580338194731962368', '552825844755083265', '552807904597008385', '552804592988479488', '525025279803424768', '552784600502915072', '499696525808001024', '580320612155060224', '553218279557582849')


In [39]:
def predict(model, iterator):
    model.eval()
    predict = []
    index = []
    with torch.no_grad():
        for batch in iterator:
            seq, attn_masks, labels, indice = batch
            labels = labels.to(device)
            preds = model(seq.to(device), attn_masks.to(device)).squeeze(1)
            rounded_preds = torch.round(torch.sigmoid(preds))
            predict = predict + rounded_preds.tolist()
            index = index + list(indice)
    
    result = {x:y for x,y in zip(index, predict)}
    return result

model.load_state_dict(torch.load('model_BERTGRU.pt'))
dict_p1 = predict(model, dev_loader)
out1 = {k:'rumour' if v>0 else 'non-rumour' for k,v in dict_p1.items()}
with open('dev.result.json', 'w') as fp:
    json.dump(out1, fp)