# Natural Language Processing with Disaster Tweets
## Weitong Zhang, UID: 705302329, Email: <weightzero@g.ucla.edu>

In [1]:
import pandas as pd
import itertools
import warnings
import torch
import transformers
import numpy as np
import pickle
from clean import *
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.exceptions import ConvergenceWarning
from torch.utils.data import TensorDataset, DataLoader
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

## Data Preprocessing

Here we read the data and preprocess it. We use the preprocessing method provided by [DisasterTweets](https://www.kaggle.com/zinebkhanjari/disastertweets#Preprocessing) to clean up non-text content, emojis, and abbreviations.
The code copied from that notebook has been moved into `clean.py`.

In [2]:
# Load the training and test sets from the local ./data directory.
# Both frames are module-level globals that the later cells read and modify.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

def clean_tweet(text):
    """Run one raw tweet through the cleaning helpers and return the result.

    The helpers are expected to come from ``clean.py`` (star-imported at the
    top of the file); what each one does exactly is defined there, not here.
    The order below is significant and must be kept: three helpers see the
    text with its original casing, then the text is lower-cased, and the
    remaining helpers operate on the lower-cased text.

    Args:
        text: The raw tweet string.

    Returns:
        The cleaned tweet string.
    """
    # Stage 1 - strip non-text content while the original casing is intact.
    for step in (remove_URL, remove_HTML, remove_not_ASCII):
        text = step(text)

    # Stage 2 - normalise the case before any word-level replacement.
    text = text.lower()

    # Stage 3 - abbreviations/mentions/numbers, then emojis and smileys,
    # and finally repeated characters and punctuation.
    for step in (
        replace_abbrev,
        remove_mention,
        remove_number,
        remove_emoji,
        transcription_sad,
        transcription_smile,
        transcription_heart,
        remove_elongated_words,
        remove_repeat_punct,
    ):
        text = step(text)

    return text

# Store the cleaned tweet next to the raw text; the raw "text" column is kept.
train["clean_text"] = train["text"].apply(clean_tweet)
test["clean_text"] = test["text"].apply(clean_tweet)

# Notebook display: show the first rows to eyeball the effect of the cleaning.
train.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this #earthquake m...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask. canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,NUMBER people receive #wildfires evacuation or...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby #alaska as ...


## Training pipeline

Next we propose several vectorization methods and training models.
Before that, we introduce the training pipeline that integrates
all of the training methods used later. In general it consists of the following five stages:

vectorization --> train-test split --> training --> evaluation --> test

In [3]:
def run(Vectorizer, Trainer, test_size=0.3, random_state=None, output_dir='./output'):
    """Evaluate one vectorizer/learner pair and write its test-set predictions.

    The function works in two phases:

    1. Hold-out evaluation: the vectorized training data is split, the
       learner is fitted on the training part, and a classification report
       is printed for both the training part and the validation part.
    2. Submission: the learner is refitted on ALL training rows, the test
       set is predicted, and the ``id``/``target`` columns are written to
       ``<output_dir>/<Vectorizer>_<Trainer>.csv``.

    Side effects: prints to stdout, overwrites the ``target`` column of the
    module-level ``test`` frame, and writes a CSV file.

    Args:
        Vectorizer: Object exposing ``trainX`` and ``testX`` feature matrices
            (row-aligned with the module-level ``train`` / ``test`` frames)
            and a ``__str__`` used for the report header and file name.
        Trainer: Object exposing ``train(X, y)``, ``predict(X)`` and a
            ``__str__`` used for the report header and file name.
        test_size: Fraction of the training rows held out for validation.
        random_state: Seed forwarded to ``train_test_split``. ``None`` (the
            default) keeps the original behaviour of a different random
            split on every call; pass an int for reproducible reports.
        output_dir: Directory that receives the prediction CSV. It is created
            when missing, because ``DataFrame.to_csv`` does not create
            directories and would otherwise raise.
    """
    import os  # local import: only this function needs it

    y = train.target
    trainX, valX, trainY, valY = train_test_split(
        Vectorizer.trainX, y, test_size=test_size, random_state=random_state
    )

    # Phase 1: fit on the training part only and report on both parts.
    Trainer.train(trainX, trainY)
    predTrain = Trainer.predict(trainX)
    predVal = Trainer.predict(valX)
    print('\t{} + {}:'.format(str(Vectorizer), str(Trainer)))
    print('-' * 60)
    print('Training Dataset')
    print(classification_report(trainY, predTrain))
    print('-' * 60)
    print('Validation Dataset')
    print(classification_report(valY, predVal))
    print('=' * 60)

    # Phase 2: refit on every labelled row before predicting the test set,
    # so the submission model also uses the held-out validation rows.
    Trainer.train(Vectorizer.trainX, y)
    test['target'] = Trainer.predict(Vectorizer.testX)

    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(
        output_dir, '{}_{}.csv'.format(str(Vectorizer), str(Trainer))
    )
    test[['id', 'target']].to_csv(out_path, index=False)

### Vectorization

#### Count Vectorizer and TF-IDF Vectorizer (scikit-learn provided)

The first two vectorization methods are the Count Vectorizer and the TF-IDF Vectorizer
provided by scikit-learn. The Count Vectorizer simply counts the number of times
each word appears in the sentence. The TF-IDF Vectorizer uses a different count
metric that puts more weight on 'unusual' words. These two vectorizers are simple,
but they may ignore how the words combine with each other.

In [4]:
class sklearnVectorizer:
    """Wrap a scikit-learn style text vectorizer class for use with ``run``.

    On construction the vectorizer class is instantiated with its default
    arguments, fitted on the ``clean_text`` column of the module-level
    ``train`` frame, and then used to transform the ``clean_text`` column of
    the module-level ``test`` frame.

    Attributes set on the instance:
        obj: The vectorizer class that was passed in.
        vec: The fitted vectorizer instance.
        trainX: Features of the training tweets (output of ``fit_transform``).
        testX: Features of the test tweets (output of ``transform``).
    """

    def __init__(self, obj):
        """Instantiate ``obj`` and vectorize the train and test tweets."""
        fitted = obj()
        # Fit on the training tweets only; the test tweets are transformed
        # with the fitted vectorizer.
        train_features = fitted.fit_transform(train['clean_text'])
        test_features = fitted.transform(test['clean_text'])

        self.obj, self.vec = obj, fitted
        self.trainX, self.testX = train_features, test_features

    def __str__(self):
        """Return the wrapped class name (used in reports and file names)."""
        class_name = self.obj.__name__
        return str(class_name)
# Vectorizer classes (not instances) to evaluate; each one is meant to be
# passed to sklearnVectorizer, which instantiates and fits it.
skvector = [CountVectorizer, TfidfVectorizer]

#### BERT pretrained feature vector

In [5]:
class bertVectorizer:
    """Sentence features taken from a pretrained BERT model, cached on disk.

    With ``rebuild=False`` (default) the features are read from
    ``./data/<model_id>_feature.pkl`` and neither the tokenizer nor the model
    weights are loaded. With ``rebuild=True`` the tweets of the module-level
    ``train`` / ``test`` frames are passed through ``preprocess``, tokenized,
    run through the model, and the resulting features are written to that
    same cache file.

    Attributes set on the instance:
        model_id: Name of the pretrained checkpoint.
        trainX: Features of the training tweets.
        testX: Features of the test tweets.

    Args:
        rebuild: Recompute the features instead of reading the cache.
        model_id: Checkpoint name passed to ``from_pretrained``; also used
            to build the cache file name and as the ``__str__`` value.
        batch_size: Number of tweets per forward pass when rebuilding.
            Batching keeps the peak memory bounded; it does not change which
            features are produced.

    Raises:
        FileNotFoundError: ``rebuild`` is false and the cache file is missing.
        ValueError: ``batch_size`` is smaller than 1 while rebuilding.
    """

    def __init__(self, rebuild=False, model_id='bert-base-uncased', batch_size=64):
        self.model_id = model_id
        cache_path = './data/' + model_id + '_feature.pkl'

        if not rebuild:
            # Check the cache BEFORE touching transformers: loading the
            # tokenizer and the model weights is expensive and pointless
            # when the features are simply read back from disk.
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load cache files that this notebook produced itself.
            with open(cache_path, 'rb') as f:
                D = pickle.load(f)
            self.trainX = D['train']
            self.testX = D['test']
            return

        tokenizer = transformers.BertTokenizer.from_pretrained(model_id)
        model = transformers.BertModel.from_pretrained(model_id)
        model.eval()  # inference only: make sure dropout is disabled

        self.trainX = self._embed(tokenizer, model, train.text.apply(preprocess), batch_size)
        self.testX = self._embed(tokenizer, model, test.text.apply(preprocess), batch_size)
        with open(cache_path, 'wb') as f:
            pickle.dump({'train': self.trainX, 'test': self.testX}, f)

    @staticmethod
    def _embed(tokenizer, model, texts, batch_size):
        """Return one feature row per text as a NumPy array.

        All texts are tokenized together (so padding is identical to a
        single-pass run), then fed to the model ``batch_size`` rows at a
        time. For each row the vector at sequence position 0 of the model's
        first output is kept; with ``add_special_tokens=True`` that position
        holds the leading special token.
        """
        if batch_size < 1:
            raise ValueError('batch_size must be at least 1')

        T = tokenizer.batch_encode_plus(
            list(texts),
            max_length=64,
            add_special_tokens=True,
            return_attention_mask=True,
            padding=True,
            truncation=True,
        )
        input_id = torch.tensor(T['input_ids'], dtype=torch.long)
        mask = torch.tensor(T['attention_mask'], dtype=torch.float32)

        chunks = []
        with torch.no_grad():
            for start in range(0, len(input_id), batch_size):
                stop = start + batch_size
                output = model(input_id[start:stop], attention_mask=mask[start:stop])
                chunks.append(output[0][:, 0, :])
        return torch.cat(chunks).numpy()

    def __str__(self):
        """Return the checkpoint name (used in reports and file names)."""
        return self.model_id

### Training model

We use learners provided by scikit-learn: Logistic Regression, SVC, Decision Tree, Random Forest, and kNN. Fine-tuning the model is discussed in the report, but we skip it here due to its heavy computational cost.

In [6]:
class sklearnTrainer:
    """Adapter exposing an estimator class through ``train`` / ``predict``.

    The wrapped class is instantiated with no arguments on every call to
    ``train``, so each call starts from a fresh estimator with its default
    settings.

    Attributes:
        obj: The estimator class that was passed in.
        clf: Whatever ``obj().fit(X, y)`` returned; set by ``train``.
    """

    def __init__(self, obj):
        """Remember the estimator class; nothing is instantiated yet."""
        self.obj = obj

    def train(self, X, y):
        """Create a fresh estimator, fit it on ``X`` / ``y`` and keep it."""
        estimator = self.obj()
        self.clf = estimator.fit(X, y)

    def predict(self, X):
        """Return the stored estimator's predictions for ``X``.

        ``train`` must have been called first; otherwise ``clf`` is unset.
        """
        fitted = self.clf
        return fitted.predict(X)

    def __str__(self):
        """Return the wrapped class name (used in reports and file names)."""
        class_name = self.obj.__name__
        return str(class_name)
# Estimator classes (not instances) to evaluate; each one is meant to be
# passed to sklearnTrainer, which instantiates it with no arguments.
sklearner = [LogisticRegression, SVC, DecisionTreeClassifier, KNeighborsClassifier, RandomForestClassifier]

## Result

Here we first show the results obtained with the methods provided by scikit-learn.

In [7]:
# Run every vectorizer/learner combination and print its reports.
print('=' * 60)

# Each vectorizer is built ONCE and shared by all learners. `run` only reads
# its trainX / testX / str(), so reusing the instance gives the same features
# while avoiding a refit (or a reload from disk) for every single learner.
for v in skvector:
    vectorizer = sklearnVectorizer(v)
    for learner in sklearner:
        run(vectorizer, sklearnTrainer(learner))

for v in ['bert-base-uncased', 'bert-large-uncased']:
    vectorizer = bertVectorizer(model_id=v)
    for learner in sklearner:
        run(vectorizer, sklearnTrainer(learner))

	CountVectorizer + LogisticRegression:
------------------------------------------------------------
Training Dataset
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      3025
           1       0.98      0.92      0.95      2304

    accuracy                           0.96      5329
   macro avg       0.96      0.95      0.96      5329
weighted avg       0.96      0.96      0.96      5329

------------------------------------------------------------
Validation Dataset
              precision    recall  f1-score   support

           0       0.78      0.87      0.82      1317
           1       0.79      0.67      0.73       967

    accuracy                           0.79      2284
   macro avg       0.79      0.77      0.77      2284
weighted avg       0.79      0.79      0.78      2284

	CountVectorizer + SVC:
------------------------------------------------------------
Training Dataset
              precision    recall  f1-score   s