# Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file shipped with the Kaggle input datasets.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import string
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
import itertools
import re

from sklearn.model_selection import train_test_split

from transformers import BertModel, BertTokenizer
import torch
import numpy as np
from tqdm import tqdm

from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, classification_report
def make_report(y_pred, y_true):
    """Print a classification report and draw the confusion matrix.

    Note the argument order: predictions come first, ground truth second.
    Both are forwarded to scikit-learn as ``(y_true, y_pred)``.

    Args:
        y_pred: predicted labels.
        y_true: reference labels.
    """
    print("")
    print("Classification Report: ")
    print(classification_report(y_true, y_pred))
    matrix = confusion_matrix(y_true, y_pred)
    plot_confusion_matrix(conf_mat=matrix)
    plt.show()

# Loading Data

In [None]:
# `data` is the labelled competition file; `validation` is the unlabelled
# file that predictions are generated for at the end of the notebook.
data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv', index_col='id')
validation = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv', index_col='id')
for frame in (data, validation):
    print(frame.shape)
data.head()

# EDA

In [None]:
# Class balance of the target column.
data["target"].hist()

In [None]:
# Share of rows that have a location, a keyword, and both.
n = data.shape[0]
has_location = ~data.location.isnull()
has_keyword = ~data.keyword.isnull()
print('location ratio', data[has_location].shape[0] / n)
print('keyword ratio', data[has_keyword].shape[0] / n)
print('keyword and location ratio', data[has_keyword & has_location].shape[0] / n)

In [None]:
# Target distribution restricted to rows that have a location.
data.loc[data["location"].notnull(), "target"].hist()

In [None]:
# Target distribution restricted to rows that have a keyword.
data.loc[data["keyword"].notnull(), "target"].hist()

In [None]:
# Number of distinct keyword values (the missing value counts as one).
data["keyword"].unique().size

In [None]:
# Frequency of each keyword.
data["keyword"].hist()

In [None]:
# Number of distinct location values (the missing value counts as one).
data["location"].unique().size

# Preprocess Data

In [None]:
# Ordered (literal, replacement) rules used by `clean_abbreviation`.
# Matching is case-sensitive and the rules run top to bottom, so order matters
# where one literal is a prefix of another (e.g. "w/e" must precede "w/").
_ABBREVIATION_RULES = (
    # --- contractions ---
    ("he's", "he is"),
    ("there's", "there is"),
    ("We're", "We are"),
    ("That's", "That is"),
    ("won't", "will not"),
    ("they're", "they are"),
    ("Can't", "Cannot"),
    ("wasn't", "was not"),
    ("aren't", "are not"),
    ("isn't", "is not"),
    ("What's", "What is"),
    ("i'd", "I would"),
    ("should've", "should have"),
    ("where's", "where is"),
    ("we'd", "we would"),
    ("i'll", "I will"),
    ("weren't", "were not"),
    ("They're", "They are"),
    ("let's", "let us"),
    ("it's", "it is"),
    ("can't", "cannot"),
    ("don't", "do not"),
    ("you're", "you are"),
    ("i've", "I have"),
    ("that's", "that is"),
    ("doesn't", "does not"),
    ("didn't", "did not"),
    ("ain't", "am not"),
    ("you'll", "you will"),
    ("I've", "I have"),
    ("Don't", "do not"),
    ("I'll", "I will"),
    ("I'd", "I would"),
    ("Let's", "Let us"),
    ("you'd", "You would"),
    ("It's", "It is"),
    ("Ain't", "am not"),
    ("Haven't", "Have not"),
    ("Could've", "Could have"),
    ("youve", "you have"),
    ("haven't", "have not"),
    ("hasn't", "has not"),
    ("There's", "There is"),
    ("He's", "He is"),
    ("You're", "You are"),
    ("I'M", "I am"),
    ("shouldn't", "should not"),
    ("wouldn't", "would not"),
    ("i'm", "I am"),
    ("I'm", "I am"),
    ("Isn't", "is not"),
    ("Here's", "Here is"),
    ("you've", "you have"),
    ("we're", "we are"),
    ("what's", "what is"),
    ("couldn't", "could not"),
    ("we've", "we have"),
    ("who's", "who is"),
    ("y'all", "you all"),
    ("would've", "would have"),
    ("it'll", "it will"),
    ("we'll", "we will"),
    ("We've", "We have"),
    ("he'll", "he will"),
    ("Y'all", "You all"),
    ("Weren't", "Were not"),
    ("Didn't", "Did not"),
    ("they'll", "they will"),
    ("they'd", "they would"),
    ("DON'T", "DO NOT"),
    ("they've", "they have"),

    # --- weather tags, acronyms and slang ---
    ("tnwx", "Tennessee Weather"),
    ("azwx", "Arizona Weather"),
    ("alwx", "Alabama Weather"),
    ("wordpressdotcom", "wordpress"),
    ("gawx", "Georgia Weather"),
    ("scwx", "South Carolina Weather"),
    ("cawx", "California Weather"),
    ("usNWSgov", "United States National Weather Service"),
    ("MH370", "Malaysia Airlines Flight 370"),
    ("okwx", "Oklahoma City Weather"),
    ("arwx", "Arkansas Weather"),
    ("lmao", "laughing my ass off"),
    ("amirite", "am I right"),

    # --- shorthand and typos ---
    ("w/e", "whatever"),
    ("w/", "with"),
    ("USAgov", "USA government"),
    ("recentlu", "recently"),
    ("Ph0tos", "Photos"),
    ("exp0sed", "exposed"),
    ("<3", "love"),
    ("amageddon", "armageddon"),
    ("Trfc", "Traffic"),
    ("WindStorm", "Wind Storm"),
    ("16yr", "16 year"),
    ("TRAUMATISED", "traumatized"),

    # --- hashtags and user handles ---
    ("IranDeal", "Iran Deal"),
    ("ArianaGrande", "Ariana Grande"),
    ("camilacabello97", "camila cabello"),
    ("RondaRousey", "Ronda Rousey"),
    ("MTVHottest", "MTV Hottest"),
    ("TrapMusic", "Trap Music"),
    ("ProphetMuhammad", "Prophet Muhammad"),
    ("PantherAttack", "Panther Attack"),
    ("StrategicPatience", "Strategic Patience"),
    ("socialnews", "social news"),
    ("IDPs:", "Internally Displaced People :"),
    ("ArtistsUnited", "Artists United"),
    ("ClaytonBryant", "Clayton Bryant"),
    ("jimmyfallon", "jimmy fallon"),
    ("justinbieber", "justin bieber"),
    ("Time2015", "Time 2015"),
    ("djicemoon", "dj icemoon"),
    ("LivingSafely", "Living Safely"),
    ("FIFA16", "Fifa 2016"),
    ("thisiswhywecanthavenicethings", "this is why we cannot have nice things"),
    ("bbcnews", "bbc news"),
    ("UndergroundRailraod", "Underground Railraod"),
    ("c4news", "c4 news"),
    ("MUDSLIDE", "mudslide"),
    ("NoSurrender", "No Surrender"),
    ("NotExplained", "Not Explained"),
    ("greatbritishbakeoff", "great british bake off"),
    ("LondonFire", "London Fire"),
    ("KOTAWeather", "KOTA Weather"),
    ("LuchaUnderground", "Lucha Underground"),
    ("KOIN6News", "KOIN 6 News"),
    ("LiveOnK2", "Live On K2"),
    ("9NewsGoldCoast", "9 News Gold Coast"),
    ("nikeplus", "nike plus"),
    ("david_cameron", "David Cameron"),
    ("peterjukes", "Peter Jukes"),
    ("MikeParrActor", "Michael Parr"),
    ("4PlayThursdays", "Foreplay Thursdays"),
    ("TGF2015", "Tontitown Grape Festival"),
    ("realmandyrain", "Mandy Rain"),
    ("GraysonDolan", "Grayson Dolan"),
    ("ApolloBrown", "Apollo Brown"),
    ("saddlebrooke", "Saddlebrooke"),
    ("TontitownGrape", "Tontitown Grape"),
    ("AbbsWinston", "Abbs Winston"),
    ("ShaunKing", "Shaun King"),
    ("MeekMill", "Meek Mill"),
    ("TornadoGiveaway", "Tornado Giveaway"),
    ("GRupdates", "GR updates"),
    ("SouthDowns", "South Downs"),
    ("braininjury", "brain injury"),
    ("auspol", "Australian politics"),
    ("PlannedParenthood", "Planned Parenthood"),
    ("calgaryweather", "Calgary Weather"),
    ("weallheartonedirection", "we all heart one direction"),
    ("edsheeran", "Ed Sheeran"),
    ("TrueHeroes", "True Heroes"),
    ("ComplexMag", "Complex Magazine"),
    ("TheAdvocateMag", "The Advocate Magazine"),
    ("CityofCalgary", "City of Calgary"),
    ("EbolaOutbreak", "Ebola Outbreak"),
    ("SummerFate", "Summer Fate"),
    ("RAmag", "Royal Academy Magazine"),
    ("offers2go", "offers to go"),
    ("ModiMinistry", "Modi Ministry"),
    ("TAXIWAYS", "taxi ways"),
    ("Calum5SOS", "Calum Hood"),
    ("JamesMelville", "James Melville"),
    ("JamaicaObserver", "Jamaica Observer"),
    # Restored: this rule had been mangled by a "tweet" -> " text" rename.
    ("TweetLikeItsSeptember11th2001", "Tweet like it is september 11th 2001"),
    ("cbplawyers", "cbp lawyers"),
    # Restored: same rename damage ("fewmore texts").
    ("fewmoretweets", "few more tweets"),
    ("BlackLivesMatter", "Black Lives Matter"),
    ("NASAHurricane", "NASA Hurricane"),
    ("onlinecommunities", "online communities"),
    ("humanconsumption", "human consumption"),
    ("Typhoon-Devastated", "Typhoon Devastated"),
    ("Meat-Loving", "Meat Loving"),
    ("facialabuse", "facial abuse"),
    ("LakeCounty", "Lake County"),
    ("BeingAuthor", "Being Author"),
    ("withheavenly", "with heavenly"),
    ("thankU", "thank you"),
    ("iTunesMusic", "iTunes Music"),
    ("OffensiveContent", "Offensive Content"),
    ("WorstSummerJob", "Worst Summer Job"),
    ("HarryBeCareful", "Harry Be Careful"),
    ("NASASolarSystem", "NASA Solar System"),
    ("animalrescue", "animal rescue"),
    ("KurtSchlichter", "Kurt Schlichter"),
    ("Throwingknifes", "Throwing knives"),
    ("GodsLove", "God's Love"),
    ("bookboost", "book boost"),
    ("ibooklove", "I book love"),
    ("NestleIndia", "Nestle India"),
    ("realDonaldTrump", "Donald Trump"),
    ("DavidVonderhaar", "David Vonderhaar"),
    ("CecilTheLion", "Cecil The Lion"),
    ("weathernetwork", "weather network"),
    ("GOPDebate", "GOP Debate"),
    ("RickPerry", "Rick Perry"),
    ("frontpage", "front page"),
    # Restored: same rename damage ("NewsIn texts").
    ("NewsInTweets", "News In Tweets"),
    ("ViralSpell", "Viral Spell"),
    ("til_now", "until now"),
    ("volcanoinRussia", "volcano in Russia"),
    ("ZippedNews", "Zipped News"),
    ("MicheleBachman", "Michele Bachman"),
    ("53inch", "53 inch"),
    ("KerrickTrial", "Kerrick Trial"),
    ("abstorm", "Alberta Storm"),
    ("Beyhive", "Beyonce hive"),
    ("RockyFire", "Rocky Fire"),
    ("Listen/Buy", "Listen / Buy"),
    ("ENGvAUS", "England vs Australia"),
    ("ScottWalker", "Scott Walker"),
)


def clean_abbreviation(text):
    """Expand contractions, slang, typos and glued-together hashtags.

    Every rule in ``_ABBREVIATION_RULES`` is applied in order as a
    case-sensitive literal substring replacement. None of the patterns
    contains a regex metacharacter, so ``str.replace`` gives the same result
    as the ``re.sub`` calls it replaces.

    Args:
        text: raw tweet text.

    Returns:
        The text with every matching literal replaced by its expansion.
    """
    for literal, expansion in _ABBREVIATION_RULES:
        text = text.replace(literal, expansion)
    return text
    
# Filled on first use so the NLTK corpus is read once, not once per token.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop-word set, loading it on the first call only."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def lower_stopwords_punctuation(text):
    """Lower-case the text, drop English stop words, then strip punctuation.

    Stop words are removed before punctuation is stripped, so tokens are
    compared against the stop-word list exactly as `word_tokenize` emits them.

    Args:
        text: input string.

    Returns:
        The cleaned string with runs of whitespace collapsed to single spaces.
    """
    stop_words = _english_stopwords()
    kept = [w for w in word_tokenize(text.lower()) if w not in stop_words]
    text = ' '.join(kept)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(text.split())

def simple_cleaning(text):
    """Squash repeated characters, tag numbers and blank out non-letters.

    Steps, in order:
      1. Every run of identical consecutive characters is reduced to one
         character (so "hello" becomes "helo" and "100" becomes "10").
      2. Each remaining run of digits is replaced by ' isnumber '.
      3. Every character outside a-z / A-Z is replaced by a space.

    Args:
        text: input string.

    Returns:
        The transformed string; whitespace is not collapsed.
    """
    squashed = []
    for ch in text:
        # Keep a character only when it differs from the one just kept.
        if not squashed or squashed[-1] != ch:
            squashed.append(ch)
    without_numbers = re.sub('[0-9]+', ' isnumber ', ''.join(squashed))
    return re.sub('[^a-zA-Z]', ' ', without_numbers)

def get_wordnet_pos(treebank_tag):
    """Lemmatize one (word, Treebank POS tag) pair.

    Despite its name, this function returns the lemmatized word, not a
    WordNet POS constant. The first letter of the tag picks the WordNet part
    of speech: J -> adjective, V -> verb, R -> adverb, anything else -> noun.

    Args:
        treebank_tag: indexable pair whose item 0 is the word and item 1 is
            its Treebank tag.

    Returns:
        The lemma produced by `WordNetLemmatizer` for the word.
    """
    word = treebank_tag[0]
    tag = treebank_tag[1]
    if tag.startswith('J'):
        pos = wordnet.ADJ
    elif tag.startswith('V'):
        pos = wordnet.VERB
    elif tag.startswith('R'):
        pos = wordnet.ADV
    else:
        # Covers the explicit 'N' tags and every tag with no mapping.
        pos = wordnet.NOUN
    return WordNetLemmatizer().lemmatize(word, pos)

def lemma(text):
    """Tokenize, POS-tag and lemmatize a string.

    Args:
        text: input string.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    tokens = nltk.tokenize.TreebankWordTokenizer().tokenize(text)
    tagged = nltk.pos_tag(tokens)
    return ' '.join(get_wordnet_pos(pair) for pair in tagged)

def cleaning_pipeline(text):
    """Run the full text-cleaning chain on one string.

    The stages run in this fixed order: abbreviation expansion, lower-casing
    with stop-word and punctuation removal, character/number cleanup, then
    lemmatization.

    Args:
        text: raw input string.

    Returns:
        The cleaned string.
    """
    stages = (
        clean_abbreviation,
        lower_stopwords_punctuation,
        simple_cleaning,
        lemma,
    )
    for stage in stages:
        text = stage(text)
    return text

In [None]:
%%time
# Add a cleaned copy of the text to both frames; the raw `text` column is kept.
for frame in (data, validation):
    frame['text_clean'] = frame.text.apply(cleaning_pipeline)

In [None]:
# Preview the frame, now including the `text_clean` column.
data.head(5)

In [None]:
# Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable.
features = data.drop(columns='target')
x_train, x_test, y_train, y_test = train_test_split(
    features, data.target, test_size=0.2, random_state=42
)
for split in (x_train, x_test):
    print(split.shape)

In [None]:
def padding_zero(vec, max_length):
    """Force a 2-D array to exactly `max_length` rows.

    Shorter inputs get zero rows appended at the end; longer inputs are cut
    after `max_length` rows; inputs of the right length are returned as-is.
    The second axis is never changed.

    Args:
        vec: 2-D array-like (the pad widths are given for two axes).
        max_length: required number of rows.

    Returns:
        An array with `max_length` rows.
    """
    n_rows = len(vec)
    if n_rows > max_length:
        return vec[:max_length]
    if n_rows < max_length:
        missing = max_length - n_rows
        return np.pad(vec, ((0, missing), (0, 0)), 'constant')
    return vec

def embed_data(data):
    """Embed every cleaned text with pretrained BERT.

    Each text is encoded on its own (batch size 1), passed through
    `bert-base-uncased`, and the per-token hidden states are padded or
    truncated to 64 rows with `padding_zero`. Encoding allows up to 100
    tokens, but only the first 64 token vectors are kept.

    The forward pass runs under `torch.no_grad()`: the embeddings are used as
    fixed features, so building an autograd graph per text only costs memory
    and time.

    Args:
        data: DataFrame with a "text_clean" column.

    Returns:
        A numpy array stacking one 64-row embedding matrix per input row.
    """
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()  # make inference mode explicit (dropout off)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    text_data = list(data["text_clean"].values)
    embedded = []

    with torch.no_grad():
        for sentence in tqdm(text_data):
            token_ids = tokenizer.encode(sentence, add_special_tokens=True, max_length=100)
            input_ids = torch.tensor(token_ids).unsqueeze(0)  # Batch size 1
            outputs = model(input_ids)
            last_hidden_states = outputs[0][0]
            embedded.append(padding_zero(last_hidden_states.numpy(), 64))
    return np.array(embedded)

# Embed the three splits in order: train, held-out test, competition file.
train_arr, test_arr, val_arr = (
    embed_data(frame) for frame in (x_train, x_test, validation)
)

In [None]:
# Mini-batch size shared by all three data loaders.
BATCH_SIZE = 128


class MyDataSet(torch.utils.data.Dataset):
    """Dataset pairing feature arrays with integer labels held in memory."""

    def __init__(self, x, y):
        """Copy `x` and `y` into tensors; labels are cast to int64."""
        self.x = torch.tensor(x)
        self.y = torch.tensor(y).to(torch.long)

    def __len__(self):
        """Number of samples, taken from the first axis of the features."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the `(features, label)` pair stored at `idx`."""
        sample = self.x[idx]
        label = self.y[idx]
        return (sample, label)
    
# The competition file has no labels, so zeros stand in as placeholders.
placeholder_labels = np.zeros(validation.shape[0])

train_dataset = MyDataSet(train_arr, y_train.values)
test_dataset = MyDataSet(test_arr, y_test.values)
validation_dataset = MyDataSet(val_arr, placeholder_labels)

# Only the training loader shuffles; the other two keep row order so that
# predictions line up with the source rows.
loader_options = dict(batch_size=BATCH_SIZE)
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_options)
test_dataloader = torch.utils.data.DataLoader(test_dataset, shuffle=False, **loader_options)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, shuffle=False, **loader_options)

# Modeling

In [None]:
class TransformerClassifier(torch.nn.Module):
    """Transformer encoder followed by max-pooling and a linear 2-class head.

    The model expects inputs shaped (batch, 64, 768): 64 token positions of
    768-dimensional embeddings, as produced by `embed_data`.
    """

    def __init__(self):
        super().__init__()
        # d_model=768, 32 attention heads, 256-unit feed-forward, 4 layers.
        encoder_layer = nn.TransformerEncoderLayer(768, 32, dim_feedforward=256)
        self.encoder = nn.TransformerEncoder(encoder_layer, 4)
        # The 3x3 max-pool shrinks (64, 768) to (21, 256); flattened that is
        # 21 * 256 = 5376 features per sample.
        self.flatten = torch.nn.Sequential(nn.MaxPool2d((3, 3)), torch.nn.Flatten())
        self.linear = nn.Linear(5376, 2)

    def forward(self, x):
        """Return class logits of shape (batch, 2) for x of (batch, 64, 768)."""
        # The encoder layer uses the default sequence-first layout
        # (seq, batch, dim). Feeding it (batch, seq, dim) directly makes
        # self-attention mix different samples of the batch instead of the
        # tokens of one sample, so swap the axes around the encoder.
        x = self.encoder(x.transpose(0, 1)).transpose(0, 1)
        # MaxPool2d treats the 3-D tensor as (channels, H, W); pooling is
        # per channel, so each sample is pooled independently.
        x = self.flatten(x)
        return self.linear(x)


model = TransformerClassifier()
print(model)

# Train

In [None]:
%%time
N_EPOCHS = 10
device = 'cpu'
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

print('started!')
for epoch in range(N_EPOCHS):
    # ---- training pass over the shuffled training loader ----
    model.train()
    train_batch_loss = 0
    for step, batch in enumerate(train_dataloader):
        if step % 10 == 1:
            print(step)  # coarse progress indicator
        x, y = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        train_batch_loss += loss.item()

    # ---- evaluation pass over the held-out split (no gradient tracking) ----
    model.eval()
    test_batch_loss = 0
    with torch.no_grad():
        for step, batch in enumerate(test_dataloader):
            x, y = (tensor.to(device) for tensor in batch)
            outputs = model(x)
            loss = criterion(outputs, y)
            test_batch_loss += loss.item()

    # Report the mean per-batch loss of both passes.
    mean_train_loss = train_batch_loss / len(train_dataloader)
    mean_test_loss = test_batch_loss / len(test_dataloader)
    print('{}/{} train loss: {} test loss: {}'.format(epoch+1, N_EPOCHS,
                                                            mean_train_loss,
                                                            mean_test_loss))

torch.save(model.state_dict(), './new.model')

In [None]:
# Collect predicted and true labels for the held-out split.
pred = []
true = []
# Set inference mode here rather than relying on the training cell having
# left the model in eval mode (a freshly built or re-loaded model starts in
# training mode, with dropout active).
model.eval()
with torch.no_grad():
    for step, batch in enumerate(test_dataloader):
        x = batch[0].to(device)
        y = batch[1].to(device)
        outputs = model(x)
        pred.extend(torch.argmax(outputs, axis = 1).detach().numpy().tolist())
        true.extend(y.detach().numpy().tolist())

In [None]:
# Keyword arguments on purpose: make_report takes predictions first.
make_report(y_true=true, y_pred=pred)

In [None]:
# Predict labels for the unlabelled competition file, in row order.
pred = []
# Set inference mode here rather than relying on an earlier cell.
model.eval()
with torch.no_grad():
    for step, batch in enumerate(validation_dataloader):
        # batch[1] holds placeholder zeros for this split, so it is ignored.
        x = batch[0].to(device)
        outputs = model(x)
        pred.extend(torch.argmax(outputs, axis = 1).detach().numpy().tolist())

In [None]:
# One row per competition id with its predicted label.
submission = pd.DataFrame({'id': validation.index, 'target': pred})
submission.to_csv('./submition1.csv', index=False)