## Importing Libraries

In [1]:
import torch
from transformers import *
from fastai.text.all import *

from blurr.data.all import *
from blurr.modeling.all import *
import unidecode
import wordninja

[nltk_data] Downloading package wordnet to /home/aaagraw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Importing Data

In [2]:
## Folder holding the competition CSVs (relative to this notebook)
path = Path("../data/identify_the_sentiments/")

## Read each split and expose the raw 'tweet' column under the name 'text',
## which is the column the cleaning step and the DataBlock read from.
train_df = pd.read_csv(path/'train_2kmZucJ.csv').rename(columns={'tweet':'text'})
test_df = pd.read_csv(path/'test_oJQbWVk.csv').rename(columns={'tweet':'text'})

## Pre-Processing

In [3]:
def _split_compound_words(sentence):
    """Break run-together tokens apart, e.g. 'whatisthis' -> 'what is this'.

    Every whitespace-separated token is handed to ``wordninja.split`` and the
    pieces of length 1 are discarded. A token whose pieces are all discarded
    contributes an empty string, so the result can contain repeated spaces;
    the caller is responsible for collapsing them.
    """
    rebuilt = []
    for token in sentence.split():
        kept = [piece for piece in wordninja.split(token) if len(piece) > 1]
        rebuilt.append(' '.join(kept))
    return ' '.join(rebuilt)


def clean_tweet(text):
    """Normalise one raw tweet into lower-case, space-separated words.

    Steps, in order: lower-case, drop @handles, drop ``http...`` links and
    ``pic.<host>/...`` image links, transliterate to ASCII with ``unidecode``,
    blank out everything except letters, ``+`` and apostrophes, drop
    single-letter words, split run-together words with ``wordninja``, and
    finally collapse repeated whitespace.

    Parameters
    ----------
    text : str
        The raw tweet. Any non-string value (for example the NaN pandas uses
        for an empty CSV cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned tweet; ``''`` when nothing survives the cleaning.
    """
    # Missing values arrive as float NaN / None and have no .lower()
    if not isinstance(text, str):
        return ''

    # lower-case all characters
    text = text.lower()

    # remove twitter handles
    text = re.sub(r'@\S+', '', text)

    # remove urls
    text = re.sub(r'http\S+', '', text)
    # Image links look like 'pic.twitter.com/...'. The dot is escaped and the
    # match is anchored at a word boundary so ordinary words that merely
    # contain 'pic' ('picture', 'epic moment') are left alone.
    text = re.sub(r'\bpic\.\S+', '', text)

    # transliterate non-ASCII characters to their closest ASCII form
    text = unidecode.unidecode(text)

    # keep only letters, '+' and apostrophes; everything else becomes a space
    text = re.sub(r"[^a-zA-Z+']", ' ', text)

    # drop single-letter words (the trailing space lets a final one match)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text + ' ')

    # split words like 'whatisthis' to 'what is this'
    text = _split_compound_words(text)

    # collapse runs of whitespace, then trim both ends
    return re.sub(r"\s[\s]+", " ", text).strip()

In [4]:
## Run every tweet in both splits through clean_tweet, replacing the 'text' column
train_df['text'] = train_df['text'].apply(clean_tweet)
test_df['text'] = test_df['text'].apply(clean_tweet)

In [5]:
## First five training rows after cleaning
train_df.head(n=5)

Unnamed: 0,id,label,text
0,1,0,fingerprint pregnancy test android apps beautiful cute health iger iphone only iphones iphone
1,2,0,finally trans paran silicon case thanks to my uncle yay sony peri sony ex peri as
2,3,0,we love this would you go talk make memories unplug relax iphone smartphone wi fi connect
3,4,0,i'm wired know i'm george was made that way iphone cute daventry home
4,5,1,what amazing service apple won't even talk to me about question have unless pay them for their stupid support


In [6]:
## First five test rows after cleaning
test_df.head(n=5)

Unnamed: 0,id,text
0,7921,hate the new iphone upgrade won't let me download apps ugh apple sucks
1,7922,currently shitting my fucking pants apple imac cash money rad de st swags wags wag
2,7923,i'd like to puts some cd roms on my ipad is that possible yes but wouldn't that block the screen
3,7924,my ipod is officially dead lost all my and videos from the and sos concert and from vet camp hating life sobbing
4,7925,been fighting itunes all night only want the music paid for


## Training a Model

In [7]:
## Task type handed to Blurr: sequence classification
task = HF_TASKS_AUTO.SequenceClassification

## Name of the pretrained Hugging Face checkpoint to start from
pretrained_model_name = "bert-base-uncased"

## One Blurr call returns the architecture name, config, tokenizer and model
## for the checkpoint and task chosen above.
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(
    pretrained_model_name,
    task=task,
)

In [8]:
## Build fastai DataLoaders on top of Blurr's Hugging Face text block
blocks = (
    HF_TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model),  # input side: the tweet text
    CategoryBlock,                                              # target side: the class label
)
dblock = DataBlock(
    blocks=blocks,
    get_x=ColReader('text'),
    get_y=ColReader('label'),
    # 0.2 of the rows are held out for validation; the fixed seed keeps the split stable
    splitter=RandomSplitter(0.2, seed=42),
)
dls = dblock.dataloaders(train_df, bs=64)

In [9]:
## Sanity check: display two decoded samples from a batch
dls.show_batch(
    dataloaders=dls,
    max_n=2,
)

Unnamed: 0,text,category
0,orange mood today mood my mood orange bell ross bell and ross alligator skin alligator crocodile alligators trap samsung nike nike jordan air jordan ralph lauren polo polo ralph lauren london grad watch crocodiles trap london grad bell ross watches bell ross watch,0
1,not sure what he was after but he was digging hole dogs pet do glover love my dog adorable animal photo of the day insta good dogs insta puppy dogs of insta gram doggy animals pets agram black and white insta gram dogs pets of insta gram canine dog life iphone,0


In [10]:
## Wrap the Hugging Face model so it can be passed to a fastai Learner
model = HF_BaseModelWrapper(hf_model)

## Assemble the Learner: data, model, optimizer factory, loss, metric, Blurr callback
learn = Learner(
    dls,
    model,
    opt_func=partial(Adam, decouple_wd=True),
    loss_func=CrossEntropyLossFlat(),
    metrics=[accuracy],
    cbs=[HF_BaseModelCallback],
    splitter=hf_splitter,
)

## Create the optimizer up front, before the freeze() call in the next cell
learn.create_opt()

In [11]:
## Freeze the model so that only the last layer group is trained
learn.freeze()

## One-cycle schedule: 5 epochs, peak learning rate 1e-3
learn.fit_one_cycle(n_epoch=5, lr_max=1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,0.466401,0.302765,0.863636,00:20
1,0.294039,0.209141,0.912247,00:20
2,0.239846,0.211146,0.915404,00:20
3,0.225147,0.199774,0.914773,00:20
4,0.21756,0.199265,0.915404,00:20


## Making Predictions

In [12]:
## Test DataLoader built with the same pipeline as the training DataLoaders
test_dl = learn.dls.test_dl(test_df)

## reorder=False: predictions are not put back into test_df's row order, so the
## submission cell has to look ids up through test_dl.get_idxs().
preds, _ = learn.get_preds(
    dl=test_dl,
    reorder=False,
)

In [13]:
## Predictions were fetched with reorder=False, so take the ids in the order
## reported by test_dl.get_idxs(). Column 0 of test_df is `id`.
submission = pd.DataFrame({'id': test_df.iloc[test_dl.get_idxs(), 0].values})

## Hard label = index of the highest score per row. Convert the tensor to a
## NumPy array explicitly: how pandas turns a raw torch tensor into a column
## depends on the pandas version, and older versions can end up storing
## tensor objects that are written out as 'tensor(0)'.
submission['label'] = preds.argmax(dim=1).cpu().numpy()

In [14]:
## DataFrame.to_csv does not create missing folders, so make sure the target exists
Path("./submission").mkdir(parents=True, exist_ok=True)
submission.to_csv("./submission/submission_blurr_bert_uncased.csv", index=False)

In [15]:
## Get Probabilities
## Replace the hard labels with column 1 of `preds` (the class-1 score; this
## assumes get_preds returned per-class probabilities -- TODO confirm).
## The tensor is converted to NumPy explicitly so the column holds plain
## floats regardless of how the installed pandas handles torch tensors.
submission['label'] = preds[:, 1].cpu().numpy()

## DataFrame.to_csv does not create missing folders, so make sure the target exists
Path("./submission").mkdir(parents=True, exist_ok=True)
submission.to_csv("./submission/submission_blurr_bert_uncased_prob.csv", index=False)