In [0]:
import nltk
from nltk.corpus import reuters
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
import json 
import pandas as pd 
from pandas.io.json import json_normalize
import types
import fastai
from fastai import *
from fastai.text import * 
import numpy as np
from functools import partial
import re, io, os, collections, html


# Fetch the NLTK resources used below: the Reuters corpus, the 'punkt'
# tokenizer data and the stop-word lists.
nltk.download('reuters')
nltk.download('punkt')  
nltk.download('stopwords')
# English stop words; read as a module-level global by aspect_senti_sets()
# when it filters headline tokens.
stop_words = stopwords.words("english")

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# Create train and test datasets for fine-tuning language model from NLTK Reuters corpus

def train_test_sets(documents):
    """Split Reuters file ids into labelled train / validation DataFrames.

    Args:
        documents: collection of Reuters file ids. Ids starting with "train"
            feed the first frame, ids starting with "test" feed the second;
            any other id is ignored.

    Returns:
        tuple: ``(df_train, df_val)``, each with the columns ``label`` and
        ``text``. ``text`` is the raw document and ``label`` is only the
        FIRST category listed for that document.
    """
    def frame_for(prefix):
        # Same selection / lookup pipeline for both splits.
        doc_ids = [doc for doc in documents if doc.startswith(prefix)]
        texts = [reuters.raw(doc_id) for doc_id in doc_ids]
        labels = [reuters.categories(doc_id)[0] for doc_id in doc_ids]
        return pd.DataFrame({'label': labels, 'text': texts})

    df_train = frame_for("train")
    df_val = frame_for("test")
    return df_train, df_val

In [0]:
# Build the Reuters train / validation frames (columns: label, text) from every
# file id in the corpus. get_finetuned_LM() reads these two names as globals.
df_train, df_val = train_test_sets(reuters.fileids())

In [0]:
# Build the language-model DataBunch from the NLTK Reuters frames; it is used later to fine-tune the language model pre-trained on Wikipedia text
def get_finetuned_LM():
    """Return the language-model DataBunch built from the Reuters frames.

    Despite its name, this function performs no training: it only wraps the
    module-level ``df_train`` / ``df_val`` DataFrames in a
    ``TextLMDataBunch``. Both globals must exist before it is called.

    Returns:
        The object produced by ``TextLMDataBunch.from_df``.
    """
    return TextLMDataBunch.from_df(path="", train_df=df_train, valid_df=df_val)

In [0]:
# Extract level 1 categorization from aspects
def extract_class(aspect_str):
    """Return the part of ``aspect_str`` that precedes the first '/'.

    When the string contains no '/', the whole string is returned.
    """
    top_level, _sep, _rest = aspect_str.partition('/')
    return str(top_level)

# Convert floating-point sentiment scores to sentiment categories (positive / negative / neutral)
def score_to_class(sent_score):
    """Map a sentiment score to 'positive', 'negative' or 'neutral'.

    The score is compared numerically. The previous implementation compared
    it lexicographically against the string '0', which labelled '0.0' as
    positive and '-0.0' as negative, and raised TypeError for real floats.

    Args:
        sent_score: the score as a number or as a numeric string
            (e.g. ``'0.167'``, ``'-0.374'``, ``0.0``).

    Returns:
        str: 'positive' for scores above zero, 'negative' for scores below
        zero, 'neutral' otherwise (exactly zero, or NaN).

    Raises:
        ValueError: if ``sent_score`` is a string that is not a number.
        TypeError: if ``sent_score`` cannot be converted to float (e.g. None).
    """
    score = float(sent_score)
    if score > 0:
        return 'positive'
    elif score < 0:
        return 'negative'
    else:
        # Zero and NaN both land here: neither comparison above is true.
        return "neutral"

In [0]:
# Create datasets for Aspect Based Sentiment Classification (ABSA) on FiQA corpus

def aspect_senti_sets(json_path='task1_headline_ABSA_train.json'):
    """Build the aspect and sentiment classification frames from the FiQA headline JSON.

    For every entry of the JSON object, only the FIRST element of its
    ``info`` list is used. Headlines are split on whitespace, tokens found in
    the module-level ``stop_words`` are dropped (case-sensitive match), and
    the remaining tokens are re-joined with single spaces.

    Args:
        json_path: path of the JSON file to read. Defaults to the file name
            this notebook has always used, resolved against the working
            directory.

    Returns:
        tuple: ``(df_fiqa_hdln_aspect, df_fiqa_hln_sent)``; both frames have
        the columns ``label`` and ``text`` and share the same row order.
        In the first, ``label`` is ``extract_class`` applied to the aspect
        string; in the second it is ``score_to_class`` applied to the
        sentiment score (missing scores are replaced by ``'0.0'`` first).

    Raises:
        KeyError / IndexError: if an entry lacks ``sentence``, a non-empty
            ``info`` list, or the ``sentiment_score`` / ``aspects`` fields.
    """
    with open(json_path, 'r', encoding='utf-8') as infile:
        dict_hdln = json.load(infile)

    # One record per headline; collected once instead of maintaining several
    # parallel lists that are concatenated column by column afterwards.
    records = []
    for doc_id, entry in dict_hdln.items():
        first_info = entry['info'][0]
        records.append({
            'ID': doc_id,
            'sentence': entry['sentence'],
            'info:sentiment_score': first_info['sentiment_score'],
            'info:aspects': first_info['aspects'],
        })
    df_hdln = pd.DataFrame(
        records,
        columns=['ID', 'sentence', 'info:sentiment_score', 'info:aspects'],
    )

    # The aspect field is handled as text: strip surrounding brackets, then
    # surrounding single quotes, before extracting the class.
    aspects = (
        df_hdln['info:aspects']
        .str.strip('[]')
        .str.strip('\'\'')
        .astype('str')
    )
    aspect_labels = aspects.apply(extract_class)

    # Whitespace tokenisation -> stop-word removal -> re-join.
    # A set makes each membership test O(1) instead of scanning the list.
    stop_set = set(stop_words)
    text = df_hdln['sentence'].map(
        lambda sentence: ' '.join(
            token for token in sentence.split() if token not in stop_set
        )
    )

    # Building the aspect dataset
    df_fiqa_hdln_aspect = pd.DataFrame({'label': aspect_labels, 'text': text})

    # Building the sentiment dataset
    scores = df_hdln['info:sentiment_score'].fillna('0.0')
    df_fiqa_hln_sent = pd.DataFrame(
        {'label': scores.apply(score_to_class), 'text': text}
    )

    df_fiqa_hdln_aspect = df_fiqa_hdln_aspect.reset_index(drop=True)
    df_fiqa_hln_sent = df_fiqa_hln_sent.reset_index(drop=True)

    return df_fiqa_hdln_aspect, df_fiqa_hln_sent

In [0]:
def get_textclassdatabunch():
    """Create the language-model DataBunch and the two FiQA classifier DataBunches.

    Returns:
        tuple: ``(data_lm, data_clas1, data_clas2)`` where ``data_lm`` comes
        from ``get_finetuned_LM()``, ``data_clas1`` holds the aspect data and
        ``data_clas2`` holds the sentiment data.
    """
    data_lm = get_finetuned_LM()
    aspect_df, senti_df = aspect_senti_sets()

    # Stratified hold-out splits with a fixed seed: 30% of the aspect rows and
    # 20% of the sentiment rows go to validation.
    aspect_trn, aspect_val = train_test_split(
        aspect_df, stratify=aspect_df['label'], test_size=0.3, random_state=12
    )
    senti_trn, senti_val = train_test_split(
        senti_df, stratify=senti_df['label'], test_size=0.2, random_state=12
    )

    # Both classifier bunches reuse the vocabulary taken from the
    # language-model bunch and the same batch size.
    shared_kwargs = dict(path="", vocab=data_lm.train_ds.vocab, bs=32)

    # Aspect classifier model data
    data_clas1 = TextClasDataBunch.from_df(
        train_df=aspect_trn, valid_df=aspect_val, **shared_kwargs
    )
    # Sentiment classifier model data
    data_clas2 = TextClasDataBunch.from_df(
        train_df=senti_trn, valid_df=senti_val, **shared_kwargs
    )

    return data_lm, data_clas1, data_clas2

In [0]:
def get_finetuned_aspect_classifier():
    """Fine-tune a language model, then train the aspect classifier on its encoder.

    Every call rebuilds all DataBunches via ``get_textclassdatabunch()`` and
    repeats the 4-epoch language-model fit before training the classifier.
    The encoder is exchanged through the saved name 'ft_enc'.

    Returns:
        The learner created by ``text_classifier_learner`` for the aspect
        data, after its three training stages.
    """
    data_lm, data_clas1, _unused_senti = get_textclassdatabunch()
    momentum = (0.8, 0.7)

    # Language-model stage: 4 epochs at learning rate 1e-2, then export the encoder.
    lm_learner = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.7)
    lm_learner.fit_one_cycle(4, 1e-2, moms=momentum)
    lm_learner.save_encoder('ft_enc')
    # Drop the reference so the language-model learner is not kept alive
    # while the classifier trains (the original rebound the same name).
    del lm_learner

    # Classifier stage: load the encoder, then one epoch per stage -- as
    # created, after freeze_to(-2), and after unfreeze().
    classifier = text_classifier_learner(data_clas1, arch=AWD_LSTM, drop_mult=0.7)
    classifier.load_encoder('ft_enc')
    classifier.fit_one_cycle(1, 1e-2, moms=momentum)

    classifier.freeze_to(-2)
    classifier.fit_one_cycle(1, slice(5e-3/2., 5e-3), moms=momentum)

    classifier.unfreeze()
    classifier.fit_one_cycle(1, slice(2e-3/100, 2e-3), moms=momentum)

    return classifier

In [0]:
def get_finetuned_senti_classifier():
    """Fine-tune a language model, then train the sentiment classifier on its encoder.

    Every call rebuilds all DataBunches via ``get_textclassdatabunch()`` and
    repeats the 4-epoch language-model fit before training the classifier.
    The encoder is exchanged through the saved name 'ft_enc'.

    Returns:
        The learner created by ``text_classifier_learner`` for the sentiment
        data, after its three training stages.
    """
    data_lm, _unused_aspect, data_clas2 = get_textclassdatabunch()
    momentum = (0.8, 0.7)

    # Language-model stage: 4 epochs at learning rate 1e-2, then export the encoder.
    lm_learner = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.7)
    lm_learner.fit_one_cycle(4, 1e-2, moms=momentum)
    lm_learner.save_encoder('ft_enc')
    # Drop the reference so the language-model learner is not kept alive
    # while the classifier trains (the original rebound the same name).
    del lm_learner

    # Classifier stage: load the encoder, then four epochs per stage -- as
    # created, after freeze_to(-2), and after unfreeze().
    classifier = text_classifier_learner(data_clas2, arch=AWD_LSTM, drop_mult=0.7)
    classifier.load_encoder('ft_enc')
    classifier.fit_one_cycle(4, 1e-2, moms=momentum)

    classifier.freeze_to(-2)
    classifier.fit_one_cycle(4, slice(5e-3/2., 5e-3), moms=momentum)

    classifier.unfreeze()
    classifier.fit_one_cycle(4, slice(2e-3/100, 2e-3), moms=momentum)

    return classifier
    

In [0]:
# Train the aspect classifier. NOTE: this call also runs the 4-epoch
# language-model fit inside get_finetuned_aspect_classifier(); in the log
# below that first table accounts for almost all of the run time.
learn_aspect = get_finetuned_aspect_classifier()

epoch,train_loss,valid_loss,accuracy,time
0,4.056133,3.492874,0.369269,02:52
1,3.587591,3.259112,0.390728,02:52
2,3.328302,3.176452,0.399958,02:52
3,3.200045,3.158475,0.402593,02:52


epoch,train_loss,valid_loss,accuracy,time
0,1.40134,1.315203,0.381679,00:00


epoch,train_loss,valid_loss,accuracy,time
0,1.250894,1.220028,0.580153,00:00


epoch,train_loss,valid_loss,accuracy,time
0,1.164611,1.192525,0.633588,00:01


In [0]:
# Train the sentiment classifier. NOTE: get_finetuned_senti_classifier()
# rebuilds the data and repeats the 4-epoch language-model fit already run
# for the aspect classifier (first table in the log below).
learn_senti = get_finetuned_senti_classifier()

epoch,train_loss,valid_loss,accuracy,time
0,4.056039,3.483662,0.371129,02:52
1,3.582258,3.259999,0.391676,02:53
2,3.328262,3.179625,0.399691,02:54
3,3.193767,3.161952,0.402176,02:53


epoch,train_loss,valid_loss,accuracy,time
0,1.140165,1.019116,0.636364,00:00
1,1.035617,0.788968,0.647727,00:00
2,0.942678,0.744092,0.647727,00:00
3,0.875791,0.739287,0.659091,00:00


epoch,train_loss,valid_loss,accuracy,time
0,0.720607,0.71516,0.636364,00:00
1,0.726044,0.692565,0.693182,00:00
2,0.706991,0.692791,0.693182,00:00
3,0.672488,0.681399,0.693182,00:00


epoch,train_loss,valid_loss,accuracy,time
0,0.650281,0.672329,0.681818,00:01
1,0.62302,0.662663,0.693182,00:01
2,0.609017,0.656776,0.670455,00:01
3,0.605527,0.656167,0.681818,00:01


In [0]:
# Spot check: aspect class predicted for a sample headline.
learn_aspect.predict("Standard Chartered expected to pay just over $1 billion to resolve U.S., U.K. probes")

(Category Corporate, tensor(0), tensor([0.3373, 0.1855, 0.2541, 0.2230]))

In [0]:
# Spot check: aspect class predicted for a second sample headline.
learn_aspect.predict("Tesco leads leap in FTSE 100")

(Category Stock, tensor(3), tensor([0.2615, 0.1702, 0.2444, 0.3239]))

In [0]:
# Spot check: sentiment class predicted for a sample headline.
learn_senti.predict("Royal Mail chairman Donald Brydon set to step down")

(Category negative, tensor(0), tensor([0.6005, 0.0096, 0.3900]))

In [0]:
# Spot check: sentiment class for the headline also used in the aspect check above.
learn_senti.predict("Tesco leads leap in FTSE 100")

(Category positive, tensor(2), tensor([0.0782, 0.0051, 0.9167]))