In [1]:
import fastai
from fastai import *
from fastai.text import * 
from fastai.text.models import *
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score
import swifter
import nltk
from nltk.corpus import stopwords
import re 
from bs4 import BeautifulSoup
from sklearn.metrics import precision_score, recall_score
from nltk.stem.snowball import SnowballStemmer

In [2]:
stops={}

In [3]:
#PREPROCEESING THE TEXT

def text_preprocessing(text, language, minWordSize):
    
    # remove html
    text_html = BeautifulSoup(text,"html.parser" ).get_text()
    
    # remove non-letters
    text_non_letters = re.sub("[^a-zA-Z']", " ", text_html) 
        
    # convert to lower-case
    text_lower = text_non_letters.lower()
    
    # remove stop words
    text_no_stop_words = ' '
    for w in text_lower.split():
        if w not in stops:  
            text_no_stop_words = text_no_stop_words + w + ' '  
    
    # do stemming
    text_stemmer = ' '
    stemmer = SnowballStemmer(language)
    for w in text_no_stop_words.split():
        text_stemmer = text_stemmer + stemmer.stem(w) + ' '
         
    # remove short words
    text_no_short_words = ' '
    for w in text_stemmer.split(): 
        if len(w) >=minWordSize:
            text_no_short_words = text_no_short_words + w + ' '
            
    return text_no_short_words

In [4]:
#PREPROCEESING THE TEXT

def text_preprocessing_simple(text, language, minWordLength):
    
    # remove html
    text_html = BeautifulSoup(text,"html.parser" ).get_text()
    
    # remove non-letters
    text_non_letters = re.sub("[^a-zA-Z']", " ", text_html) 
        
    # convert to lower-case
    text_lower = text_non_letters.lower()
    return text_lower

In [25]:
def text_preprocessing_simple(text,language, minWordLength):
    text = text.encode('ascii', 'ignore').decode('ascii')
    text_non_letters = re.sub("[^a-zA-Z']", " ", text) 
        
    # convert to lower-case
    text_lower = text_non_letters.lower()
    return text_lower

In [6]:
def change_ratings(x):
    if x in [0, 1,2]:
        return -1
    elif x == 3:
        return 0
    else:
        return 1

In [7]:
def get_old_data():
    #load and prepare old data
    df_reviews = pd.read_csv('Input_data/Old_data_reviews.csv',sep=',',quotechar='"' ) #contains the reviews from Facebook and Tripadvisor
    df_cat_labels = pd.read_csv('Input_data/Old_data_categorisation_labels.csv',sep=',',quotechar='"') # contains the classlabels of the reviews
    df_merged_data = pd.merge(df_cat_labels,df_reviews,how='left',left_on='review_id',right_on='id')

    # Drop unnecessary columns
    df_merged_data = df_merged_data.drop(['labeler_id', 'post_type', 'datetime_posted','likes', 'traveler_type',
                        'rating_food','rating_service', 'rating_environment', 'rating_value',
                         'reviewer_id','source_subject_id','id_x','review_id','id_y','source'],axis=1)

    #select english reviews
    olddata =  df_merged_data.loc[df_merged_data['language'] == 'en']
    olddata['sentiment'] = olddata['rating'].apply(change_ratings)
    reviews_old = olddata['text'].values
    sentiment_old=olddata['sentiment'].values
    return reviews_old, sentiment_old

In [8]:
def get_new_data():
    #load and prepare new data
    newdata=pd.read_csv('Input_data/New_data.csv')
    #Split category column 
    newdata['category']=newdata['Categories'].apply(lambda x:x.split(';'))

    def category_includer(data,string):    
        for i in range(len(data)):
            data.loc[i,string]=0
            column=list(data.loc[i,'category'])
            if string in column:
                data.loc[i,string]=1
        return data

    newdata=category_includer(newdata,'experience')
    newdata=category_includer(newdata,'service')
    newdata=category_includer(newdata,'consistency')
    newdata=category_includer(newdata,'value')
    newdata=category_includer(newdata,'food')
    newdata=category_includer(newdata,'convenience')

    #dropping the two columns and filter only English
    newdata=newdata.drop(['Categories','category'],axis=1)
    newdata=newdata[newdata['Language']=="eng"]
    reviews_new= newdata['Text'].values
    sentiment_new = newdata['Sentiment'].values

    return reviews_new, sentiment_new

In [9]:
def get_new_translated_data():
    newdata = pd.read_csv('Input_data/newdata.csv')
    reviews_new= newdata['Text'].values
    sentiment_new = newdata['Sentiment'].values
    return reviews_new, sentiment_new

In [10]:
def get_all_data():
    reviews_old, sentiment_old = get_old_data()
    reviews_new, sentiment_new = get_new_data()
    reviews = list(reviews_old) + list(reviews_new)
    sentiment = list(sentiment_old) + list(sentiment_new)
    return reviews, sentiment

In [11]:
def run_preprocessing(preprocess_fn, reviews_train, reviews_test, language, minWordLength):
    for i in range(len(reviews_train)):
        reviews_train[i] = preprocess_fn(reviews_train[i], language, minWordLength)
    for i in range(len(reviews_test)):
        reviews_test[i] = preprocess_fn(reviews_test[i], language, minWordLength)

In [12]:
def preprocess_data(type_data="all", preprocessing="all"):
    #Get data
    if type_data == "all":
        reviews, sentiments = get_all_data()
    elif type_data == "old":
        reviews, sentiments = get_old_data()
    else:
        reviews, sentiments = get_new_translated_data()

    #test train split
    reviews_train, reviews_test, sentiment_train, sentiment_test = train_test_split(reviews, sentiments, test_size=0.2,random_state=0,stratify=sentiments)

    #bag of words
    language = 'english'
    minWordLength = 2 # shorter words will be removed
    
    if preprocessing =="all":
        run_preprocessing(text_preprocessing, reviews_train, reviews_test, language, minWordLength)
    else:
        run_preprocessing(text_preprocessing_simple, reviews_train, reviews_test, language, minWordLength)
    return reviews_train, reviews_test, sentiment_train, sentiment_test


In [13]:
def get_accuracy_matrix(learn, predictions, targets, losses):
    interp = ClassificationInterpretation(learn, predictions, targets, losses)
    
    accuracy_matrix=pd.crosstab(predictions, targets)
    accuracy_matrix2=accuracy_matrix.copy()
    accuracy_matrix2=accuracy_matrix2.rename(columns={0:-1,1:0,2:1})
    accuracy_matrix2=accuracy_matrix2.rename(index={0: -1, 1:0, 2: 1})
    accuracy_matrix2 = accuracy_matrix2.T

    precision_minus_1=accuracy_matrix[0][0]/accuracy_matrix.sum(1)[0]
    precision_0 = accuracy_matrix[1][1]/accuracy_matrix.sum(1)[1]
    precision_1 = accuracy_matrix[2][2]/accuracy_matrix.sum(1)[2]
    precision=[precision_minus_1,precision_0,precision_1]
    recall_minus_1 = accuracy_matrix[0][0]/accuracy_matrix.sum(0)[0]
    recall_0 = accuracy_matrix[1][1]/accuracy_matrix.sum(0)[1]
    recall_1 = accuracy_matrix[2][2]/accuracy_matrix.sum(0)[2]
    recall=[recall_minus_1,recall_0,recall_1]
    weighted_precision = (precision[0]*accuracy_matrix.sum(0)[0]+ precision[1]*accuracy_matrix.sum(0)[1]+ precision[2]*accuracy_matrix.sum(0)[2])/sum(accuracy_matrix.sum(0))
    weighted_recall = (recall[0]*accuracy_matrix.sum(1)[0]+ recall[1]*accuracy_matrix.sum(1)[1]+ recall[2]*accuracy_matrix.sum(1)[2])/sum(accuracy_matrix.sum(1))
    return interp, weighted_precision, weighted_recall

In [26]:
def train_model(type_data="all", preprocessing="all"):
    reviews_train, reviews_test, sentiment_train, sentiment_test = preprocess_data(type_data, preprocessing)
    newdata = pd.read_pickle('newdata.pkl')
    newdata = newdata[newdata['Language']=='eng']
    reviews_new = list(newdata['Text'].values)
    sentiments_new = list(newdata['Sentiment'].values)
#     newdata = pd.DataFrame(list(zip(reviews_new, sentiments_new)), columns=['text', 'labels'])
    # Create language databunch
    data_lm = TextLMDataBunch.from_df(train_df = pd.DataFrame({'text': reviews_train, 'labels': sentiment_train}), 
                                      valid_df = pd.DataFrame({'text': reviews_test, 'labels':sentiment_test}), 
                                      test_df = pd.DataFrame({'text': reviews_new, 'labels':sentiments_new}),
                                      path = "",
                                      text_cols=0,
                                      label_cols=1)

    # Create classifier databunch
    data_clas = TextClasDataBunch.from_df(path = "", 
                                          train_df = pd.DataFrame({'text': reviews_train, 'labels': sentiment_train}),
                                          valid_df = pd.DataFrame({'text': reviews_test, 'labels': sentiment_test}), 
                                          test_df = pd.DataFrame({'text': reviews_new, 'labels':sentiments_new}),
                                          vocab=data_lm.train_ds.vocab, 
                                          text_cols=0,
                                          bs=16,
                                          label_cols=1)
    learn = language_model_learner(data_lm, AWD_LSTM, pretrained=True, drop_mult=0.3)
    learn.fit_one_cycle(1, 1e-2, moms=(0.8,0.7))
    learn.unfreeze()
    learn.fit_one_cycle(5, 1e-3, moms=(0.8,0.7))
    learn.save_encoder('fine_tuned_enc')
    learn = text_classifier_learner(data_clas, AWD_LSTM,drop_mult=0.2);
    learn.load_encoder('fine_tuned_enc')
    learn.fit_one_cycle(1, 2e-2, moms=(0.8,0.7))
    learn.freeze_to(-2)
    learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2), moms=(0.8,0.7))
    learn.freeze_to(-3)
    learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3), moms=(0.8,0.7))
    learn.unfreeze()
    learn.fit_one_cycle(5, slice(1e-3/(2.6**4),1e-3), moms=(0.8,0.7))
    preds, targets, losses = learn.get_preds(with_loss=True)
    predictions = np.argmax(preds, axis = 1)

    print(type_data, "-", preprocessing)
    accuracy_matrix = ClassificationInterpretation(learn, preds, targets, losses)
    print("Validation Dataset")
    print("Accuracy Score", accuracy_score(targets, predictions))
    print("Weighted Precision", precision_score(targets, predictions, average='weighted'))
    print("Weighted Reccall", recall_score(targets, predictions, average='weighted'))

    preds, targets, losses = learn.get_preds(ds_type=DatasetType.Test, with_loss=True)
    predictions = np.argmax(preds, axis = 1)
    pred_map = {0: -1, 1: 0, 2: 1}
    predictions = [pred_map[p.item()] for p in predictions]

    accuracy_matrix = ClassificationInterpretation(learn, preds,  newdata['Sentiment'], losses)
    print("Test Dataset")
    print("Accuracy Score", accuracy_score( newdata['Sentiment'], predictions))
    print("Weighted Precision",precision_score( newdata['Sentiment'], predictions, average='weighted'))
    print("Weighted Reccall", recall_score( newdata['Sentiment'], predictions, average='weighted'))
    return learn, targets, predictions

In [27]:
#, accuracy_matrix, weighted_precision, weighted_recall, accuracy_matrix_test, weighted_precision_test, weighted_recall_test
learn, targets, predictions = train_model("new","simple")

epoch,train_loss,valid_loss,accuracy,time
0,5.316602,4.818934,0.182891,00:16


epoch,train_loss,valid_loss,accuracy,time
0,4.748326,4.636548,0.202705,00:17
1,4.574528,4.502439,0.215901,00:17
2,4.367606,4.447746,0.219262,00:17
3,4.17667,4.432954,0.221941,00:18
4,4.046043,4.439981,0.220917,00:18


epoch,train_loss,valid_loss,accuracy,time
0,0.487891,0.42773,0.840822,01:02


epoch,train_loss,valid_loss,accuracy,time
0,0.449142,0.390078,0.860899,01:11


epoch,train_loss,valid_loss,accuracy,time
0,0.426225,0.364185,0.866635,01:12


epoch,train_loss,valid_loss,accuracy,time
0,0.288641,0.35425,0.870459,01:09
1,0.269143,0.376353,0.872371,01:08
2,0.213044,0.409073,0.858987,01:06
3,0.165524,0.433009,0.864723,01:16
4,0.119819,0.424523,0.862811,01:19


new - simple
Validation Dataset
Accuracy Score 0.862810707456979
Weighted Precision 0.856409939612923
Weighted Reccall 0.862810707456979


Test Dataset
Accuracy Score 0.927789046653144
Weighted Precision 0.925484266871039
Weighted Reccall 0.927789046653144


In [19]:
learn.export('ulmfit_93_precision_93_recall')

In [28]:
preds, targets, losses = learn.get_preds(ds_type=DatasetType.Test, with_loss=True)

In [39]:
pred_map = {0: -1, 1: 0, 2: 1}

In [43]:
predictions = [pred_map[p.item()] for p in predictions]

In [45]:
print("Accuracy Score", accuracy_score(newdata['Sentiment'], predictions))

Accuracy Score 0.9324543610547668


In [22]:
newdata = pd.read_pickle('newdata.pkl')
newdata = newdata[newdata['Language']=='eng']
reviews_new = list(newdata['Text'].values)
sentiments_new = list(newdata['Sentiment'].values)

In [27]:
predictions.unique()

tensor([0, 1, 2])

## New data for training, with stemming

In [None]:
learn=train_model("new", "all")

## Old data for training, No stemming and no stop word filters

In [None]:
learn=train_model("old", "simple")

## Old data for training, With stemming and stop words filter

In [None]:
learn= train_model("old", "all")

## All data, no stemming and no stop words filter

In [None]:
learn = train_model("all", "simple")

## All data, with stemming and stop words filters

In [None]:
learn = train_model("all", "all")