In [1]:
import re                                       # regular expressions
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# To do preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# To do preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')

# from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np # To compute the average results

from random import shuffle # To shuffle the dataset


# To use feature selection in the Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

from collections import Counter
import pandas as pd
import gensim

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


Load data file

In [6]:
filepath = 'topic_with_sentiment.csv'

In [7]:
data = pd.read_csv(filepath, sep=",", error_bad_lines=False, encoding='latin-1')

In [9]:
data.head()

Unnamed: 0,Text,Topic_assigned,Topic_Percentage,Ground_Truth,Ground_Truth_Sentiment
0,pleasure ing 7 night recently perfect every wa...,Service,0.234,Facilities Comfort Food Staff Location Service,5
1,lovely first visit iconic bar wonderful servic...,Service,0.2886,Food Price Service,5
2,3 u ed rhodes_hotel 4 night location take padd...,Price,0.1645,Facilities Cleanliness Staff Location,4
3,form moment_arrive left experienced absolute_p...,Service,0.2637,Food Staff Service,5
4,well strange 5star new come along eager try he...,Staff,0.2831,Facilities Cleanliness Food Price Staff Service,1


Cleaning data (remove minor reviews, replace 5 labels with 3 labels)

In [10]:
data = data[data.Ground_Truth_Sentiment != 0]
data = data[data.Ground_Truth_Sentiment != 6]
data = data[data.Ground_Truth_Sentiment != 7]
data = data[data.Ground_Truth_Sentiment != 8]
data = data[data.Ground_Truth_Sentiment != 9]
data = data[data.Ground_Truth_Sentiment != 10]

In [11]:
data['Ground_Truth_Sentiment'] = data['Ground_Truth_Sentiment'].replace(1, 0)
data['Ground_Truth_Sentiment'] = data['Ground_Truth_Sentiment'].replace(2, 0)

In [12]:
data['Ground_Truth_Sentiment'] = data['Ground_Truth_Sentiment'].replace(3, 1)

In [13]:
data['Ground_Truth_Sentiment'] = data['Ground_Truth_Sentiment'].replace(4, 2)
data['Ground_Truth_Sentiment'] = data['Ground_Truth_Sentiment'].replace(5, 2)

In [14]:
data['Ground_Truth_Sentiment'].value_counts()

2    381598
1     98710
0     39253
Name: Ground_Truth_Sentiment, dtype: int64

In [33]:
def preProcess(text):
    # should return a list of tokens
    
    # word tokenisation, including punctuation removal
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(str(text))
    
    # lowercasing
    tokens = [t.lower() for t in tokens]

    # stopword removal- benefits are it removes rare words, though bad for bigram relations
    stop = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop]
    
    # lemmatisation
    lemmatiser = WordNetLemmatizer()
    tokens = [lemmatiser.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t] # ensure no empty space

    # remove numbers
    digits = '0123456789'
    tokens = [t for t in tokens if t not in digits]

    #bigram_mod = create_bigrams(tokens)
    #tokens = [bigram_mod[line] for line in tokens]
    
    return tokens

In [None]:
# if use 2 feature only (text with labels)
#def toFeatureVector(words):
    # return a dictionary 'featureVect' where the keys are the tokens in 'words' and the values are the number of occurrences of the tokens
    # start by using binary values only:
#    counts = Counter(words)
#    return {w: counts[w]/sum(counts.values()) for w in counts.keys()}#{w: 1.0/len(words) for w in words}

In [35]:
# use 3 features (text with labels and pro. of topics)
featureDict = {} # the global feature dictionary

def toFeatureVector(tokens, topics):
    # return a dictionary 'featureVect' where the keys are the tokens in 'words' and the values are the number of occurrences of the tokens
    # start by using binary values only:
#     baseDict = {}
    featureVec = {}

    for w in tokens:
        try:
            featureVec[w] += 1.0/len(tokens)
        except KeyError:
            featureVec[w] = 1.0/len(tokens)
        try:
            featureDict[w] += 1.0/len(tokens)
        except KeyError:
            featureDict[w] = 1.0/len(tokens)
    
    # just get bigram binary presence or not
    for i in range(1, len(tokens)):
        bigram = tokens[i-1] + " " + tokens[i]
        try:
            featureVec[bigram] = 1 #+= 1.0/len(tokens)
        except KeyError:
            featureVec[bigram] = 1 #= 1.0/len(tokens)
        try:
            featureDict[bigram] += 1.0
        except KeyError:
            featureDict[bigram] = 1.0

    featureVec['Topics:'+str(topics)] = 1.0 #0.3
        
    try:
        featureVec['topics:'+topics] += 1.0
    except KeyError:
         featureDict['topics:'+topics] = 1.0
    
    return featureVec

In [36]:
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (Text,label,t) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text), t),label))
    for (Text,label,t) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text), t),label))

In [37]:
#if use 2 features only
#def splitData(percentage):
#    dataSamples = len(rawData)
#    halfOfData = int(len(rawData)/2)
#    trainingSamples = int((percentage*dataSamples)/2)
#    for (Text,label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
#        trainData.append((toFeatureVector(preProcess(Text)),label))
#    for (Text,label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
#        testData.append((toFeatureVector(preProcess(Text)),label))

In [38]:
# TRAINING AND VALIDATING OUR CLASSIFIER

def trainClassifier(trainData, classifier_name):
    print("Training Classifier...")
    pipeline =  Pipeline([('tfidf', TfidfTransformer()),('chi2', SelectKBest(chi2, k=20000)),('cl', classifier_name)])
    #pipeline = Pipeline([('tfidf', TfidfTransformer()), ('sampling', SMOTE()),('classification', classifier_name)])
    return SklearnClassifier(pipeline).train(trainData)

In [39]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(text, classifier):
    return classifier.classify(toFeatureVector(preProcess(text)))

In [40]:
def crossValidate(dataset, folds, classifier_name):
    shuffle(dataset)
    results = []
    foldSize = int(len(dataset)/folds)

    
    for i in range(0,len(dataset),foldSize):
        # insert code here that trains and tests on the 10 folds of data in the dataset
        print("Fold start on items %d - %d" % (i, i+foldSize))
        myTestData = dataset[i:i+foldSize]
        myTrainData = dataset[:i] + dataset[i+foldSize:]
        classifier = trainClassifier(myTrainData, classifier_name)
        y_true = list(map(lambda x: x[1], myTestData))
        y_pred = predictLabels(myTestData, classifier)
        results.append(precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0))
        
    avgResults = list(map(np.mean,list(zip(*results))[:3]))
    return avgResults

Choose subgroup

In [24]:
topic = ['Staff, Location', 
         'Location', 
         'Comfort, Clealiness', 
         'Facilities', 
         'Location, Food', 
         'Food',
         'Staff', 
         'Service', 
         'Price']

In [27]:
subgroup = data[data.Topic_assigned == topic[0]]

Prepare Data to train


In [None]:
subgroup['Ground_Truth_Sentiment'] = subgroup['Ground_Truth_Sentiment'].apply(str)
subgroup['Topic_Percentage'] = subgroup['Topic_Percentage'].apply(str)

In [30]:
x = subgroup['Text'].values.tolist()
topics = subgroup['Topic_Percentage'].values.tolist()
label = subgroup['Ground_Truth_Sentiment'].values.tolist()

In [42]:
rawData = []

#if use 2 features
#for sent, l in zip(x, label):   
#    rawData.append((sent, l))

for sent, l, t in zip(x, label, topics):   
    rawData.append((sent, l, t))

In [43]:
trainData = [] # the training data as a percentage of the total dataset
testData = [] # the test data as a percentage of the total dataset
# references to the data files

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
print("Preparing the dataset...")
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
print("Preparing training and test data...")
splitData(0.8)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))

Now 80021 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 80021 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 80021 rawData, 64016 trainData, 16005 testData


In [44]:
#svc = LinearSVC(max_iter=10000)
svm = SVC(class_weight='balanced')
dt = DecisionTreeClassifier(class_weight='balanced')
rf = RandomForestClassifier(class_weight='balanced')
nb = MultinomialNB()

In [45]:
svm_c = crossValidate(trainData, 10, svm)
print('Finished training SVC classifier!')

dt_c = crossValidate(trainData, 10, dt)
print('Finished training Decision Tree classifier!')

rf_c = crossValidate(trainData, 10, rf)
print('Finished training Random Forest classifier!')

nb_c = crossValidate(trainData, 10, nb)
print('Finished training Naive Bayes classifier!')

Fold start on items 0 - 6401
Training Classifier...
Fold start on items 6401 - 12802
Training Classifier...
Fold start on items 12802 - 19203
Training Classifier...
Fold start on items 19203 - 25604
Training Classifier...
Fold start on items 25604 - 32005
Training Classifier...
Fold start on items 32005 - 38406
Training Classifier...
Fold start on items 38406 - 44807
Training Classifier...
Fold start on items 44807 - 51208
Training Classifier...
Fold start on items 51208 - 57609
Training Classifier...
Fold start on items 57609 - 64010
Training Classifier...
Fold start on items 64010 - 70411
Training Classifier...
Finished training SVC classifier!
Fold start on items 0 - 6401
Training Classifier...
Fold start on items 6401 - 12802
Training Classifier...
Fold start on items 12802 - 19203
Training Classifier...
Fold start on items 19203 - 25604
Training Classifier...
Fold start on items 25604 - 32005
Training Classifier...
Fold start on items 32005 - 38406
Training Classifier...
Fold star

In [46]:
def create_result_table(classifier1, classifier2, classifier3, classifier4):
    model_scores = pd.DataFrame({'Support Vector Machine':[classifier1[0], classifier1[1], classifier1[2]],
                                       
                                 'Decision Tree':[classifier2[0], classifier2[1], classifier2[2]],
                                       
                                 'Random Forest':[classifier3[0], classifier3[1], classifier3[2]],
                                       
                                 'Naive Bayes':[classifier4[0], classifier4[1], classifier4[2]]},
                                      
                                  index=['Precision', 'Recall', 'F1 Score'])
    
    # Add 'Best Score' column
    model_scores['Best Score'] = model_scores.idxmax(axis=1)

    return model_scores

In [47]:
model_scores = create_result_table(svm_c, dt_c, rf_c, nb_c)

In [48]:
model_scores.head()

Unnamed: 0,Support Vector Machine,Decision Tree,Random Forest,Naive Bayes,Best Score
Precision,0.803571,0.823984,0.812713,0.766177,Decision Tree
Recall,0.791825,0.802644,0.862905,0.875207,Naive Bayes
F1 Score,0.795794,0.812828,0.8344,0.817021,Random Forest


For testing

In [None]:
testTrue = list(map(lambda t: t[1], testData))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
svm_classifier = trainClassifier(trainData, svm)
svm_testPred = predictLabels(testData, svm_classifier)
svm_acc = accuracy_score(testTrue, svm_testPred)
print(svm_acc)

dt_classifier = trainClassifier(trainData, dt)
dt_testPred = predictLabels(testData, dt_classifier)
dt_acc = accuracy_score(testTrue, dt_testPred)
print(dt_acc)

rf_classifier = trainClassifier(trainData, rf)
rf_testPred = predictLabels(testData, rf_classifier)
rf_acc = accuracy_score(testTrue, rf_testPred)
print(rf_acc)

nb_classifier = trainClassifier(trainData, nb)
nb_testPred = predictLabels(testData, nb_classifier)
nb_acc = accuracy_score(testTrue, nb_testPred)
print(nb_acc)

Training Classifier...
0.6922115785089547
Training Classifier...
0.6583090379008746
Training Classifier...
0.7163681799250312
Training Classifier...
0.7255310287380258


In [None]:
from joblib import dump, load
dump(classifier, 'Price_SVC.joblib') 

In [None]:
clf = load('filename.joblib') 