# Sentiment Analysis

## Learning Objectives:
1. How to prepare data for machine learning, i.e., vectrize
1. How to learn ac machine learning classifier
1. How to apply a machine learning classifier
1. How to evaluate a machine learning classifier

### Process:
1. load dataset
1. Analysis of data
1. Create Feature Vector
1. Vectorize data
1. Learn Machine Learning Classifier
1. Apply Machine Learning Classifier
1. Evaluate Classifier


# Setup

In [443]:
import os, sys
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+[-]?\w+')

import scipy
from scipy.sparse import dok_matrix

import sklearn
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.model_selection import cross_val_score

# Machine Learning Algorithms
from sklearn.naive_bayes import GaussianNB

In [444]:
# Checking versions
print('Version check:\n--------------')
print('Pandas v',pd.__version__)
print('NLTK v',nltk.__version__)
print('SkiPy v',scipy.__version__)
print('SKLearn v',sklearn.__version__)

Version check:
--------------
Pandas v 0.22.0
NLTK v 3.2.4
SkiPy v 0.19.1
SKLearn v 0.19.1


##### How to download NLTK packages
Replace *URL*, *USERNAME*, and *PASSWORD* if you need to configure a proxy.

Then, uncomment the lines and run it.

*Please note that a separate windows will popup. Select the appropriate package.*

In [445]:
# nltk.set_proxy('http://gate-zrh-os.swissre.com:8080', ('<USERNAME>', '<PASSWORD>'))
# nltk.download()

## Utilities

In [446]:
def writeResults(dfResults, sFilename, sPrefix='', sPostfix=''):
    fnOut = sFilename
    if sPrefix:
        fnOut = sPrefix + fnOut
    if sPostfix:
        fnOut = fnOut + sPostfix
        
    filepath = outDirectory + fnOut
    dfResults.to_excel(filepath)
    print('Results haven been written to ', filepath)

# Load Dataset

In [447]:
# Filepath to dataset
fpDataset = './data/customerfeedback.xlsx'

#Load Excel file into a DataFrame
dfExcelWorkbook = pd.read_excel(fpDataset, sheet_name=None)
sheets = list(dfExcelWorkbook.keys())
dfData = dfExcelWorkbook[sheets[0]]

# Prepare directory to output results
outDirectory = './result/'
if not os.path.exists(outDirectory):
    os.makedirs(outDirectory)


In [448]:
# Check the read dataset 
dfData.head(10)

Unnamed: 0,FEEDBACK,RATING
0,never got clean glasses in Warsaw either.,0
1,The bed in the Radisson Bleu was not comfortab...,1
2,Michael was an excellent tour director. He wen...,1
3,Krakow Hotel was below my expectations because...,0
4,All the city tour guides have been excellent a...,1
5,The bed in the Radisson Bleu was not comfortab...,0
6,The Prague hotel should provide in-room intern...,0
7,Michael (Tour Director) was brilliant! Thomas ...,1
8,The entire voyage was very well done by Viking...,0
9,Michael was excellent. The Prague hotel should...,1


## Stats and Infos
Some info about the data

In [449]:
print('Number of attributes:', dfData.shape[1])
print('Name of attributes:', dfData.columns)
print('Number of rows:', dfData.shape[0])


Number of attributes: 2
Name of attributes: Index(['FEEDBACK', 'RATING'], dtype='object')
Number of rows: 28448


# Feacture Vector

## Build Feature Vector

In [450]:
def countTokens(tokens):
    results = {}

    for token in tokens:
        if token not in results:
            results[token] = 1
        else:
            results[token] = results[token] + 1
    return results

In [451]:
def extractTokens(strText):
    return tokenizer.tokenize(strText)

In [452]:
featureColumns = ['feature', 'positives', 'negatives']
dfFeatures = pd.DataFrame(columns=featureColumns)
colFeedback = 'FEEDBACK'
colRating = 'RATING'

features = {}
for index in range(0, 100): #dfData.shape[0]):
    # get feedback
    feedback = dfData.iloc[index][colFeedback]
    rating = dfData.iloc[index][colRating]
    
    # analyze feedback
    tokens = extractTokens(feedback)
    featurecount = countTokens(tokens)
    
    # add to feature list
    for feature in featurecount.keys():
        if feature not in features:
            features[feature] = {'positives': 0, 'negatives': 0}
        if rating == 0:
            features[feature]['negatives'] = features[feature]['negatives'] + 1
        elif rating != 0:
            features[feature]['positives'] = features[feature]['positives'] + 1   
          
# create and beautify
dfFeatures = pd.DataFrame.from_dict(features, orient='index')
dfFeatures = dfFeatures.reset_index()
dfFeatures = dfFeatures.rename({'index':'feature'}, axis=1)

### Analyze Features

In [453]:
# Count number of times freature occures
dfFeatures['support'] = dfFeatures.apply(lambda x: x['positives'] + x['negatives'], axis=1)

# Compute sentiment value feature
dfFeatures['sentiment'] = dfFeatures.apply(lambda x: x['positives'] / x['support'], axis=1)

dfFeatures.head(20)

fnFeaturesAll = 'allfeatures.xlsx'
writeResults(dfFeatures, fnFeaturesAll)

Results haven been written to  ./result/allfeatures.xlsx


## Load Feature Vector

In [454]:
dfExcelWorkbook = pd.read_excel(outDirectory + fnFeaturesAll, sheet_name=None)
sheets = list(dfExcelWorkbook.keys())
dfFeatures = dfExcelWorkbook[sheets[0]]

In [455]:
print('Feature Stats:')
print('Number of Features:', dfFeatures.shape[0])
dfFeatures.head(20)

Feature Stats:
Number of Features: 691


Unnamed: 0,feature,positives,negatives,support,sentiment
0,1-2,0,1,1,0.0
1,15,1,1,2,0.5
2,1A,0,1,1,0.0
3,2nd,0,1,1,0.0
4,30,1,1,2,0.5
5,69,0,1,1,0.0
6,AC,0,1,1,0.0
7,Adam,1,0,1,1.0
8,All,3,0,3,1.0
9,Alpendurada,1,3,4,0.25


## Feature Selection

In [456]:
dfFeatureVector = dfFeatures.sort_values(by='support', ascending=False).head(20)
dfFeatureVector.head(10)

Unnamed: 0,feature,positives,negatives,support,sentiment
660,was,31,18,49,0.632653
599,the,24,15,39,0.615385
616,to,16,21,37,0.432432
142,and,22,14,36,0.611111
366,in,14,11,25,0.56
650,very,18,7,25,0.72
458,of,17,7,24,0.708333
622,tour,14,6,20,0.7
98,The,12,7,19,0.631579
281,excellent,17,2,19,0.894737


# Prepare Trainingset

## Create Instances

An instance from a text is used to train a machine learning model or to classify the text. The instance is a vector representation of a text based on the given feature vector.

In [457]:
def createInstance(strText, dfFeatures=dfFeatureVector):
    result = []
    
    for feature in dfFeatureVector['feature']:
        if (feature in strText):
            result.append(1)
        else:
            result.append(0)
    
    return result

In [458]:
trainSet = []
trainSetLabels = []
for index, row in dfData[1:100].iterrows():
    instance = createInstance(row['FEEDBACK'])
    trainSet.append(instance)
    trainSetLabels.append(row['RATING'])
    
m = dok_matrix(trainSet)

In [459]:
# Check result
trainSet[1:10]

[[1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1],
 [1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1],
 [1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0],
 [1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0],
 [1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0],
 [1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
 [1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1]]

## Feature Selection

In [460]:
#X2TrainInstances = SelectKBest(chi2, k=10).fit_transform(trainSet, trainSetLabels)

# Machine Learning

In [461]:
# Naive Bayes (NB)
nb = GaussianNB()
classifier = nb.fit(trainSet,trainSetLabels)

# Support Vector Machines (SVM)
# svm = SVC()
# classifier = svm.fit(m,trainSetLabels)

# ADA Boost
# adaBoost = AdaBoostClassifier(n_estimators=200)
# classifier = adaBoost.fit(m,trainSetLabels)

# Random Forest
# randomForest = RandomForestClassifier(n_estimators=20)
# classifier = randomForest.fit(trainSet,trainSetLabels)

## Quick Analysis

In [463]:
print(classifier.score(trainSet,trainSetLabels))

0.7878787878787878
GaussianNB(priors=None)


# Sentiment Analysis

# Evaluation

In [478]:
print('Score per fold:', scoresAccuracy)
score = cross_val_score(classifier, trainSet, trainSetLabels, cv=10)
print('Avg. Score:', score.mean())

Precision_score = cross_val_score(classifier, trainSet, trainSetLabels, cv=10, scoring='precision')
print('Precision:', Precision_score.mean())
Recall_score = cross_val_score(classifier, trainSet, trainSetLabels, cv=10, scoring='recall')
print('Recall:', Recall_score.mean())
F1_score = cross_val_score(classifier, trainSet, trainSetLabels, cv=10, scoring='f1')
print('F1:', F1_score.mean())



Score per fold: [0.72727273 0.45454545 0.90909091 0.81818182 0.7        0.55555556
 0.77777778 0.55555556 0.77777778 0.77777778]
Avg. Score: 0.7053535353535354
Precision: 0.8279761904761905
Recall: 0.7285714285714284
F1: 0.7522230710466004


## General Utilities

In [82]:
def preprocessWord(word, aPoSType='n', lemmatize=True, stem=False):
    ### Lowercase
    word = word.lower()
    
    
    ### Lemmatize
    if lemmatize:
        word = lemmatizer.lemmatize(word, aPoSType)
        
    ### Stemming
    if stem:
        word = stemmer.stem(word)
        
    return word

In [83]:
def preprocessSentence(strSentence):
    result = ''
    words = tokenizer.tokenize(strSentence)
    for word in words:
        word = preprocessWord(word)
        result += word + ' '
        
    return result.strip()