# Sentiment Analysis

## Learning Objectives:
1. How to prepare data for machine learning, i.e., feature selection
1. How to learn a machine learning classifier
1. How to apply a machine learning classifier
1. How to evaluate a machine learning classifier

### Process:
1. load dataset
1. analyze dataset
1. create feature vector
1. vectorize data
1. learn machine learning classifier
1. evaluate classifier
1. apply machine learning classifier


# Setup

## Install Dependencies

In [None]:
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install xlrd
!{sys.executable} -m pip install nltk
!{sys.executable} -m pip install sklearn
!{sys.executable} -m pip install openpyxl
!{sys.executable} -m pip install scipy


## Import Dependencies

In [None]:
################################
# Required
################################
import os, sys
import pandas as pd

# Feature Creation
from nltk.tokenize import RegexpTokenizer
#tokenizer = RegexpTokenizer(r'\w+[-]?\w+')

# Machine Learning Algorithms
from sklearn.naive_bayes import GaussianNB
# More at http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

# Evaluation
from sklearn.model_selection import cross_val_score

################################
# Optional
################################
import re #RegEx

# Improve feature creation
## Natural Language Processing module
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Improve feature selection
import sklearn
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Machine Learning
import scipy
from scipy.sparse import dok_matrix

In [None]:
# Report the versions of the core libraries used in this notebook
print('Version check:\n--------------')
for label, module in (('Pandas v', pd), ('NLTK v', nltk), ('SKLearn v', sklearn)):
    print(label, module.__version__)

### How to download NLTK packages
Replace *URL*, *USERNAME*, and *PASSWORD* if you need to configure a proxy.

Then, uncomment the lines and run it.

*Please note that a separate window will pop up. Select the appropriate package there.*

In [None]:
# nltk.set_proxy('<URL>', ('<USERNAME>', '<PASSWORD>'))  # e.g. URL = 'http://proxy.example.com:8080'
# nltk.download()

## Utilities

In [None]:
# Saving DataFrame as EXCEL for review
def writeResults(dfResults, sFilename, sPrefix='', sPostfix=''):
    """Write a DataFrame to an Excel file inside the output directory.

    The file name is assembled as ``sPrefix + sFilename + sPostfix`` and placed
    in the module-level ``outDirectory``, which must be defined before this
    function is called.

    Args:
        dfResults: Object providing ``to_excel(path)`` (a pandas DataFrame).
        sFilename: Base file name, e.g. ``'allfeatures.xlsx'``.
        sPrefix: Optional text put in front of the file name.
        sPostfix: Optional text appended after the file name (that is, after
            any extension already contained in ``sFilename``).
    """
    # Empty/None prefix and postfix contribute nothing to the name
    fnOut = (sPrefix or '') + sFilename + (sPostfix or '')

    # os.path.join works whether or not outDirectory ends with a separator
    filepath = os.path.join(outDirectory, fnOut)
    dfResults.to_excel(filepath)
    print('Results have been written to ', filepath)

# Load Dataset

In [None]:
# Location of the customer-feedback workbook
fpDataset = './data/customer-feedback_full_cleaned_1000.xlsx'

# sheet_name=None yields a dict {sheet name: DataFrame}; the first sheet holds the data
dfExcelWorkbook = pd.read_excel(fpDataset, sheet_name=None)
sheets = [sheetName for sheetName in dfExcelWorkbook]
firstSheet = sheets[0]
dfData = dfExcelWorkbook[firstSheet]

# Make sure the directory for result files exists
outDirectory = './result/'
outDirectoryMissing = not os.path.exists(outDirectory)
if outDirectoryMissing:
    os.makedirs(outDirectory)

In [None]:
# Check dataset: display the first 10 rows as a quick visual sanity check
dfData.head(10)

## Stats and Infos
Some info about the data

In [None]:
# Basic facts about the dataset: dimensions, column names and the mean rating
nRows, nAttributes = dfData.shape
print('Number of attributes:', nAttributes)
print('Name of attributes:', dfData.columns)
print('Number of rows:', nRows)
# NOTE(review): this prints the mean of RATING; it is the share of positives
# only if RATING is coded 0/1 - confirm against the dataset.
print('Positives/Negatives:', dfData['RATING'].mean())


# Feature Vector

## Build Feature Vector
Machine learning requires that the features (input) correlate with the target class (output). For that reason, we need to define the features that we want to use for machine learning. We use words as features because they correlate with sentiment.

### Result
* **dfFeatures**: DataFrame containing a list of features (words). For each feature, it contains the frequency and the average sentiment.

In [None]:
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# MODIFY THIS METHOD TO WIN
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# HINT: e.g., TF/IDF
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

# counts the tokens in a list of tokens.
def countTokens(tokens):
    """Count how often each token occurs.

    Args:
        tokens: Iterable of hashable tokens (e.g. the words of one feedback).

    Returns:
        A plain dict mapping every distinct token to its number of
        occurrences, keyed in order of first appearance.
    """
    # Function-scope import keeps this cell self-contained
    from collections import Counter

    # Counter does the tallying; convert back so callers still get a plain dict
    return dict(Counter(tokens))

In [None]:
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# MODIFY THIS METHOD TO WIN
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# HINT: stopwords, lemmatization, stemming, named entity, lowercase, word combination (e.g, 'not good'), adjectives, etc. 
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

# extract features from a text
def extractTokens(strText):
    """Split a text into its whitespace-separated tokens.

    Runs of whitespace count as one separator, and leading/trailing whitespace
    produces no tokens, so the result never contains empty strings.

    Args:
        strText: The text to tokenize.

    Returns:
        List of non-empty tokens in order of appearance; ``[]`` for an empty
        or whitespace-only text.
    """
    # features = tokenizer.tokenize(strText)
    # The raw string avoids the invalid '\s' escape. An empty token must not
    # become a feature: '' is a substring of every text.
    return [token for token in re.split(r'\s+', strText) if token]

In [None]:
featureColumns = ['feature', 'positives', 'negatives']
colFeedback = 'FEEDBACK'
colRating = 'RATING'

# For every token: in how many positive and negative feedbacks it appears
features = {}
for index, row in dfData.iterrows():
    # Read from the row itself: `index` is a label, so using it as a position
    # (dfData.iloc[index]) is only correct for a default 0..n-1 index.
    feedback = row[colFeedback]
    rating = row[colRating]

    # analyze feedback
    tokens = extractTokens(str(feedback))
    featurecount = countTokens(tokens)

    # A rating of 0 is negative; every other rating counts as positive
    sentimentKey = 'negatives' if rating == 0 else 'positives'

    # add to feature list: each feedback counts once per distinct token
    for feature in featurecount:
        if feature not in features:
            features[feature] = {'positives': 0, 'negatives': 0}
        features[feature][sentimentKey] += 1

# create and beautify: one row per token, with the token in a 'feature' column
dfFeatures = pd.DataFrame.from_dict(features, orient='index')
dfFeatures = dfFeatures.reset_index()
dfFeatures = dfFeatures.rename({'index':'feature'}, axis=1)

### Analyze Features

In [None]:
# Count the number of feedbacks in which each feature occurs.
# Column arithmetic replaces the row-by-row apply(): same values, one vectorized pass.
dfFeatures['support'] = dfFeatures['positives'] + dfFeatures['negatives']

# Compute the sentiment value of each feature: share of positive feedbacks
# among all feedbacks that contain it (support is at least 1 for every feature).
dfFeatures['sentiment'] = dfFeatures['positives'] / dfFeatures['support']
fnFeaturesAll = 'allfeatures.xlsx'
writeResults(dfFeatures, fnFeaturesAll)

# Display the 20 most frequent features
dfFeatures.sort_values(by='support', ascending=False).head(20)

In [None]:
# One row per feature, so the row count is the size of the vocabulary
print("Number of Features", len(dfFeatures))

# (i) DISCUSSION: How to clean up the features?

## Load Feature Vector

In [None]:
# Reload the feature table that was written to the output directory.
# index_col=0: the first column is the row index written by to_excel(); reading
#   it as the index avoids a spurious 'Unnamed: 0' data column.
# dtype / keep_default_na: keep every feature as text, so that words such as
#   'NA', 'null' or 'nan' are not turned into missing values.
dfExcelWorkbook = pd.read_excel(
    outDirectory + fnFeaturesAll,
    sheet_name=None,
    index_col=0,
    dtype={'feature': str},
    keep_default_na=False,
)
sheets = list(dfExcelWorkbook.keys())
dfFeatures = dfExcelWorkbook[sheets[0]]

In [None]:
# Summary of the reloaded feature table, followed by its first 20 rows
nFeatures = dfFeatures.shape[0]
print('Feature Stats:')
print('Number of Features:', nFeatures)
dfFeatures.head(20)

## Feature Selection

In [None]:
# Order the features from most to least frequent; show the 10 most frequent
dfFeatureVector = dfFeatures.sort_values('support', ascending=False)
dfFeatureVector.head(n=10)

## Top-10 Negatives

In [None]:
# Ascending sort on 'sentiment': the 10 features with the lowest sentiment value come first
dfFeatureVector.sort_values(by='sentiment', ascending=True).head(10)

## Top-10 Positives

In [None]:
# Descending sort on 'sentiment': the 10 features with the highest sentiment value come first
dfFeatureVector.sort_values(by='sentiment', ascending=False).head(10)

In [None]:
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# MODIFY THIS METHOD TO WIN
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# HINT: e.g., remove rare features, remove irrelevant features
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

def selectFeatures(dfFeatures):
    """Choose the features to keep and report how many there are.

    The baseline keeps every feature and returns the table unchanged.

    Args:
        dfFeatures: Feature table (one row per feature).

    Returns:
        The table of selected features.
    """
    nBefore = dfFeatures.shape[0]
    print("Number of features:", nBefore)

    # Baseline: no filtering. Example of a filter on the 'support' column:
    # selected = dfFeatures[dfFeatures.support > 10]
    selected = dfFeatures

    nAfter = selected.shape[0]
    print("Number of selected features:", nAfter)
    return selected
    
# Run the selection on the support-ordered feature table
dfSelectedFeatures = selectFeatures(dfFeatureVector)

# Prepare Trainingset

## Create Instances

An instance from a text is used to train a machine learning model or to classify the text. The instance is a vector representation of a text based on the given feature vector.

In [None]:
def createInstance(strText, dfFeatures=None):
    """Turn a text into a binary vector over the given features.

    Args:
        strText: The text to represent.
        dfFeatures: Table with a 'feature' column that defines the vector
            positions. Defaults to the module-level ``dfSelectedFeatures``,
            looked up at call time so a re-run of the selection is picked up.

    Returns:
        List with one entry per feature, in table order: 1 if the feature
        occurs in ``strText``, else 0.
    """
    if dfFeatures is None:
        dfFeatures = dfSelectedFeatures

    # NOTE(review): this is a substring test, so the feature 'good' also
    # matches 'goodness'; confirm whether whole-token matching is intended.
    return [1 if str(feature) in strText else 0 for feature in dfFeatures['feature']]

In [None]:
# One binary feature vector and one label per feedback, in dataset order
trainSet = []
trainSetLabels = []
for index, row in dfData.iterrows():
    feedbackText = str(row['FEEDBACK'])
    trainSet.append(createInstance(feedbackText))
    trainSetLabels.append(row['RATING'])

In [None]:
# Check result: display the feature vector of the first training instance
trainSet[0:1]

## Feature Selection

In [None]:
#X2TrainInstances = SelectKBest(chi2, k=10).fit_transform(trainSet, trainSetLabels)

# Evaluation

In [None]:
# Sparse copy of the training set; only the commented-out alternatives below use it
m = dok_matrix(trainSet)

# Decision Tree (the active classifier), limited to a depth of 5
from sklearn.tree import DecisionTreeClassifier
myDecisionTree = DecisionTreeClassifier(max_depth=5)
classifier = myDecisionTree.fit(trainSet,trainSetLabels)

# Alternative classifiers: uncomment one block to replace `classifier`.

# Random Forest
# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier(n_estimators=20).fit(trainSet,trainSetLabels)

# Naive Bayes (NB)
# classifier = GaussianNB().fit(trainSet,trainSetLabels)

# ADA Boost
#from sklearn.ensemble import AdaBoostClassifier
#classifier = AdaBoostClassifier(n_estimators=200).fit(m,trainSetLabels)

# Support Vector Machines (SVM)
# NOTE: SVC is not imported in this notebook; add `from sklearn.svm import SVC` first
# classifier = SVC().fit(m,trainSetLabels)

# Neural Network (NN)
#from sklearn.neural_network import MLPClassifier
#classifier = MLPClassifier(alpha=1).fit(trainSet,trainSetLabels)


## Quick Analysis

In [None]:
# Score on the same data the classifier was fitted on; the cross-validation
# cell evaluates on held-out folds instead.
classifier.score(trainSet,trainSetLabels)

In [None]:
# cross_validate scores several metrics from ONE set of fitted folds. Four
# separate cross_val_score runs would refit the unseeded tree four times and
# report metrics that come from different models.
from sklearn.model_selection import cross_validate

cvResults = cross_validate(
    myDecisionTree, trainSet, trainSetLabels, cv=10,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
)

# Accuracy is the default score of a classifier
score = cvResults['test_accuracy']
print('Score per fold:', score)
print('Avg. Score:', score.mean())

Precision_score = cvResults['test_precision']
print('Precision:', Precision_score.mean())
Recall_score = cvResults['test_recall']
print('Recall:', Recall_score.mean())
F1_score = cvResults['test_f1']
print('F1:', F1_score.mean())



# Sentiment Analysis

In [None]:
# Fit the decision tree on the complete training set for the error analysis below
classifier = myDecisionTree.fit(trainSet,trainSetLabels)

# Alternative classifiers: uncomment one block (and its import in the
# training cell) to replace `classifier`.

# Naive Bayes (NB)
#classifier = GaussianNB().fit(trainSet,trainSetLabels)

# Support Vector Machines (SVM)
# NOTE: requires `from sklearn.svm import SVC`, which this notebook does not import
# svm = SVC()
# classifier = svm.fit(m,trainSetLabels)

# ADA Boost
# adaBoost = AdaBoostClassifier(n_estimators=200)
# classifier = adaBoost.fit(m,trainSetLabels)

# Random Forest
# randomForest = RandomForestClassifier(n_estimators=20)
# classifier = randomForest.fit(trainSet,trainSetLabels)

## Analyze Errors

In [None]:
# Print every feedback among the first 100 rows whose predicted sentiment
# differs from its rating, together with what the classifier predicted.
for index, row in dfData[0:100].iterrows():
    feedback = str(row['FEEDBACK'])
    instance = [createInstance(feedback)]
    predictedSentiment = classifier.predict(instance)

    # Correctly classified feedbacks are skipped
    if not (predictedSentiment != row['RATING']):
        continue

    sentiment = 'GOOD' if predictedSentiment == 1 else 'BAD'
    print("AI says", sentiment, ":\t",feedback)