# Building a Sentiment Model using ESPPy

![image.png](http://sww.sas.com/saspediawiki/images/4/41/Esppy_pipeline_amazon_review.png)

In [None]:
# Clean-up cell: asks the server connection to delete projects (presumably all
# of them, since no project name is passed - confirm before running this
# against a shared server).
# NOTE(review): `esp` is only created in the connection cell further down
# (`esp = esppy.ESP(...)`); run that cell first or this raises NameError.
esp.delete_projects()

In [None]:
# Show the projects the server currently reports; the notebook displays the
# returned value as the cell output.
# NOTE(review): like the cell above, this needs `esp` from the connection cell.
esp.get_projects()

In [None]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.metrics import precision_score, recall_score, accuracy_score
from matplotlib import pyplot as plt
%matplotlib inline

# Optional: put a local checkout of the client on the import path.
# import sys
# sys.path.insert(0, './python-esp')
import esppy

# Connection settings for this notebook.
ESP_HOST = 'espserver'
PROJECT_NAME = 'amazon_reviews'

# Connect to the ESP server, then create the project object locally; the
# windows built in the cells below are attached to `proj`.
esp = esppy.ESP(ESP_HOST)
proj = esp.create_project(PROJECT_NAME)

## Load and preprocess data

In [None]:
# Both CSV files are read without a header row and share one column layout.
review_columns = ["id", "title", "content", "rank", "sentiment"]

# Training reviews.
train_data = pd.read_csv('reviews_train_5000.csv', header=None, names=review_columns)

# Reviews held back for scoring.
score_data = pd.read_csv('reviews_test_1000.csv', header=None, names=review_columns)

In [None]:
# Preview the first rows of the training table (shown as the cell output).
train_data.head()

In [None]:
# Histogram of the raw sentiment labels, used to inspect the class balance.
train_data['sentiment'].hist()

Clearly there is an imbalance in the training set. In order to build a generalizable model, sample the data for each stratum and build a new table. 

In [None]:
seed = 1234
n_samples = 5000
train_ratio = 0.5

# Rows to draw per class: 2500 positive and 2500 negative with these settings.
n_pos = int(n_samples * train_ratio)
n_neg = int(n_samples * (1 - train_ratio))

# Boolean masks selecting each sentiment class in the training table.
is_positive = train_data['sentiment'] == 1.0
is_negative = train_data['sentiment'] == 0.0

# Positive reviews: a random sample drawn without replacement.
train_data_sample_pos = train_data.loc[is_positive].sample(n_pos, random_state=seed)

# Negative reviews: sampled with replacement, which produces duplicate rows
# but artificially increases the share of the rare class.
train_data_sample_neg = train_data.loc[is_negative].sample(n_neg, replace=True,
                                                           random_state=seed)

# Stack the two samples and shuffle the rows so the classes are interleaved.
train_data_sample = pd.concat([train_data_sample_pos, train_data_sample_neg])
train_data_sample_shuffled = shuffle(train_data_sample, random_state=seed)

In [None]:
# Histogram of the sentiment labels after resampling, to check the new
# class balance.
train_data_sample_shuffled['sentiment'].hist()
plt.show()

In [None]:
# Display the resampled, shuffled training table as the cell output.
train_data_sample_shuffled

## Project Construction

### 1. Create source and calculate windows for data streams
All event streams must enter continuous queries by being published or injected into a source window. Event streams cannot be published or injected into any other window type. Here the goal is to add a source window to the project that gives access to the streaming data, preprocess that data, and produce a stream from which sentiment can be extracted.

In [None]:
# Which columns are needed to determine sentiment? Look at the table again
# before choosing the fields of the source window schema.
train_data.head()

In [None]:
# Evaluate the TextVectorization calculate-window class so the notebook
# displays it (nothing is instantiated here).
esp.calculate.TextVectorization

In [None]:
# Source window that receives the training reviews.
src = esp.SourceWindow(
    schema=('id*:int64', 'content:string', 'sentiment:string'),
    index_type='empty',
    insert_only=True,
    autogen_key=True,
)

# Calculate window that tokenizes the review text.
# Each token becomes its own event, so the output looks like |id|tid|word.
tok_input_map = dict(docId='id', doc='content')
tok_output_map = dict(docIdOut='id*', tokenIdOut='tid*', wordOut='word')
tok = esp.calculate.Tokenization(
    schema=('id*:int64', 'tid*:int64', 'word:string'),
    input_map=tok_input_map,
    output_map=tok_output_map,
)

# Calculate window that turns the tokens into a vector representation,
# optionally filtering tokens through a start list and a stop list.
vec_input_map = dict(docId='id', token='word')
vec_output_map = dict(docIdOut='id*', vectorOut='vector[1-200]')
vec = esp.calculate.TextVectorization(
    schema=('id*:int64', 'vector:array(dbl)'),
    wordVec='/root/glove.6B.200d.txt',
    startList='/root/pos_neg_words.txt',
    stopList='/root/stop-words.txt',
    wordVecDelimiter='SPACE',
    outputDocVec=1,
    input_map=vec_input_map,
    output_map=vec_output_map,
)

# Inner join of the source stream with the document vector of each row,
# keeping only sentiment, content and the word vector.
jn = esp.JoinWindow(type='inner')
jn.add_condition(left='id', right='id')
for out_field, joined_field in (('sentiment', 'l_sentiment'),
                                ('content', 'l_content'),
                                ('vector', 'r_vector')):
    jn.add_field_selection(name=out_field, source=joined_field)

# Register the windows with the project (same order as they were created).
for win_name, win in (('w_data', src), ('w_tok', tok),
                      ('w_vec', vec), ('w_join', jn)):
    proj.windows[win_name] = win

# Wire the windows into a pipeline. The join gets the source window first
# and the vectorizer second; keep that order.
for upstream, downstream in ((src, tok), (tok, vec), (src, jn), (vec, jn)):
    upstream.add_target(downstream, role='data')

# Plot the graph of the project so far.
proj.to_graph()

The test set must be preprocessed in the same way as the training set, so we create a second source window together with its own calculate and join windows. This forms a parallel pipeline that feeds into the project later in the graph.

In [None]:
# Second source window: same schema as `src`, feeding the parallel pipeline
# whose join window is later connected to the scoring windows.
src2 = esp.SourceWindow(
    schema=('id*:int64', 'content:string', 'sentiment:string'),
    index_type='empty',
    insert_only=True,
    autogen_key=True,
)

# Calculate window that tokenizes the review text.
# Each token becomes its own event, so the output looks like |id|tid|word.
tok2_input_map = dict(docId='id', doc='content')
tok2_output_map = dict(docIdOut='id*', tokenIdOut='tid*', wordOut='word')
tok2 = esp.calculate.Tokenization(
    schema=('id*:int64', 'tid*:int64', 'word:string'),
    input_map=tok2_input_map,
    output_map=tok2_output_map,
)

# Calculate window that turns the tokens into a vector representation,
# optionally filtering tokens through a start list and a stop list.
vec2_input_map = dict(docId='id', token='word')
vec2_output_map = dict(docIdOut='id*', vectorOut='vector[1-200]')
vec2 = esp.calculate.TextVectorization(
    schema=('id*:int64', 'vector:array(dbl)'),
    wordVec='/root/glove.6B.200d.txt',
    startList='/root/pos_neg_words.txt',
    stopList='/root/stop-words.txt',
    wordVecDelimiter='SPACE',
    outputDocVec=1,
    input_map=vec2_input_map,
    output_map=vec2_output_map,
)

# Inner join of the second source stream with its document vectors,
# keeping only sentiment, content and the word vector.
jn2 = esp.JoinWindow(type='inner')
jn2.add_condition(left='id', right='id')
for out_field, joined_field in (('sentiment', 'l_sentiment'),
                                ('content', 'l_content'),
                                ('vector', 'r_vector')):
    jn2.add_field_selection(name=out_field, source=joined_field)

# Register the windows with the project (same order as they were created).
for win_name, win in (('w_data2', src2), ('w_tok2', tok2),
                      ('w_vec2', vec2), ('w_join2', jn2)):
    proj.windows[win_name] = win

# Wire the windows into a pipeline. The join gets the source window first
# and the vectorizer second; keep that order.
for upstream, downstream in ((src2, tok2), (tok2, vec2), (src2, jn2), (vec2, jn2)):
    upstream.add_target(downstream, role='data')

# Plot the graph of the project so far.
proj.to_graph()

The result is a project with two separate streams that perform essentially the same task, albeit on different source data. The data is now in the correct format to train a model. Two models will be used - a logistic regression and a support vector machine. 

### 2. Create training and scoring windows for logistic regression and SVM

Training windows provide the ability to train machine learning algorithms on streams of data. Models used in this fashion are considered online models. Both training from scratch and sequentially updating the model on streams of data are possible. In this case the models will be trained as data is fed into the system. 

Scoring windows take the learned model and input data to generate predictions. This will add a column of predicted values to the output schema. 

**Logistic Regression** 

In [None]:
# Online logistic regression training window.
#   inputs / target - independent and dependent variables
#   nInit           - number of events used to initialize
#   commitInterval  - number of events to process before committing a model
#                     to downstream scoring
#   numC            - number of regularization parameters to try
#   ratioC          - ratio used when setting the set of regularization
#                     parameters
lr_train_input_map = dict(inputs=['vector[1-200]', 'sentiment'], target='sentiment')
train_lr = esp.train.LogisticRegression(
    input_map=lr_train_input_map,
    nInit=1500,
    commitInterval=500,
    numC=20,
    ratioC=2.5,
)

# Register the training window and feed it the joined training stream.
proj.windows['w_train_lr'] = train_lr
jn.add_target(train_lr, role='data')

# Scoring window; the output predicted_y is P(sentiment=1).
lr_score_schema = ['id*:int64', 'content:string', 'sentiment:string',
                   'vector:array(dbl)', 'predicted_y:double', 'model_id:int64']
score_lr = esp.score.LogisticRegression(
    input_map=dict(inputs=['vector[1-200]']),
    output_map=dict(yPredictOut='predicted_y', modelIdOut='model_id'),
    schema=lr_score_schema,
)

# Register the scoring window.
proj.windows['w_score_lr'] = score_lr

# Data to score comes from the second join window ...
jn2.add_target(score_lr, role='data')

# ... and the model comes from the training window (scoring has to use the
# model learned there).
train_lr.add_target(score_lr, role='model')

# Display the project.
esppy.options.display.image_scale = 0.65
proj

**SVM**

Using same esp model parameters and identical graph as above

In [None]:
# Online SVM training window. It uses the same hyper-parameters as the
# logistic regression training window:
#   inputs / target - independent and dependent variables
#   nInit           - number of events used to initialize
#   commitInterval  - number of events to process before committing a model
#                     to downstream scoring
#   numC            - number of regularization parameters to try
#   ratioC          - ratio used when setting the set of regularization
#                     parameters
train_svm = esp.train.SVM(input_map=dict(inputs=['vector[1-200]', 'sentiment'],
                                         target='sentiment'),
                          nInit=1500, commitInterval=500,
                          numC=20, ratioC=2.5)
proj.windows['w_train_svm'] = train_svm

# Train on the joined training stream.
jn.add_target(train_svm, role='data')

# Scoring window. `inputs` is passed as a list, the same form the logistic
# regression scoring window uses, instead of a bare string.
score_svm = esp.score.SVM(input_map=dict(inputs=['vector[1-200]']),
                          output_map=dict(yPredictOut='predicted_y', modelIdOut='model_id'),
                          schema=['id*:int64', 'content:string', 'sentiment:string',
                                  'vector:array(dbl)', 'predicted_y:double', 'model_id:int64'])

proj.windows['w_score_svm'] = score_svm

# Data to score comes from the second join window ...
jn2.add_target(score_svm, role='data')

# ... and the model comes from the SVM training window.
train_svm.add_target(score_svm, role='model')

# Display the project.
proj

### 3. Online model fit statistics
This is treated as a binary classification problem where sentiment is either 1 (for happy) or 0 (for sad). The output prediction is the probability that an event is 1, so anything above 0.5 is predicted to be happy; otherwise it is predicted to be sad. The performance of both models is measured by MCE. 

To output fit statistics you can use a compute window and a fitstat window. The compute window is used to one-hot encode the target variable and generate, on a per event basis, the probability for both class predictions P(1) and P(0). The fitstat window is used to calculate the MCE from these probabilities and the ground truth label. 

**Logistic Regression**

In [None]:
# Compute window that derives the per-class probabilities from the
# logistic regression score.
comp_lr_schema = ['id*:int64', 'sentiment:string',
                  'predicted_y:double', 'p_1:double', 'p_0:double']
comp_lr = esp.ComputeWindow(schema=comp_lr_schema)

# Field expressions, added in this order:
#   true label rewritten as an integer string, the raw prediction,
#   then the prediction again and its complement.
# NOTE(review): these appear to line up with the non-key schema fields
# (sentiment, predicted_y, p_1, p_0) in order - confirm.
for expression in ("tostring(tointeger(sentiment))",
                   "predicted_y",
                   "predicted_y",
                   "1-predicted_y"):
    comp_lr.add_field_expression(expression)

# FitStat window that calculates MCE over the class probabilities and the
# true label.
fitstat_lr = esp.calculate.FitStat(
    schema=('id*:int64', 'mceOut:double'),
    classLabels='0,1',
    windowLength=200,
)
fitstat_lr.set_inputs(inputs=('p_0:double', 'p_1:double'),
                      response='sentiment:string')
fitstat_lr.set_outputs(mceOut='mceOut:double')

# Register both windows, then chain score -> compute -> fitstat.
proj.windows['w_comp_lr'] = comp_lr
proj.windows['w_fitstat_lr'] = fitstat_lr
score_lr.add_target(comp_lr, role='data')
comp_lr.add_target(fitstat_lr, role='data')

# Display the project.
esppy.options.display.image_scale = 0.45
proj

**SVM**

In [None]:
# Compute window for the SVM score. Unlike the logistic regression case,
# predicted_y is passed through a logistic function to obtain p_1, and
# p_0 is its complement.
comp_svm_schema = ['id*:int64', 'sentiment:string',
                   'predicted_y:double', 'p_1:double', 'p_0:double']
comp_svm = esp.ComputeWindow("w_comp_svm", schema=comp_svm_schema)

# 1 / (1 + e^-y), with e written out as a numeric constant.
p_1_expression = """
1/(1+pow(2.718281828, -predicted_y))
"""
# Complement of the expression above.
p_0_expression = """
1-1/(1+pow(2.718281828, -predicted_y))
"""

# Field expressions, added in this order: true label as an integer string,
# raw prediction, p_1, p_0.
comp_svm.add_field_expression("tostring(tointeger(sentiment))")
comp_svm.add_field_expression("predicted_y")
comp_svm.add_field_expression(p_1_expression)
comp_svm.add_field_expression(p_0_expression)

# FitStat window that calculates MCE over the class probabilities and the
# true label.
fitstat_svm = esp.calculate.FitStat(
    schema=('id*:int64', 'mceOut:double'),
    classLabels='0,1',
    windowLength=200,
)
fitstat_svm.set_inputs(inputs=('p_0:double', 'p_1:double'),
                       response='sentiment:string')
fitstat_svm.set_outputs(mceOut='mceOut:double')

# Register both windows, then chain score -> compute -> fitstat.
proj.windows['w_comp_svm'] = comp_svm
proj.windows['w_fitstat_svm'] = fitstat_svm
score_svm.add_target(comp_svm, role='data')
comp_svm.add_target(fitstat_svm, role='data')

# Display the project.
esppy.options.display.image_scale = 0.45
proj

In [None]:
# Join the two fitstat windows on id so both MCE series can be plotted
# together: mce_lr from the left input, mce_svm from the right input.
jn3 = esp.JoinWindow(type='inner')
jn3.add_condition(left='id', right='id')
for out_field, joined_field in (('mce_lr', 'l_mceOut'),
                                ('mce_svm', 'r_mceOut')):
    jn3.add_field_selection(name=out_field, source=joined_field)

proj.windows['w_join3'] = jn3

# Logistic regression is connected first, SVM second; keep that order.
for fitstat_window in (fitstat_lr, fitstat_svm):
    fitstat_window.add_target(jn3, role='data')

#print(proj.to_xml(pretty=True))
esppy.options.display.image_scale = 0.4
proj

In [None]:
# Hand the locally built project over to the ESP server connection.
# The publish/subscribe calls in the next section come after this step.
esp.load_project(proj)

### 4. Publish and Subscribe to windows
ESP windows follow a pub/sub model: some windows (such as source windows) are published to, while other windows are subscribed to in order to see their output. In this project the data is streamed into the source window (w_data) by publishing, and the remaining windows can be subscribed to. 

In [None]:
# esp.load_project(proj)

# Attach the subscribers BEFORE any event is published. Publishing first
# (as this cell originally did) means events injected before a window is
# subscribed may never reach that subscriber.
for window in (src, tok, vec, jn,
               train_lr, score_lr,
               train_svm, score_svm,
               comp_lr, comp_svm,
               fitstat_lr, fitstat_svm,
               jn3):
    window.subscribe()

# Stream the balanced, shuffled training data into the source window.
# NOTE(review): `score_data` is not published to `src2` anywhere in the
# cells visible here; the scoring branch only produces output once that
# happens - confirm it is done in a later cell.
src.publish_events(train_data_sample_shuffled, pause=15)

In [None]:
# The following lines display the output of the subscribed windows;
# uncomment the one you want to inspect.

# src.tail(10)
# tok.tail(10)
vec.head()
# jn.tail(10)

# pd.set_option('display.max_colwidth', -1)
# train_lr.tail(10) # a list of published logistic regression models

# pd.set_option('display.max_colwidth', -1)

# train_svm.tail(10) # a list of published svm models