# Sentiment

### MNB

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import pickle

In [2]:
%load_ext memory_profiler

In [3]:
%%memit
# %%time
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# artifacts under models/ that come from a trusted source.
print('loading transformer....')
# Load the vectorizer used by the MNB model from disk. The context manager
# closes the file handle as soon as unpickling finishes instead of leaking it.
filename = 'models/MNB_vect.sav'
with open(filename, 'rb') as file:
    MNB_vect = pickle.load(file)
print('loading transformer....done!')

print('loading MNB model (86% acuracy rate)....')
# Load the MNB model from disk (the banner reports 86% accuracy).
filename = 'models/MNB_model.sav'
with open(filename, 'rb') as file:
    MNB_model = pickle.load(file)
print('loading MNB model (86% acuracy rate)....done!')

loading transformer....
loading transformer....done!
loading MNB model (86% acuracy rate)....
loading MNB model (86% acuracy rate)....done!
peak memory: 479.59 MiB, increment: 393.19 MiB




In [4]:
def MNB_predict_sentiment(text):
    """
    Predict the sentiment class of a single piece of text with the MNB model.

    expect: a string of text
    modify: vectorize the string with the 'MNB_vect' transformer and classify
            the resulting document-term matrix with 'MNB_model'
    return: an int (1:'positive' ; 0:'negative')
    """
    # The transformer is given a batch of documents, so wrap the single string.
    raw_dtm = MNB_vect.transform([text])
    # predict() yields one label per document. Pick the only label explicitly:
    # calling int() on the whole array is deprecated by NumPy for ndim > 0.
    return int(MNB_model.predict(raw_dtm)[0])

In [5]:
# NOTE(review): this review reads as clearly positive ("I give it a MILLION STARS"),
# yet the saved output below is 0, which MNB_predict_sentiment documents as
# 'negative' - confirm the label encoding and the model before relying on it.
MNB_predict_sentiment("""This guy doesn't get comedy. Amy schumer is a great comedian, but her show is trying to shove "funny" down your throat so hard that it loses credibility. The same goes for the kroll show. Review is a fresh concept and Andy pulls off laughs without trying so hard you shudder from the douche chills. I felt like it had just enough painful awkwardness without going overboard and relying on it to carry the show.I honestly thought the show was gonna be dumb when i saw the previews but I laughed non stop through the whole first episode. If it was any other host I think the show would be a flop but Andy is a perfect fit and plays the part flawlessly IMO. I give it a MILLION STARS!!!!!!!
""")

0

### SVM

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
import pickle

In [7]:
%%memit
# %%time
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# artifacts under models/ that come from a trusted source.
print('loading transformer....')
# Load the vectorizer used by the SVM model from disk. The context manager
# closes the file handle as soon as unpickling finishes instead of leaking it.
filename = 'models/SVM_vect.sav'
with open(filename, 'rb') as file:
    SVM_vect = pickle.load(file)
print('loading transformer....done!')

print('loading SVM model (89% accuracy rate)....')
# Load the SVM model from disk (the banner reports 89% accuracy).
filename = 'models/SVM_model.sav'
with open(filename, 'rb') as file:
    SVM_model = pickle.load(file)
print('loading SVM model (89% accuracy rate)....done!')

loading transformer....
loading transformer....done!
loading SVM model (89% accuracy rate)....
loading SVM model (89% accuracy rate)....done!
peak memory: 796.86 MiB, increment: 410.19 MiB




In [8]:
def SVM_predict_sentiment(text):
    """
    Predict the sentiment class of a single piece of text with the SVM model.

    expect: a string of text
    modify: vectorize the string with the 'SVM_vect' transformer and classify
            the resulting document-term matrix with 'SVM_model'
    return: an int (1:'positive' ; 0:'negative')
    """
    # The transformer is given a batch of documents, so wrap the single string.
    raw_dtm = SVM_vect.transform([text])
    # predict() yields one label per document. Pick the only label explicitly:
    # calling int() on the whole array is deprecated by NumPy for ndim > 0.
    return int(SVM_model.predict(raw_dtm)[0])

In [9]:
# NOTE(review): this review reads as clearly positive ("I give it a MILLION STARS"),
# yet the saved output below is 0, which SVM_predict_sentiment documents as
# 'negative' - confirm the label encoding and the model before relying on it.
SVM_predict_sentiment(""" This guy doesn't get comedy. Amy schumer is a great comedian, but her show is trying to shove "funny" down your throat so hard that it loses credibility. The same goes for the kroll show. Review is a fresh concept and Andy pulls off laughs without trying so hard you shudder from the douche chills. I felt like it had just enough painful awkwardness without going overboard and relying on it to carry the show.I honestly thought the show was gonna be dumb when i saw the previews but I laughed non stop through the whole first episode. If it was any other host I think the show would be a flop but Andy is a perfect fit and plays the part flawlessly IMO. I give it a MILLION STARS!!!!!!!
""")

0

### Keras

In [10]:
# pip install h5py

In [11]:
# pip install keras

In [12]:
# pip install PyYAML

In [13]:
# pip install anvil-uplink

In [14]:
%%memit
import keras # for sentitment model
import tensorflow as tf  # to resolve the loading issue with different version of keras
from nltk.tokenize import RegexpTokenizer
import pickle # load transformers: 'tfidf_vectorizer' and 'selector'
import dill # load language model
from collections import defaultdict # to make language model work

Using TensorFlow backend.


peak memory: 836.18 MiB, increment: 153.30 MiB


In [15]:
%%memit
# Load the saved Keras sentiment model from its HDF5 file (reported accuracy: 92%).
# It is loaded through tf.keras, which the import cell notes is used to work
# around a loading issue between Keras versions.
print('loading keras model (92% accuracy)...')
model = tf.keras.models.load_model('models/kera_model_dropout_nn.h5')
print('loading keras model (92% accuracy)...done!')

# Print the layer-by-layer summary (layer types, output shapes, parameter counts).
model.summary()

loading keras model (92% accuracy)...
loading keras model (92% accuracy)...done!
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 20000)             0         
_________________________________________________________________
dense (Dense)                (None, 64)                1280064   
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 1,280,129
Trainable params: 1,280,129
Non-trainable params: 0
_________________________________________________________________
peak memory: 868.11 MiB, increment: 31.91 MiB


In [16]:
%%memit
# %%time
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# artifacts under models/ that come from a trusted source.
# Load the tfidf_vectorizer from disk. The context manager closes the file
# handle as soon as unpickling finishes instead of leaking it.
filename = 'models/tfidf_vectorizer.sav'
with open(filename, 'rb') as file:
    tfidf_vectorizer = pickle.load(file)

# Load the selector from disk.
filename = 'models/selector.sav'
with open(filename, 'rb') as file:
    selector = pickle.load(file)

peak memory: 1578.82 MiB, increment: 710.71 MiB


In [17]:
def keras_predict_sentiment(text):
    """
    Predict the sentiment class of a single piece of text with the Keras model.

    expect: a string of text
    modify: vectorize the string with 'tfidf_vectorizer', pass the result through
            'selector', and score the resulting dense float32 row with 'model'
    return: an int (1:'positive' ; 0:'negative')
    """
    # The transformer is given a batch of documents, so wrap the single string.
    raw_dtm = tfidf_vectorizer.transform([text])
    selected_dtm = selector.transform(raw_dtm).astype('float32')
    # Sequential.predict_classes() was removed from tf.keras in TensorFlow 2.6.
    # For a network with a single output unit (see the model summary: Dense -> 1)
    # it was defined as (predict(x) > 0.5).astype('int32'), so the same 0.5
    # cut-off is applied here to the one score of the one input row.
    scores = model.predict(selected_dtm.toarray())
    return int(scores[0][0] > 0.5)

In [18]:
# NOTE(review): this review reads as positive ("a refreshing winner", "andrew daly
# is perfect"), yet the saved output below is 0, which keras_predict_sentiment
# documents as 'negative' - confirm the label encoding and the model.
keras_predict_sentiment("""comedy central is a vast wasteland of unsuccessful tv shows based around standup comics, but "review" is a refreshing winner. reviewing life itself (and playing the inanities of life to the extreme) is a simple concept, but it works, and really just churns out one ridiculously uncomfortable moment after another. but it's from that discomfort that the big laughs arise. andrew daly is perfect for the slightly hapless forrest macneil, and his exasperated wife and wryly awkward cohost are my favorite parts of the show.""")

0

# Text Prediction

In [19]:
# pip install memory_profiler

In [20]:
%%memit
print('loading language model....')
# Unpickle the n-gram models with dill. Only the bigram and trigram models are
# loaded; the 4-gram and 5-gram loads below are left disabled.
# NOTE: like pickle, dill.load can run code stored in the file - trusted files only.
with open('models/bigram_model.p', 'rb') as file:
    bigram_model = dill.load(file)
with open('models/trigram_model.p', 'rb') as file:
    trigram_model = dill.load(file)
# with open('models/fourgram_model.p', 'rb') as file:
#     fourgram_model = dill.load(file)
# with open('models/fivegram_model.p', 'rb') as file:
#     fivegram_model = dill.load(file)

# Tokenizer used to split user text into words (pattern: runs of \w characters).
tokenizer = RegexpTokenizer(r'\w+')
print('loading language model....done!')

loading language model....
loading language model....done!
peak memory: 3205.05 MiB, increment: 1711.70 MiB


In [21]:
# function to predict the next word based on bigram model
def bigram_predict_next_word(word):
    '''
    Predict the most likely next word from the bigram model.

    word: a list of tokens (only the first one is used as context); a bare
          string is treated as a single token
    return: the continuation with the highest probability after that token, or
            None when there is no context token or the model has no
            continuation for it. Ties go to the first candidate in the
            model's iteration order.
    '''
    # A caller that hands over a plain string (the trigram back-off does) would
    # otherwise have word[0] look up only the first character of that token.
    if isinstance(word, str):
        word = [word]
    if not word:
        return None
    # .get() instead of indexing: if the model is a defaultdict (presumably - it
    # is unpickled with dill elsewhere in the notebook), indexing an unseen
    # token would insert it into the model as a side effect of the lookup.
    candidates = bigram_model.get(word[0])
    if not candidates:
        return None
    # max() returns the first key that reaches the top probability.
    return max(candidates, key=candidates.get)

In [22]:
# function to predict next word with trigram model
def trigram_predict_next_word(words):
    '''
    Predict the most likely next word from the trigram model, backing off to
    the bigram model when the trigram model has nothing for the context.

    words: a list of tokens; the first two form the trigram context and the
           last one is the context used for the bigram back-off
    return: the continuation with the highest probability (ties go to the first
            candidate in the model's iteration order), or whatever
            bigram_predict_next_word returns when backing off
    '''
    # .get() instead of indexing so an unseen context is not inserted into the
    # model if it is a defaultdict (presumably - confirm against the pickle).
    candidates = trigram_model.get((words[0], words[1]))
    if not candidates:
        # Back off with a one-element *list*: bigram_predict_next_word indexes
        # its argument, so passing the bare string would look up only the
        # first character of the last token.
        return bigram_predict_next_word(words[-1:])
    # max() returns the first key that reaches the top probability.
    return max(candidates, key=candidates.get)

In [23]:
def fourgram_predict_next_word(words):
    '''
    Predict the most likely next word from the 4-gram model, backing off to
    the trigram model when the 4-gram model is unavailable or has nothing for
    the context.

    words: a list of tokens; the first three form the 4-gram context and the
           last two are the context used for the trigram back-off
    return: the continuation with the highest probability (ties go to the first
            candidate in the model's iteration order), or whatever
            trigram_predict_next_word returns when backing off
    '''
    # The statements that load 'fourgram_model' are commented out in this
    # notebook, so the name may not exist. Look it up defensively and back off
    # rather than raising NameError.
    lookup = globals().get('fourgram_model')
    candidates = None
    if lookup is not None:
        # .get() avoids inserting unseen contexts if the model is a defaultdict.
        candidates = lookup.get((words[0], words[1], words[2]))
    if not candidates:
        return trigram_predict_next_word(words[-2:])
    # max() returns the first key that reaches the top probability.
    return max(candidates, key=candidates.get)

In [24]:
# function to predict next word with 5-gram model
def fivegram_predict_next_word(words):
    '''
    Predict the most likely next word from the 5-gram model, backing off to
    the 4-gram predictor when the 5-gram model is unavailable or has nothing
    for the context.

    words: a list of tokens; the first four form the 5-gram context and the
           last three are the context used for the 4-gram back-off
    return: the continuation with the highest probability (ties go to the first
            candidate in the model's iteration order), or whatever
            fourgram_predict_next_word returns when backing off
    '''
    # The statements that load 'fivegram_model' are commented out in this
    # notebook, so the name may not exist. Look it up defensively and back off
    # rather than raising NameError.
    lookup = globals().get('fivegram_model')
    candidates = None
    if lookup is not None:
        # .get() avoids inserting unseen contexts if the model is a defaultdict.
        candidates = lookup.get((words[0], words[1], words[2], words[3]))
    if not candidates:
        return fourgram_predict_next_word(words[-3:])
    # max() returns the first key that reaches the top probability.
    return max(candidates, key=candidates.get)

In [25]:
def ngram_prediction(text):
    """
    expect: a raw string of text
    modify: tokenize the string and use the number of tokens to decide which model to start with.
                1. if the string has 4 tokens or above, take the last four tokens as input to fivegram_predict_next_word
                2. if the string has 3 tokens, take the list of tokens as input to fourgram_predict_next_word
                3. if the string has 2 tokens, take the list of tokens as input to trigram_predict_next_word
                4. if the string has 1 token, take the list of tokens as input to bigram_predict_next_word
    return: predicted word, or None when the text contains no tokens (or the
            chosen predictor finds no continuation)
    """
    # tokenize the words
    user_tokens = tokenizer.tokenize(text)
    token_count = len(user_tokens)
    if token_count >= 4:
        # Only the last four tokens are the context. The previous bare `except:`
        # retried with the whole token list, which made the predictor key on the
        # FIRST four tokens of the text and also swallowed KeyboardInterrupt;
        # real failures now surface instead of yielding a wrong-context word.
        return fivegram_predict_next_word(user_tokens[-4:])
    if token_count == 3:
        return fourgram_predict_next_word(user_tokens)
    if token_count == 2:
        return trigram_predict_next_word(user_tokens)
    if token_count == 1:
        return bigram_predict_next_word(user_tokens)
    # No tokens at all: nothing to predict from.
    return None
   

In [26]:
# for AWS EC2
def make_prediction(text):
    """
    expect: a raw string of text
    modify: tokenize the string and use the number of tokens to decide which model to start with.
            1. if the string has 2 tokens or above, take the last two tokens as input to trigram_predict_next_word
            2. if the string has 1 token, take the list of tokens as input to bigram_predict_next_word
    return: predicted word, or None when the text contains no tokens (or the
            chosen predictor finds no continuation)
    """
    # tokenize the words
    user_tokens = tokenizer.tokenize(text)
    token_count = len(user_tokens)
    if token_count >= 2:
        # Only the last two tokens are the context. The previous bare `except:`
        # retried with the whole token list, which made the predictor key on the
        # FIRST two tokens of the text and also swallowed KeyboardInterrupt;
        # real failures now surface instead of yielding a wrong-context word.
        return trigram_predict_next_word(user_tokens[-2:])
    if token_count == 1:
        return bigram_predict_next_word(user_tokens)
    # No tokens at all: nothing to predict from.
    return None
    

In [27]:
# Record the Python interpreter version this notebook was run with.
from platform import python_version
print(python_version())

3.7.3


# Anvil

In [28]:
# %%memit
# import anvil.server

# anvil.server.connect(os.environ["ANVIL_UPLINK_KEY"])  # uplink key redacted: read it from the environment and rotate the key that was committed here
# print('final step is running')

In [29]:
# # @anvil.server.callable
# # # Put the function that will be used in anvil client server (web)
# # # This block of code should be running forever
# # # The @anvil.server.callable decorator can only take one function

# def app_prediction(text):
#     """
#     expect: a string of text
#     modify: send the string to two functions:
#                 1. make_prediction
#                 2. keras_predict_sentiment
#     return: a tuple that contains a predicted word and a predicted class
#     """
#     ## Text Prediction
#     pred_word = make_prediction(text)
    
#     ## Sentiment Prediction
#     pred_class = keras_predict_sentiment(text)
        
#     return pred_word, pred_class

# # anvil.server.wait_forever()