### Exercise 4.2: Sentiment Analysis - Extra

#### DSC 550

Taniya Adhikari 1/10/2021

In [666]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode
import unicodedata
import nltk
import string

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
import textblob

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from sklearn.cluster import KMeans

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\taniy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Text Pre-processing before modeling for sentiment analysis

In [667]:
def pre_processing(df):
    # converting all text to lowercase
    df["comments"] = df["comments"].str.lower()
    
    # removing punctuation using string.punctuations and join()
    df["comments"] = df["comments"].apply(lambda x: "".join([i for i in x if i not in string.punctuation]))
    
    # remove stop words
    stop = stopwords.words('english')

    df['stopwords'] = df["comments"].apply(lambda x: len([i for i in x.split() if i in stop]))
    df["comments"]= df["comments"].apply(lambda x: " ".join(i for i in x.split() if i not in stop))
    df['stopwords'] = df['comments'].apply(lambda x: len([i for i in x.split() if i in stop]))
    
    # stemming
    porter = PorterStemmer()
    df["comments"] = df["comments"].apply(lambda x: " ".join([porter.stem(word) for word in x.split()]))
    return df

# converting string into list of words for each comment
def text_to_list(text):
    text = unidecode(text)
    pattern = r'[^A-Za-z ]'
    regex = re.compile(pattern)
    text = regex.sub('', text).split(' ')
    return text

In [668]:
df = pd.read_csv('stockerbot-export.csv')
df = df.dropna().drop_duplicates().reset_index(drop=True).rename(columns={'text':'comments'})
df1 = df.sample(n = 20)
df2 = df1.copy()
df2 = pre_processing(df2)
df2

Unnamed: 0,source,comments,symbols,stopwords
14583,EnterpriseLeade,allianc data system ad research coverag start ...,ADS,0
19670,dakotafinancial,comerica cma upgrad outperform evercor isi htt...,CMA,0
18675,SeekingAlpha,amzn bbbi httpstco6oyrozzxzx,BBBY,0
20508,PotentTrading,top gapper higher volum eric nok ms mu,NOK,0
12196,MarketBeatNews,san francisco consid tax compani help homeless...,MCK,0
21923,Sumi57417058,rt 420invest 🍁420invest🍁 fund marijuana opport...,ABBV,0
16364,KarenMccaygirl,rt reuter ibm ask us juri award 167 million la...,GRPN,0
6725,ZolmaxNews,offic depot inc odp short interest updat https...,ODP,0
13600,dakotafinancial,pnc financi servic group inc forecast post fy2...,PNC,0
10617,mmahotstuff1,edison intern eix analyst see 091 ep httpstcow...,EIX,0


In [679]:
clean_df = df2.copy()
clean_df.comments = clean_df.comments.apply(lambda x: text_to_list(x))
clean_df.count()

source       20
comments     20
symbols      20
stopwords    20
dtype: int64

In [680]:
#extraction of phrases and bigram model
sent = [row for row in clean_df.comments]
phrases = Phrases(sent, min_count=1, progress_per=7)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[19]

INFO - 14:05:30: collecting all words and their counts
INFO - 14:05:30: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 14:05:30: PROGRESS: at sentence #7, processed 76 words and 139 word types
INFO - 14:05:30: PROGRESS: at sentence #14, processed 144 words and 258 word types
INFO - 14:05:30: collected 378 word types from a corpus of 214 words (unigram + bigrams) and 20 sentences
INFO - 14:05:30: using 378 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 14:05:30: source_vocab length 378
INFO - 14:05:30: Phraser built with 1 phrasegrams


['aaoi', 'dip']

In [681]:
w2v_model = Word2Vec(min_count=1, size=2)

# building vocab
w2v_model.build_vocab(sentences, progress_per=20)

INFO - 14:05:31: collecting all words and their counts
INFO - 14:05:31: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 14:05:31: collected 183 word types from a corpus of 205 raw words and 20 sentences
INFO - 14:05:31: Loading a fresh vocabulary
INFO - 14:05:31: effective_min_count=1 retains 183 unique words (100% of original 183, drops 0)
INFO - 14:05:31: effective_min_count=1 leaves 205 word corpus (100% of original 205, drops 0)
INFO - 14:05:31: deleting the raw counts dictionary of 183 items
INFO - 14:05:31: sample=0.001 downsamples 183 most-common words
INFO - 14:05:31: downsampling leaves estimated 124 word corpus (60.6% of prior 205)
INFO - 14:05:31: estimated required memory for 183 words and 2 dimensions: 94428 bytes
INFO - 14:05:31: resetting layer weights


In [682]:
# training the model
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=4, report_delay=1)
w2v_model.init_sims(replace=True)

INFO - 14:05:31: training model with 3 workers on 183 vocabulary and 2 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
INFO - 14:05:31: worker thread finished; awaiting finish of 2 more threads
INFO - 14:05:31: worker thread finished; awaiting finish of 1 more threads
INFO - 14:05:31: worker thread finished; awaiting finish of 0 more threads
INFO - 14:05:31: EPOCH - 1 : training on 205 raw words (126 effective words) took 0.0s, 36974 effective words/s
INFO - 14:05:31: worker thread finished; awaiting finish of 2 more threads
INFO - 14:05:31: worker thread finished; awaiting finish of 1 more threads
INFO - 14:05:31: worker thread finished; awaiting finish of 0 more threads
INFO - 14:05:31: EPOCH - 2 : training on 205 raw words (139 effective words) took 0.0s, 58868 effective words/s
INFO - 14:05:31: worker thread finished; awaiting finish of 2 more threads
INFO - 14:05:31: worker thread finished; awaiting finish of 1 more threads
INFO - 14:05:31: worker thread finished; await

In [683]:
print(w2v_model)

Word2Vec(vocab=183, size=2, alpha=0.025)


In [684]:
w2v_model.save("word2vec2.model")

INFO - 14:05:37: saving Word2Vec object under word2vec2.model, separately None
INFO - 14:05:37: not storing attribute vectors_norm
INFO - 14:05:37: not storing attribute cum_table
INFO - 14:05:37: saved word2vec2.model


In [685]:
word_vectors = Word2Vec.load("word2vec2.model").wv

INFO - 14:05:38: loading Word2Vec object from word2vec2.model
INFO - 14:05:38: loading wv recursively from word2vec2.model.wv.* with mmap=None
INFO - 14:05:38: setting ignored attribute vectors_norm to None
INFO - 14:05:38: loading vocabulary recursively from word2vec2.model.vocabulary.* with mmap=None
INFO - 14:05:38: loading trainables recursively from word2vec2.model.trainables.* with mmap=None
INFO - 14:05:38: setting ignored attribute cum_table to None
INFO - 14:05:38: loaded word2vec2.model


In [699]:
# k-means clustering to create sentiment dictionary for each word
model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=2).fit(X=word_vectors.vectors.astype("double"))

In [700]:
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=20, restrict_vocab=None)

[('ms', 0.999941885471344),
 ('marathon', 0.999916672706604),
 ('depot', 0.9993242621421814),
 ('fuel', 0.9989069104194641),
 ('squeez', 0.9967518448829651),
 ('gene', 0.9948847889900208),
 ('ko', 0.9946352243423462),
 ('award', 0.9944318532943726),
 ('mtsl', 0.9880948066711426),
 ('energi', 0.9870859384536743),
 ('mro', 0.9867139458656311),
 ('process', 0.9862383604049683),
 ('seen', 0.984703779220581),
 ('pnc', 0.9828438758850098),
 ('bmo', 0.9781846404075623),
 ('schedul', 0.9776513576507568),
 ('juri', 0.9754140973091125),
 ('timestamp', 0.9745939373970032),
 ('odp', 0.9716524481773376),
 ('gapper', 0.9616198539733887)]

In [701]:
positive = model.cluster_centers_[0]
negative = model.cluster_centers_[1]

In [702]:
words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])

In [703]:
words['cluster_value'] = [1 if i==0 else -1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

words

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,allianc,"[0.6164918, -0.7873613]",0,1,0.886811,0.886811
1,data,"[0.68229717, -0.7310749]",0,1,0.931173,0.931173
2,system,"[-0.4534326, -0.89129055]",1,-1,1.416431,-1.416431
3,ad,"[-0.90975183, 0.41515255]",1,-1,1.467843,-1.467843
4,research,"[0.98846287, 0.15146352]",0,1,2.352660,2.352660
...,...,...,...,...,...,...
178,less,"[-0.6440829, 0.76495564]",1,-1,1.025363,-1.025363
179,seller,"[0.65319747, 0.75718755]",0,1,2.384190,2.384190
180,httpstcoedeswcykz,"[0.5230623, 0.8522945]",0,1,1.926815,1.926815
181,aaoi,"[-0.87779075, -0.47904414]",1,-1,2.785060,-2.785060


In [704]:
def sentiment_analyzer(wordlist):
    cluster_count = {}
    for w in wordlist:
        x = cluster_freq(w, words)
        if x in cluster_count:
            cluster_count[x] = cluster_count[x] + 1
        else:
            cluster_count[x] = 1
    sentiment = ""              
    if '-1' in cluster_count.keys() and '1' in cluster_count.keys():
        pos = cluster_count.get('1')
        neg = cluster_count.get('-1')
        if pos >= neg:
            sentiment = "Positive"
        else:
            sentiment = "Negative"

    elif '-1' not in cluster_count.keys():
        sentiment = "Positive"
    elif '1' not in cluster_count.keys():
        sentiment = "Negative"
    else:
        None
    return sentiment

In [705]:
def cluster_freq(word, df):
    x = ''
    for i, r in df.iterrows():
        if r['words'] == word:
            x = str(r['cluster_value'])
        else:
            None
    return x
        

In [706]:
cluster_count = {}
sentiment = []
result = ""
for i, r in df2.iterrows():
    comment = r['comments']
    wordlist = comment.split()
    result = sentiment_analyzer(wordlist)
    sentiment.append(result)
    
df1['sentiment'] = sentiment    
df2['sentiment'] = sentiment

In [707]:
def sentiment_func(comments):
    pol = TextBlob(comments).sentiment.polarity
    if pol >= 0:
        x = "Positive"
    else:
        x = "Negative"
    return x

In [708]:
df1['textblob_sentiment']= df2['comments'].apply(sentiment_func)
df1

Unnamed: 0,source,comments,symbols,sentiment,textblob_sentiment
14583,EnterpriseLeade,Alliance Data Systems $ADS Research Coverage S...,ADS,Positive,Positive
19670,dakotafinancial,Comerica $CMA Upgraded to Outperform at Everco...,CMA,Negative,Positive
18675,SeekingAlpha,$AMZN $BBBY https://t.co/6oyROZZxZx,BBBY,Positive,Positive
20508,PotentTrading,Top gappers (up) with higher volume: $ERIC $NO...,NOK,Positive,Positive
12196,MarketBeatNews,San Francisco to consider tax on companies to ...,MCK,Positive,Positive
21923,Sumi57417058,RT @420_invest: 🍁@420_invest🍁 We fund #mariju...,ABBV,Negative,Positive
16364,KarenMccaygirl,RT @Reuters: IBM asks a U.S. jury to award it ...,GRPN,Negative,Positive
6725,ZolmaxNews,Office Depot Inc $ODP Short Interest Update ht...,ODP,Negative,Positive
13600,dakotafinancial,PNC Financial Services Group Inc Forecasted to...,PNC,Positive,Positive
10617,mmahotstuff1,Edison International $EIX Analysts See $0.91 E...,EIX,Negative,Positive


The algorithm I created to create a sentiment for comment does not work with bigger dataset. However, up until kmeans, the code works fine. For sentiment prediction it will need a different method.