### Exercise 4.2: Sentiment Analysis - Extra

#### DSC 550

Taniya Adhikari 1/10/2021

In [625]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode
import unicodedata
import nltk
import string

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
import textblob

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from sklearn.cluster import KMeans

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\taniy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Text Pre-processing before modeling for sentiment analysis

In [626]:
def pre_processing(df):
    # converting all text to lowercase
    df["comments"] = df["comments"].str.lower()
    
    # removing punctuation using string.punctuations and join()
    df["comments"] = df["comments"].apply(lambda x: "".join([i for i in x if i not in string.punctuation]))
    
    # remove stop words
    stop = stopwords.words('english')

    df['stopwords'] = df["comments"].apply(lambda x: len([i for i in x.split() if i in stop]))
    df["comments"]= df["comments"].apply(lambda x: " ".join(i for i in x.split() if i not in stop))
    df['stopwords'] = df['comments'].apply(lambda x: len([i for i in x.split() if i in stop]))
    
    # stemming
    porter = PorterStemmer()
    df["comments"] = df["comments"].apply(lambda x: " ".join([porter.stem(word) for word in x.split()]))
    return df

# converting string into list of words for each comment
def text_to_list(text):
    text = unidecode(text)
    pattern = r'[^A-Za-z ]'
    regex = re.compile(pattern)
    text = regex.sub('', text).split(' ')
    return text

In [627]:
df = pd.read_csv('stockerbot-export.csv')
df = df.dropna().drop_duplicates().reset_index(drop=True).rename(columns={'text':'comments'})
df1 = df.sample(n = 20)
df2 = df1.copy()
df2 = pre_processing(df2)
df2

Unnamed: 0,source,comments,symbols,stopwords
8998,WeekHerald,everest group amp markel mkl headtohead survey...,RE,0
4201,BoardCentral,adsk adsk motley fool messag board httpstcogf1...,ADSK,0
17243,StockScoops,intel corpor nasdaqintc – may data fuel invest...,INTC,0
25979,mrlogoman247,rt ebaynewsroom 10 minut well kick ebay q2 201...,EBAY,0
478,luzgarciacalde1,rt myrollingstock gww umpq tmk earn httpstco2h...,TMK,0
20914,42Stocks,httpstco0hzjumh14a dal delta air line notabl m...,DAL,0
4191,MITickWatcher,top stock ta score trend sp500 pwr hfc up bmi ...,KIM,0
11742,dakotafinancial,zack brokerag expect express script hold co es...,ESRX,0
18935,DeweyRange,faster coin list bot 🤑 join us ➡️ httpstcoeyio...,RAD,0
22000,INVESTonero,tsn 65c 817 stop 64,TSN,0


In [628]:
clean_df = df2.copy()
clean_df.comments = clean_df.comments.apply(lambda x: text_to_list(x))
clean_df.count()

source       20
comments     20
symbols      20
stopwords    20
dtype: int64

In [629]:
#extraction of phrases and bigram model
sent = [row for row in clean_df.comments]
phrases = Phrases(sent, min_count=1, progress_per=7)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[19]

INFO - 13:40:59: collecting all words and their counts
INFO - 13:40:59: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 13:40:59: PROGRESS: at sentence #7, processed 86 words and 154 word types
INFO - 13:40:59: PROGRESS: at sentence #14, processed 165 words and 292 word types
INFO - 13:40:59: collected 417 word types from a corpus of 235 words (unigram + bigrams) and 20 sentences
INFO - 13:40:59: using 417 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 13:40:59: source_vocab length 417
INFO - 13:40:59: Phraser built with 0 phrasegrams


['discoveri',
 'commun',
 'inc',
 'common',
 'stock',
 'disca',
 'see',
 'larg',
 'growth',
 'short',
 'interest',
 'httpstcomdyybie']

In [645]:
w2v_model = Word2Vec(min_count=1, size=2)

# building vocab
w2v_model.build_vocab(sentences, progress_per=20)

INFO - 13:43:28: collecting all words and their counts
INFO - 13:43:28: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 13:43:28: collected 202 word types from a corpus of 220 raw words and 20 sentences
INFO - 13:43:28: Loading a fresh vocabulary
INFO - 13:43:28: effective_min_count=1 retains 202 unique words (100% of original 202, drops 0)
INFO - 13:43:28: effective_min_count=1 leaves 220 word corpus (100% of original 220, drops 0)
INFO - 13:43:28: deleting the raw counts dictionary of 202 items
INFO - 13:43:28: sample=0.001 downsamples 202 most-common words
INFO - 13:43:28: downsampling leaves estimated 142 word corpus (64.8% of prior 220)
INFO - 13:43:28: estimated required memory for 202 words and 2 dimensions: 104232 bytes
INFO - 13:43:28: resetting layer weights


In [646]:
# training the model
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=4, report_delay=1)
w2v_model.init_sims(replace=True)

INFO - 13:43:29: training model with 3 workers on 202 vocabulary and 2 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
INFO - 13:43:29: worker thread finished; awaiting finish of 2 more threads
INFO - 13:43:29: worker thread finished; awaiting finish of 1 more threads
INFO - 13:43:29: worker thread finished; awaiting finish of 0 more threads
INFO - 13:43:29: EPOCH - 1 : training on 220 raw words (146 effective words) took 0.0s, 49930 effective words/s
INFO - 13:43:29: worker thread finished; awaiting finish of 2 more threads
INFO - 13:43:29: worker thread finished; awaiting finish of 1 more threads
INFO - 13:43:29: worker thread finished; awaiting finish of 0 more threads
INFO - 13:43:29: EPOCH - 2 : training on 220 raw words (146 effective words) took 0.0s, 48047 effective words/s
INFO - 13:43:29: worker thread finished; awaiting finish of 2 more threads
INFO - 13:43:29: worker thread finished; awaiting finish of 1 more threads
INFO - 13:43:29: worker thread finished; await

In [647]:
print(w2v_model)

Word2Vec(vocab=202, size=2, alpha=0.025)


In [648]:
w2v_model.save("word2vec.model")

INFO - 13:43:31: saving Word2Vec object under word2vec.model, separately None
INFO - 13:43:31: not storing attribute vectors_norm
INFO - 13:43:31: not storing attribute cum_table
INFO - 13:43:31: saved word2vec.model


In [649]:
word_vectors = Word2Vec.load("word2vec.model").wv

INFO - 13:43:32: loading Word2Vec object from word2vec.model
INFO - 13:43:32: loading wv recursively from word2vec.model.wv.* with mmap=None
INFO - 13:43:32: setting ignored attribute vectors_norm to None
INFO - 13:43:32: loading vocabulary recursively from word2vec.model.vocabulary.* with mmap=None
INFO - 13:43:32: loading trainables recursively from word2vec.model.trainables.* with mmap=None
INFO - 13:43:32: setting ignored attribute cum_table to None
INFO - 13:43:32: loaded word2vec.model


In [650]:
# k-means clustering to create sentiment dictionary for each word
model = KMeans(n_clusters=2, max_iter=1000).fit(X=word_vectors.vectors.astype("double"))

In [655]:
positive = model.cluster_centers_[0]
negative = model.cluster_centers_[1]

In [656]:
word_vectors.similar_by_vector(model.cluster_centers_[0], restrict_vocab=None)

[('masternod', 0.9999260902404785),
 ('co', 0.9998615384101868),
 ('market', 0.9998602867126465),
 ('gww', 0.9994181394577026),
 ('ebay', 0.9992091655731201),
 ('headtohead', 0.9971724152565002),
 ('burn', 0.9960905909538269),
 ('earn', 0.9917668700218201),
 ('httpstconweopozb', 0.9907124638557434),
 ('apc', 0.9905270934104919)]

In [657]:
words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])

In [659]:
words['cluster_value'] = [1 if i==1 else -1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

words

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,everest,"[0.4738456, -0.88060796]",1,1,1.386529,1.386529
1,group,"[-0.8107721, 0.58536184]",0,-1,0.997127,-0.997127
2,amp,"[0.66654176, -0.7454677]",1,1,1.146746,1.146746
3,markel,"[-0.9536343, -0.30096793]",1,1,1.220324,1.220324
4,mkl,"[0.99999994, 0.0004636285]",0,-1,0.969323,-0.969323
...,...,...,...,...,...,...
197,larg,"[0.22235467, -0.9749659]",1,1,1.805448,1.805448
198,growth,"[-0.6832685, 0.73016727]",0,-1,1.138297,-1.138297
199,short,"[-0.35384795, 0.935303]",0,-1,1.614675,-1.614675
200,interest,"[-0.59522563, 0.80355865]",0,-1,1.245874,-1.245874


In [660]:
def sentiment_analyzer(wordlist):
    cluster_count = {}
    for w in wordlist:
        x = cluster_freq(w, words)
        if x in cluster_count:
            cluster_count[x] = cluster_count[x] + 1
        else:
            cluster_count[x] = 1
    sentiment = ""              
    if '-1' in cluster_count.keys() and '1' in cluster_count.keys():
        pos = cluster_count.get('1')
        neg = cluster_count.get('-1')
        if pos >= neg:
            sentiment = "Positive"
        else:
            sentiment = "Negative"

    elif '-1' not in cluster_count.keys():
        sentiment = "Positive"
    elif '1' not in cluster_count.keys():
        sentiment = "Negative"
    else:
        None
    return sentiment

In [661]:
def cluster_freq(word, df):
    x = ''
    for i, r in df.iterrows():
        if r['words'] == word:
            x = str(r['cluster_value'])
        else:
            None
    return x
        

In [662]:
cluster_count = {}
sentiment = []
result = ""
for i, r in df2.iterrows():
    comment = r['comments']
    wordlist = comment.split()
    result = sentiment_analyzer(wordlist)
    sentiment.append(result)
    
df1['sentiment'] = sentiment    
df2['sentiment'] = sentiment

In [663]:
def sentiment_func(comments):
    pol = TextBlob(comments).sentiment.polarity
    if pol >= 0:
        x = "Positive"
    else:
        x = "Negative"
    return x

In [664]:
df1['textblob_sentiment']= df2['comments'].apply(sentiment_func)
df1

Unnamed: 0,source,comments,symbols,sentiment,textblob_sentiment
8998,WeekHerald,Everest Re Group $RE &amp; Markel $MKL Head-To...,RE,Positive,Positive
4201,BoardCentral,"$ADSK ""ADSK"" on The Motley Fool message boards...",ADSK,Positive,Positive
17243,StockScoops,Intel Corporation (NASDAQ:INTC) – May This Dat...,INTC,Negative,Positive
25979,mrlogoman247,RT @eBayNewsroom: In 10 minutes we'll kick off...,EBAY,Negative,Positive
478,luzgarciacalde1,RT @myrollingstocks: $GWW $UMPQ $TMK before ea...,TMK,Negative,Positive
20914,42Stocks,https://t.co/0HzjUMh14A $DAL Delta Air Lines n...,DAL,Positive,Positive
4191,MITickWatcher,Top stocks with TA score trending DOWN (SP500)...,KIM,Negative,Positive
11742,dakotafinancial,Zacks: Brokerages Expect Express Scripts Holdi...,ESRX,Negative,Positive
18935,DeweyRange,We are faster than any other coin listing bots...,RAD,Positive,Positive
22000,INVESTonero,$TSN 65C 8/17 Stop @ 64,TSN,Positive,Positive
