In [69]:
import os
import time
import pickle
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm   
import data_cleaner

import matplotlib.pyplot as plt

from hmmlearn import hmm
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture

import umap

In [70]:
# Restore the previously trained HMM from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('twitter_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Analysis of Words to State

In [71]:
# Transpose the emission matrix so each row of B is one cluster and each
# column is one hidden state (hmmlearn stores it as states x symbols).
B = model.emissionprob_.T

# Iterate over the matrix itself rather than a hard-coded cluster/state count,
# so the table stays correct if the model is retrained with other sizes.
for i, probs in enumerate(B):
    print(f"Cluster {i}: " + " ".join(f"{p:.3f}" for p in probs))

Cluster 0: 0.047 0.040
Cluster 1: 0.278 0.203
Cluster 2: 0.067 0.044
Cluster 3: 0.045 0.031
Cluster 4: 0.078 0.057
Cluster 5: 0.347 0.523
Cluster 6: 0.029 0.018
Cluster 7: 0.029 0.026
Cluster 8: 0.023 0.021
Cluster 9: 0.056 0.037


From the above we see that <b>Cluster 1</b> is relatively more likely to be emitted by <b>State 0</b> (0.278 vs. 0.203), while <b>Cluster 5</b> is more likely to be emitted by <b>State 1</b> (0.523 vs. 0.347). Taking a closer look at these two clusters, we find the following:

In [72]:
# Load the original tweets (with their cleaned token lists) ...
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("original_twitter_data.pkl", "rb") as raw_file:
    dat = pickle.load(raw_file)

# ... and the cluster-encoded version of the same tweets used by the model.
with open("twitter_model_data.pkl", "rb") as encoded_file:
    mod_dat = pickle.load(encoded_file)

In [73]:
# Preview the first five rows of the original tweet data.
dat.head(n=5)

Unnamed: 0,target,id,date,flag,user,text,cleaned
36796,0,1565880141,Mon Apr 20 07:45:16 PDT 2009,NO_QUERY,colombo1971,One kid better. One still sick. Poor little ...,"[one, kid, better, one, still, sick, poor, lit..."
220682,0,1976680246,Sat May 30 18:45:19 PDT 2009,NO_QUERY,kiddiescorner,Omg what the heck is up with twitter tonight i...,"[omg, heck, twitter, tonight, im, miss, thepar..."
731538,0,2263848932,Sun Jun 21 01:59:50 PDT 2009,NO_QUERY,vicbecpai,Happy fathers day shame i dont feel to well ha...,"[happi, father, day, shame, dont, feel, well, ..."
891589,4,1690871703,Sun May 03 16:34:11 PDT 2009,NO_QUERY,kinseyymiller,i almost got killed today! and i was on local ...,"[almost, got, kill, today, local, tv, mr, saal..."
645381,0,2236263306,Fri Jun 19 03:15:26 PDT 2009,NO_QUERY,myloismylife,Officially appointed as CL &amp; given monthly...,"[offici, appoint, cl, given, monthli, allow, s..."


In [74]:
# Preview the first five rows of the cluster-encoded model data.
mod_dat.head(n=5)

Unnamed: 0_level_0,text,target,hmm_data
original_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
36796,"[0, 5, 9, 0, 1, 1, 5, 1, 1]",0,"[1, 0, 1, 0, 1, 0, 1, 0, 1]"
220682,"[9, 1, 5, 5, 1, 8, 1]",0,"[1, 0, 1, 0, 1, 0, 1]"
731538,"[5, 5, 5, 5, 0, 5, 1, 9, 1, 4, 1]",0,"[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]"
891589,"[4, 2, 1, 2, 2, 1, 1]",4,"[1, 0, 1, 0, 1, 0, 1]"
645381,"[5, 7, 4, 1, 1, 0, 4, 1]",0,"[1, 0, 1, 0, 1, 0, 1, 0]"


In [75]:
# Collect the original (cleaned) words that were encoded as the two clusters
# singled out in the emission table. The lists are named after the HMM *state*
# each cluster favours:
#   cluster_0 <- words with cluster id 1 (relatively more likely under State 0)
#   cluster_1 <- words with cluster id 5 (relatively more likely under State 1)
cluster_0 = list()
cluster_1 = list()

# Map an encoded cluster id to the list that should receive the original word.
words_by_cluster_id = {1: cluster_0, 5: cluster_1}

for idx, row in mod_dat.iterrows():
    text = row['text']

    if len(text) < 1:
        continue

    # mod_dat is indexed by the original row label, so look the tweet up by it.
    og_text = dat.loc[idx, 'cleaned']

    # zip() stops at the shorter sequence, so encoded positions that have no
    # matching cleaned token are skipped explicitly instead of relying on a
    # bare `except` (which would also hide genuine errors such as a NameError).
    for w, og_word in zip(text, og_text):
        bucket = words_by_cluster_id.get(w)
        if bucket is not None:
            bucket.append(og_word)

In [76]:
# Convert both word lists to NumPy arrays.
cluster_0 = np.array(cluster_0)
cluster_1 = np.array(cluster_1)

In [77]:
# Count each distinct word in cluster_0 and show the 15 most frequent ones.
uq, counts = np.unique(cluster_0, return_counts=True)
ordr = counts.argsort()[::-1]  # indices ordered from most to least frequent
print(uq[ordr[:15]])

['im' 'work' 'realli' 'know' 'think' 'oh' 'well' 'new' 'much' 'still'
 'that' 'last' 'would' 'great' 'didnt']


We see that the state predicted by the HMM given the words above is mostly correlated with negative sentiment. It is interesting that words like 'great' are in this list; this could indicate either that the model is detecting some form of sarcasm or that it is misclassifying 'great' as negative.

In [79]:
# Count each distinct word in cluster_1 and show the 15 most frequent ones.
uq, counts = np.unique(cluster_1, return_counts=True)
ordr = counts.argsort()[::-1]  # indices ordered from most to least frequent
print(uq[ordr[:15]])

['acmeuser' 'good' 'time' 'thank' 'feel' 'well' 'miss' 'u' 'think' 'one'
 'know' 'that' 'much' 'haha' 'hey']


We see that the state predicted by the HMM given the words above is mostly correlated with positive sentiment. Again we see some misclassification, as the word 'miss' is in this state alongside 'good', 'haha', and 'thank'.

From the results above, we can see that our goal of interpretability is somewhat being met by our model. Due to the misclassifications above, I would hesitate to rely solely on these results without further work. However, the ability to inspect what the HMM is predicting and to determine sentiment without using a neural network is a novel idea, and these results appear to give us a good foundation upon which further work can build.