In [119]:
import pickle
import pandas as pd
import numpy as np

from hmmlearn import hmm
from gensim.models import Word2Vec

# Load the Cleaned Data

In [120]:
with open('twitter.pkl', 'rb') as f:
    df = pickle.load(f)['data']
df.head()

Unnamed: 0,text,target
244708,"[2, 4, 5, 4, 3, 5, 4, 5, 1]",0
1476105,"[2, 3, 4, 5, 5]",4
796772,"[0, 0]",0
1137601,"[3, 1, 1, 3, 5, 1, 7, 1]",4
1068392,"[3, 9, 5, 8, 5, 3, 5, 5, 5, 3, 4, 4, 4]",4


# Init and Train Model

In [121]:
def flatten_list(X):
    # make the data a 1d array
    sequences = []
    seq_lens = []
    for seq in X:
        sequences.extend(seq)
        seq_lens.append(len(seq))

    # reshape
    sequences = np.reshape(sequences, (-1, 1))

    return sequences, seq_lens

In [122]:
X, y = df['text'].tolist(), np.array(df['target'])
flat_X, lens = flatten_list(X)

model = hmm.CategoricalHMM(n_components = 2)
model.fit(flat_X, lens) 

In [123]:
B = model.emissionprob_.T
print(B)

for i in range(10):
    cluster = i
    state = np.argmax(B[i])
    diff = np.abs(B[i][0] - B[i][1])
    if diff > .1:
        print(f"Cluster {i} - State {state} - Diff {diff:4f}")


[[1.10644108e-01 5.85038858e-02]
 [2.21979531e-01 9.46773266e-02]
 [1.95115686e-02 5.26388605e-01]
 [1.12787107e-01 6.25796974e-02]
 [1.54143443e-01 9.85814020e-02]
 [1.42073734e-01 5.83575786e-02]
 [2.73075007e-03 8.67286059e-17]
 [9.52739430e-02 3.70448936e-02]
 [5.98598437e-02 2.21565048e-02]
 [8.09959708e-02 4.17101066e-02]]
Cluster 1 - State 0 - Diff 0.127302
Cluster 2 - State 1 - Diff 0.506877


In [124]:
# load w2v model
w2vmodel = Word2Vec.load('twitter_w2v.model')

In [129]:
with open("twitter_cluster.pkl", 'rb') as f:
    dat = pickle.load(f)

vecs = dat[0]
labels = dat[1]

c7 = vecs[labels == 2]
print(len(c7))

i = 0
for v in c7:
    similar = w2vmodel.wv.most_similar(v, topn=5)
    if i % 500 == 0:
        print(similar)
    i += 1

3312
[('acmeuser', 1.0), ('acmeurl', 0.9998157620429993), ('lol', 0.9998036623001099), ('got', 0.9998028874397278), ('ive', 0.9997963309288025)]
[('im', 1.0), ('time', 0.9998244047164917), ('got', 0.9998231530189514), ('day', 0.9998115301132202), ('new', 0.9998112916946411)]
[('acmeuser', 1.0), ('acmeurl', 0.9998157620429993), ('lol', 0.9998036623001099), ('got', 0.9998028874397278), ('ive', 0.9997963309288025)]
[('im', 1.0), ('time', 0.9998244047164917), ('got', 0.9998231530189514), ('day', 0.9998115301132202), ('new', 0.9998112916946411)]
[('acmeuser', 1.0), ('acmeurl', 0.9998157620429993), ('lol', 0.9998036623001099), ('got', 0.9998028874397278), ('ive', 0.9997963309288025)]
[('acmeuser', 1.0), ('acmeurl', 0.9998157620429993), ('lol', 0.9998036623001099), ('got', 0.9998028874397278), ('ive', 0.9997963309288025)]
[('acmeuser', 1.0), ('acmeurl', 0.9998157620429993), ('lol', 0.9998036623001099), ('got', 0.9998028874397278), ('ive', 0.9997963309288025)]


# Yelp Data

In [55]:
# load stuff
with open('cluster_centers.pkl', 'rb') as f:
    dat = pickle.load(f)

vecs = dat[0]
labels = dat[1]

model = Word2Vec.load('yelp_w2v.model')


In [59]:
# train hmm
with open("yelp.pkl", "rb") as f:
    df = pd.DataFrame(pickle.load(f)['data'])

df.head()

Unnamed: 0,text,target
36346,"[3, 0, 2, 2, 2, 2, 4, 2, 6, 7, 4, 1, 1, 7, 1, ...",4.0
1135674,"[4, 4, 6, 9, 1, 2, 4, 4, 6, 0, 2, 2, 9, 9, 1, ...",1.0
756105,"[7, 6, 9, 9, 4, 3, 4, 6, 9, 4, 4, 9, 0, 4, 1, ...",5.0
910468,"[6, 8, 1, 0, 4, 0, 1, 7, 4, 4, 2, 4, 9, 9, 0, ...",5.0
442520,"[2, 4, 9, 4, 4, 0, 0, 5, 2, 1, 9, 9, 1, 4, 1, ...",5.0


In [61]:
X, y = df['text'].tolist(), np.array(df['target'])
flat_X, lens = flatten_list(X)

model = hmm.CategoricalHMM(n_components = 5)
model.fit(flat_X, lens) 

In [62]:
B = model.emissionprob_.T
print(B)

# for i in range(10):
#     cluster = i
#     state = np.argmax(B[i])
#     diff = np.abs(B[i][0] - B[i][1])
#     if diff > .1:
#         print(f"Cluster {i} - State {state} - Diff {diff:4f}")


[[0.14173786 0.04679124 0.03567238 0.02482341 0.0862748 ]
 [0.15937268 0.224374   0.13724712 0.11391942 0.02900571]
 [0.10784002 0.07583219 0.18095975 0.21190909 0.11823229]
 [0.05712658 0.09560933 0.00263079 0.02062601 0.06453086]
 [0.11085953 0.12800155 0.12595417 0.29301536 0.16158681]
 [0.05264739 0.01236799 0.03915321 0.04511243 0.0403932 ]
 [0.01074384 0.02193552 0.04944793 0.11715914 0.03155407]
 [0.12522733 0.20191131 0.04936728 0.04918314 0.10352335]
 [0.04076411 0.03786219 0.01757019 0.04962286 0.03348215]
 [0.19368067 0.15531468 0.36199718 0.07462914 0.33141676]]
