In [1]:
import pandas as pd

In [177]:
import numpy as np
import re
from collections import defaultdict

In [2]:
import ast 

In [3]:
words = pd.read_csv("features_freq_final.csv")

In [4]:
data = pd.read_csv("better_data.csv")

In [5]:
data.drop(["Unnamed: 0"], axis = 1, inplace = True)

In [122]:
class word2vec():
    def __init__ (self):
        self.n = settings['n']
        self.eta = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']
        pass
    
    def create_dic(self, corpus):
        
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1

        self.v_count = len(word_counts.keys())

        # GENERATE LOOKUP DICTIONARIES
        self.words_list = sorted(list(word_counts.keys()),reverse=False)
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))
        self.w1 = np.random.uniform(-0.8, 0.8, (self.v_count, self.n))     # embedding matrix
        self.w2 = np.random.uniform(-0.8, 0.8, (self.n, self.v_count))     # context matrix
        
        pass
        
    def c_word2onehot(self, words):
        word_vec = [0 for i in range(0, self.v_count)]
        for word in words:
            if word in self.word_index:
                word_index = self.word_index[word]
                word_vec[word_index] = 1
            #else: print ( "missing word ==> ", word) 
        return word_vec
    
    
    # GENERATE TRAINING DATA
    def generate_training_data(self, settings, corpus):

        training_data = []
        # CYCLE THROUGH EACH SENTENCE IN CORPUS
        for sentence in corpus:
            sent_len = len(sentence)
            if (len(sentence) >= 4):
                # CYCLE THROUGH EACH WORD IN SENTENCE
                for i, word in enumerate(sentence):
                    #w_target  = sentence[i]
                    w_target = w2v.word2onehot(sentence[i])

                    # CYCLE THROUGH CONTEXT WINDOW
                    context_words = []
                    for j in range(i-self.window, i+self.window+1):
                        if j!=i and j<=sent_len-1 and j>=0:
                            context_words.append(sentence[j])
                    w_context = self.c_word2onehot(context_words)
                    training_data.append([w_context, w_target])
        return  np.array(training_data)


    # SOFTMAX ACTIVATION FUNCTION
    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)


    # CONVERT WORD TO ONE HOT ENCODING
    def word2onehot(self, word):
        word_vec = [0 for i in range(0, self.v_count)]
        if word in self.word_index:
            word_index = self.word_index[word]
            word_vec[word_index] = 1
        else: print ( "missing word ==> ", word)
            
        return word_vec
        

    # FORWARD PASS
    def forward_pass(self, x):
        h = np.dot(self.w1.T, x)
        u = np.dot(self.w2.T, h)
        y_c = self.softmax(u)
        return y_c, h, u
                

    # BACKPROPAGATION
    def backprop(self, e, h, x):
        dl_dw2 = np.outer(h, e)  
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))

        # UPDATE WEIGHTS
        self.w1 = self.w1 - (self.eta * dl_dw1)
        self.w2 = self.w2 - (self.eta * dl_dw2)
        pass


    # TRAIN W2V model
    def train(self, training_data):

        # CYCLE THROUGH EACH EPOCH
        for i in range(0, self.epochs):

            # CYCLE THROUGH EACH TRAINING SAMPLE
            for w_t, w_c in training_data:

                # FORWARD PASS
                y_pred, h, u = self.forward_pass(w_t)
                
                # CALCULATE ERROR
                EI = np.subtract(y_pred, w_c) 

                # BACKPROPAGATION
                self.backprop(EI, h, w_t)
                
            print ('EPOCH:',i)
        pass


    # input a word, returns a vector (if available)
    def word_vec(self, word):
        if word in self.word_index:
            w_index = self.word_index[word]
            v_w = self.w1[w_index]
            return v_w
        else:
            return np.zeros(w2v.n)


    # input a vector, returns nearest word(s)
    def vec_sim(self, vec, top_n):

        # CYCLE THROUGH VOCAB
        word_sim = {}
        for i in range(self.v_count):
            v_w2 = self.w1[i]
            theta_num = np.dot(vec, v_w2)
            theta_den = np.linalg.norm(vec) * np.linalg.norm(v_w2)
            theta = theta_num / theta_den

            word = self.index_word[i]
            word_sim[word] = theta

        words_sorted =  sorted(word_sim.items(), key=lambda x: x[1], reverse=True)

        for word, sim in words_sorted[:top_n]:
            print (word, sim)
            
        pass

    # input word, returns top [n] most similar words
    def word_sim(self, word, top_n):
        
        w1_index = self.word_index[word]
        v_w1 = self.w1[w1_index]

        # CYCLE THROUGH VOCAB
        word_sim = {}
        for i in range(self.v_count):
            v_w2 = self.w1[i]
            theta_num = np.dot(v_w1, v_w2)
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = theta_num / theta_den

            word = self.index_word[i]
            word_sim[word] = theta

        words_sorted =  sorted(word_sim.items(), key=lambda x: x[1], reverse=True)

        for word, sim in words_sorted[:top_n]:
            print (word, sim)
            
        pass


In [178]:
#--- EXAMPLE RUN --------------------------------------------------------------+

settings = {}
settings['n'] = 100                   # dimension of word embeddings
settings['window_size'] = 2         # context window +/- center word
settings['min_count'] = 0           # minimum word count
settings['epochs'] = 1           # number of training epochs
settings['learning_rate'] = 0.01    # learning rate
np.random.seed(0)                   # set the seed for reproducibility


In [8]:
w2v = word2vec()

In [180]:
tweets_corpus= []
for tweet in data["dic_words"][:]:
    new_list = list(ast.literal_eval(tweet).keys())
    if (new_list != []):
        tweets_corpus.append(new_list)

In [181]:
len(tweets_corpus)

2561858

In [182]:
w2v.create_dic(tweets_corpus)

In [183]:
len (w2v.index_word)

11365

In [69]:
#len(data['dic_words'])
for i in range (0, 2_00_000, 2000):
    training = w2v.generate_training_data(settings, tweets_corpus[i:i+1500])
    print ("done generating", i)
    w2v.train(training)
    print ("done training", i)
   # input ()

done generating 0
EPOCH: 0
done training 0
done generating 1500
EPOCH: 0
done training 1500
done generating 3000
EPOCH: 0
done training 3000
done generating 4500
EPOCH: 0
done training 4500
done generating 6000
EPOCH: 0
done training 6000
done generating 7500
EPOCH: 0
done training 7500
done generating 9000
EPOCH: 0
done training 9000
done generating 10500
EPOCH: 0
done training 10500
done generating 12000
EPOCH: 0
done training 12000
done generating 13500
EPOCH: 0
done training 13500
done generating 15000
EPOCH: 0
done training 15000
done generating 16500
EPOCH: 0
done training 16500
done generating 18000
EPOCH: 0
done training 18000
done generating 19500
EPOCH: 0
done training 19500
done generating 21000
EPOCH: 0
done training 21000
done generating 22500
EPOCH: 0
done training 22500
done generating 24000
EPOCH: 0
done training 24000
done generating 25500
EPOCH: 0
done training 25500
done generating 27000
EPOCH: 0
done training 27000
done generating 28500
EPOCH: 0
done training 28500


In [140]:
training = w2v.generate_training_data(settings, tweets_corpus[:500])

In [141]:
len(training)

2079

In [154]:
w2v.train(training)

EPOCH: 0
EPOCH: 1
EPOCH: 2
EPOCH: 3
EPOCH: 4
EPOCH: 5
EPOCH: 6
EPOCH: 7
EPOCH: 8
EPOCH: 9
EPOCH: 10
EPOCH: 11
EPOCH: 12
EPOCH: 13
EPOCH: 14
EPOCH: 15
EPOCH: 16
EPOCH: 17
EPOCH: 18
EPOCH: 19
EPOCH: 20
EPOCH: 21
EPOCH: 22
EPOCH: 23
EPOCH: 24
EPOCH: 25
EPOCH: 26
EPOCH: 27
EPOCH: 28
EPOCH: 29
EPOCH: 30
EPOCH: 31
EPOCH: 32
EPOCH: 33
EPOCH: 34
EPOCH: 35
EPOCH: 36
EPOCH: 37
EPOCH: 38
EPOCH: 39
EPOCH: 40
EPOCH: 41
EPOCH: 42
EPOCH: 43
EPOCH: 44
EPOCH: 45
EPOCH: 46
EPOCH: 47
EPOCH: 48
EPOCH: 49


In [165]:
w2v.index_word

{0: 'AA',
 1: 'AB',
 2: 'AD',
 3: 'AE',
 4: 'AF',
 5: 'AG',
 6: 'AH',
 7: 'AI',
 8: 'AK',
 9: 'AL',
 10: 'AP',
 11: 'AQ',
 12: 'AR',
 13: 'AU',
 14: 'AV',
 15: 'AW',
 16: 'AX',
 17: 'AZ',
 18: 'Aa',
 19: 'Ab',
 20: 'Ac',
 21: 'Ad',
 22: 'Af',
 23: 'Ag',
 24: 'Ai',
 25: 'Al',
 26: 'Ar',
 27: 'Au',
 28: 'Av',
 29: 'Aw',
 30: 'BA',
 31: 'BC',
 32: 'BD',
 33: 'BF',
 34: 'BI',
 35: 'BK',
 36: 'BL',
 37: 'BM',
 38: 'BN',
 39: 'BO',
 40: 'BP',
 41: 'BR',
 42: 'BS',
 43: 'BT',
 44: 'BU',
 45: 'BX',
 46: 'Ba',
 47: 'Bi',
 48: 'Bk',
 49: 'Bl',
 50: 'Bt',
 51: 'Bu',
 52: 'CA',
 53: 'CB',
 54: 'CC',
 55: 'CE',
 56: 'CF',
 57: 'CG',
 58: 'CH',
 59: 'CI',
 60: 'CK',
 61: 'CL',
 62: 'CM',
 63: 'CO',
 64: 'CP',
 65: 'CQ',
 66: 'CR',
 67: 'CS',
 68: 'CT',
 69: 'CU',
 70: 'CV',
 71: 'CW',
 72: 'Ca',
 73: 'Cd',
 74: 'Ce',
 75: 'Ch',
 76: 'Ci',
 77: 'Ck',
 78: 'Co',
 79: 'Cs',
 80: 'Cu',
 81: 'DA',
 82: 'DB',
 83: 'DC',
 84: 'DD',
 85: 'DE',
 86: 'DG',
 87: 'DH',
 88: 'DI',
 89: 'DJ',
 90: 'DK',
 91: 'DM'

In [98]:
w2v.word_sim("mom", 10)

mom 1.0000000000000002
mum 0.5595741506419553
boy 0.514231955963347
mother 0.4764960149289138
mommy 0.47135040869763706
parent 0.4656997518725402
friend 0.46367358088798083
sister 0.45186882878428436
cat 0.4498926235271211
guy 0.44790347708640765


In [104]:
w2v.word_vec("guy")

array([-0.12224432,  0.03747687, -0.12396694, -0.27410764,  0.41339067,
        0.12642511,  0.18226308, -0.09979483,  0.08829539,  0.39080589,
        0.04314398,  0.20270002, -0.06870591,  0.05815173, -0.1097281 ,
       -0.07186058, -0.19140144, -0.09409969, -0.18010543,  0.00915305,
        0.30558383, -0.03464191,  0.02582664, -0.26030897, -0.08756514,
       -0.00139091, -0.1273009 ,  0.15103193, -0.03929935,  0.40017723,
       -0.10965818,  0.15056327, -0.15903334,  0.38296775, -0.12131426,
       -0.32434434,  0.22004278, -0.04684089, -0.22648904, -0.02958335,
        0.01209048, -0.16255598, -0.3360576 , -0.21658969,  0.00942108,
       -0.20527873,  0.17783241, -0.28926751, -0.0637944 ,  0.33087001,
        0.24912587, -0.18262771,  0.1035765 ,  0.04605405,  0.24724475,
        0.18137646,  0.00549742, -0.42302168, -0.25477276,  0.04505494,
        0.0733705 , -0.22300202, -0.0170097 , -0.34286688, -0.09392042,
       -0.01843463, -0.00868924,  0.31119685,  0.13527746, -0.06

In [173]:
def vectorize_tweets(df):
  
    tweets_vector = {}
    
    for i, tweets in  enumerate(df["dic_words"]):        
        new_list = list(ast.literal_eval(tweets).keys())
        words = []
        for word in new_list:
            print (word)
            #input()
            words.append (w2v.word_vec(word))
        if len(words) > 0:
            words = 1/len(words) * np.sum(words, axis=0)
            tweets_vector[i] = (words, data["label"][i]) 
    return tweets_vector
    
    
    
    

In [174]:
dic = vectorize_tweets(data.iloc[:10])      
    

upset
hi
text
might
result
school
today
also
blah
dive
time
ball
save
rest
go
bound
whole
feel
like
fire
mad
see
whole
crew
need
hug
hey
long
time
see
ye
rain
bit
fine
thank
nope
spring
break
plain
snow


In [176]:
dic

{0: (array([ 0.13359647, -0.02849117,  0.00770858,  0.0724925 ,  0.15620009,
          0.08491806,  0.00099081, -0.15391996, -0.02070729,  0.12729839,
          0.10382291,  0.04761364, -0.09254125, -0.00211133, -0.12712959,
         -0.14543578,  0.08265676,  0.0284215 , -0.11450602, -0.06865195,
         -0.23927347,  0.02966267,  0.00636718, -0.15866366, -0.13305092,
         -0.07740138, -0.01034695,  0.13287087, -0.14552642,  0.07895818,
         -0.01722671,  0.0485129 , -0.2716531 ,  0.13136121, -0.10710916,
         -0.1419488 ,  0.11733118,  0.20045541,  0.03679432,  0.1671558 ,
          0.00083581,  0.02674998, -0.09121197, -0.00641445, -0.0350443 ,
         -0.06322702, -0.05095166, -0.04134278, -0.03464825,  0.26300644,
         -0.01791029, -0.10505042,  0.08402966,  0.15668508,  0.25796186,
          0.048896  , -0.13085932, -0.20444021, -0.06279778,  0.18254995,
          0.0506735 , -0.10829283,  0.19293825,  0.20163288, -0.03636234,
         -0.05294459,  0.07693896, 

In [121]:
w2v.n

100

In [125]:
w2v.w1

array([[ 0.06700095,  0.27763115,  0.15828083, ..., -0.74345008,
         0.50260862, -0.76444362],
       [ 0.30153716, -0.34526668,  0.35836766, ..., -0.37949228,
        -0.70880819, -0.11854959],
       [-0.2294075 ,  0.25202407, -0.25257779, ...,  0.56702432,
         0.6886235 ,  0.61612253],
       ...,
       [-0.36336109, -0.34062914, -0.43581701, ..., -0.76587208,
        -0.0604406 ,  0.37770625],
       [-0.14215398, -0.04474402, -0.56877792, ..., -0.66204459,
        -0.48753687,  0.51850722],
       [-0.76030531, -0.79197764, -0.61393642, ...,  0.47001247,
        -0.33515513,  0.68094115]])

In [126]:
w2v.w2

array([[-0.09822671,  0.673908  , -0.24662002, ...,  0.67853833,
        -0.7065876 , -0.36476853],
       [-0.40104942, -0.26825159, -0.50467772, ..., -0.61012034,
         0.49011851,  0.1059515 ],
       [-0.53017753,  0.10560229,  0.76411598, ...,  0.41782441,
         0.20493591, -0.23524497],
       ...,
       [-0.44542425, -0.01822307,  0.32161718, ...,  0.28717606,
        -0.11713515,  0.10327296],
       [-0.14767703, -0.44398604, -0.34948476, ..., -0.30495064,
        -0.18534835,  0.31980297],
       [ 0.46437483, -0.1237962 ,  0.50543626, ..., -0.02086641,
         0.68786148, -0.60041626]])

In [128]:
words_vectors = {}
for i in w2v.word_index:
    words_vectors[i] = w2v.word_vec(i)
    

In [129]:
words_vectors

{'AA': array([ 0.06700095,  0.27763115,  0.15828083,  0.03989645, -0.01305015,
         0.25648276, -0.11087239,  0.5980455 ,  0.75013199, -0.14420796,
         0.4631221 ,  0.05822356,  0.06484714,  0.68172103, -0.67521295,
        -0.67324476, -0.72659399,  0.49566681,  0.39870777,  0.56439887,
         0.69582275,  0.49186765, -0.04571074,  0.39562091, -0.64033815,
         0.26875583, -0.57653819,  0.75824839, -0.0125614 , -0.09426064,
        -0.39186027,  0.46696334, -0.13897704,  0.09989676, -0.74854079,
         0.11757628,  0.18961367,  0.229975  ,  0.62862233,  0.25676314,
        -0.17591737, -0.0987722 ,  0.33474162, -0.75121187,  0.20874769,
         0.20046179, -0.47956684, -0.58793685, -0.21449445, -0.1989698 ,
         0.09237371, -0.13093973,  0.73715849, -0.55903427, -0.39977985,
        -0.49597369,  0.22672958, -0.40993945, -0.03858026, -0.36481015,
        -0.50257509, -0.64909066,  0.2999824 , -0.58255861, -0.47157342,
        -0.21326393,  0.57177644, -0.58408541

In [132]:
df_words_vectors = pd.DataFrame.from_dict(words_vectors).transpose()

In [134]:
df_words_vectors.to_csv("words_vectors_ours.csv")