In [None]:
!git clone https://github.com/formiel/word2vec.git

fatal: destination path 'word2vec' already exists and is not an empty directory.


# Data

In [None]:
import pandas as pd
import numpy as np
df_group=pd.read_csv("/content/Book1.csv")
df_group

Unnamed: 0,Words,Group
0,abandon,-1
1,abandoned,-1
2,abandoning,-1
3,abandonment,-1
4,abandonments,-1
...,...,...
2704,wrongdoing,-1
2705,wrongdoings,-1
2706,wrongful,-1
2707,wrongfully,-1


In [None]:
df=pd.read_csv('/content/all-data.csv', header=None,encoding="latin-1")
df


Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [None]:
l=[]
for i in range(df.shape[0]):
  for j in df[1][i].split():
    l.append(j)


In [None]:
len(l)

111948

In [None]:
l=list(set(l))

In [None]:
len(l)

12971

In [None]:
idx=[]
for i,j in enumerate(df_group['Words']):
  if( j in l ):
    idx.append(i)

In [None]:
idx[:10]

[12, 16, 46, 51, 54, 63, 64, 65, 68, 77]

In [None]:
df_group_new=df_group.loc[idx]
df_group_new.reset_index(inplace=True,drop=True)

In [None]:
df_group_new

Unnamed: 0,Words,Group
0,aberration,-1
1,able,1
2,accident,-1
3,accomplish,1
4,accomplishing,1
...,...,...
315,win,1
316,winner,1
317,winners,1
318,winning,1


In [None]:
df_group_new['Group'].unique()

array([-1,  1])

In [None]:
df_group_new['Words']

0         aberration
1               able
2           accident
3         accomplish
4      accomplishing
           ...      
315              win
316           winner
317          winners
318          winning
319         worrying
Name: Words, Length: 320, dtype: object

# Libraries

In [None]:
from __future__ import division
import argparse
import pandas as pd

# useful stuff
import numpy as np
import pickle # to save and load the model
import os # to join the path
import random # for randomly generated numbers
import time # to record time for the report
import matplotlib # to plot the loss of experiments for the report
# matplotlib.use('TkAgg') # macOS
matplotlib.use("agg")   # Linux
import matplotlib.pyplot as plt 

# text2sentences

In [None]:



def text2sentences(path):
    """
    Function to load, clean and tokenize the text

    - Input: path to text file used to train model
    - Output: a list of tokenized sentences
    """
    # Initialize list to store processed sentences
    sentences = []
    # Define punctuations to be removed
    table = str.maketrans('', '', '''!"#$%&()*+,./:;<=>?@[\]^_`{|}~''')

    # Process and append each sentence to sentences
    with open(path, encoding="utf8") as f:
        for l in f:
            # Convert to lower case and split into words
            sent = l.lower().split()
            # Remove punctuation
            sent_removed_punk = [word.translate(table) for word in sent]
            # Append sent_removed to the list of sentences
            sentences.append(sent_removed_punk)

    return sentences

In [None]:
def loadPairs(path):
    """
    Function to load test data

    - Input: path to test file
    - Output: pairs of word1, word2 and pre-annotated similarity 
    """
    data = pd.read_csv(path, delimiter='\t')
    pairs = zip(data['word1'], data['word2'], data['similarity'])

    return pairs


In [None]:

def sigmoid(x):
    """
    Return logarithmic value of the sigmoid function for a given x
    """
    return 1.0 /(1 + np.exp(-x))

# SkipGram

In [None]:




class SkipGram:
    def __init__(self, sentences, nEmbed=50, negativeRate=5, winSize=5, minCount=3, groups=df_group_new):
        self.sentences = sentences
        self.nEmbed = nEmbed # dimension of embedding
        self.negativeRate = negativeRate # ratio of negative samples over positive samples
        self.winSize = winSize
        self.minCount = max(minCount, 1)
        
        self.word2idx = None
        self.unigram = None
        
        self.groups=groups


    def compute_word2idx_and_unigram(self, unigram_power=0.75):
        """
        Function perform word count and mapping from word to index and vice versa

        - Input: a list of tokenized sentences
        - Output: 
            -- V: vocabulary size
            -- word_list: list of vocabulary sorted by alphabetical order
            -- word2idx: dictionary with word as key, index as value
            -- word_freq: dictionary with word as key, number of word occurence as value
        """
        # Initialize a dictionary of word frequency
        word_freq = {}
        # Iterate over each sentence in the list of sentences
        for sent in self.sentences:
            # Iterate over each word in sentence
            for word in sent:
                # Create the frequency dictionary to count each word
                word_freq[word] = word_freq.get(word, 0) + 1

        # Remove words that have frequency < minCount
        if self.minCount > 1:
            word_freq = {word:freq for word, freq in word_freq.items() if freq >= self.minCount}

        # Create word2idx and idx2word dictionaries from word_list
        self.word2idx = {w: idx for (idx, w) in enumerate(word_freq.keys())}

        ############### Compute unigram
        # Initialize an array of unigram
        unigram = np.zeros(len(self.word2idx))
        # Iterate over list of words and calculate the probability for each word
        for word, frequency in word_freq.items():
            # Raise each word frequency to the power chosen
            f = frequency ** unigram_power
            # Update unigram array
            unigram[self.word2idx[word]] = f
        
        # Normalization
        self.unigram = unigram / np.sum(unigram)


    def compute_positives(self):
        """
        Generate data for training

        - Input: list of tokenized sentences
        - Output:
            -- P: 2D list where P[t] is the list of positive samples wrt a word t
        """
        # Initialize useful lists for optimization phase
        P = [] # Indexes of Positive samples of t

        V = len(self.word2idx)
        number_of_sentences = len(self.sentences)

        # I represent the sentences using word indices (rather than words themselves)
        # If the word does not exist in the dictionary (due to setting minCount > 1) then set its index to -1
        sentences_index_form = [None]*number_of_sentences
        for idx, sent in enumerate(self.sentences):
            sentences_index_form[idx] = [self.word2idx.get(w, -1) for w in sent]

        # For efficiency, pre-compute the number of positives for each word
        number_of_positives = np.zeros(V, dtype=int)
        for idx, sent_word_indices in enumerate(sentences_index_form):
            for i, word_idx in enumerate(sent_word_indices):
                if word_idx < 0:
                    continue
                first = max(0, i-self.winSize)
                last = min(i+self.winSize+1, len(sent_word_indices))
                number_of_positives[word_idx] += (last - first - 1)

        # Now I can allocate the memory for P in advance
        P = [None]*V
        for word_idx in range(V):
            P[word_idx] = np.zeros(number_of_positives[word_idx], dtype=int)

        # Now start filling P
        # For each word t, I keep track of the last position that has
        # been filled in P[t] using P_next_position[t]
        P_next_position = [0]*V

        # Iterate over each sentence in the corpus to extract the target word and context words
        for idx, sent_word_indices in enumerate(sentences_index_form):
            if (idx + 1) % 100 == 0:
                print('Processing sentence', idx + 1, '/', number_of_sentences)
            # Iterate over each word in sentence and add to its Positive list
            for i, word_idx in enumerate(sent_word_indices):
                if word_idx < 0:
                    continue
                first = max(0, i-self.winSize)
                last = min(i+self.winSize+1, len(sent_word_indices))
                number_of_words = (last - first - 1)
                position = P_next_position[word_idx]
                # print('first =', first, 'last =', last, 'position =', position)
                P[word_idx][position:position+number_of_words] = np.asarray(sent_word_indices[first:i] + sent_word_indices[i+1:last])
                P_next_position[word_idx] += number_of_words

        # If minCount > 1 then a lot of -1 in P. I will remove them
        # print('V =', V)
        if self.minCount > 1:
            for word_idx in range(V):
                P[word_idx] = np.delete(P[word_idx], np.where(P[word_idx] < 0))
                # if np.any(P[word_idx] >= V):
                #     print(P[word_idx])
                #     raise "error"
        # Remove duplicates
        for word_idx in range(V):
            P[word_idx] = np.unique(P[word_idx])
        return P


    def negative_sampling(self, t, Pt):
        """
        Draw negative samples from unigram distribution
        t: a target word
        Pt: list of words w such that (t, w) is a positive samples
        - Output: a list of indexes of the negative samples
        """
        # Remove indices of t and Pt as they cannot be negative wrt to t
        invalid_indices = Pt.tolist() + [t]
        # print('invalid_indices =', invalid_indices)
        
        # Now for each postive sample (i.e. each element of Pt),
        # I will randomly generate self.negativeRate negative samples.
        # To avoid mistakenly obtaining postive samples or t itself,
        # I set the probabilities of these indices to 0.
        probabilities = np.copy(self.unigram)
        probabilities[invalid_indices] = 0
        probabilities /= np.sum(probabilities)
        negative_samples=np.random.choice(len(self.unigram), size=self.negativeRate, p=probabilities)

       

        return negative_samples

    

    def train(self, stepsize, epochs, patience=5, save_model_path=None):

        """
        Train the model

        - Input: 
            -- path_to_model: path to store model
        - Output:
            -- Return trained embeddings and saved to path_to_model
        """

        import math
        print('Start compute_word2idx_and_unigram')
        start = time.time()
        self.compute_word2idx_and_unigram()
        print('Took', time.time() - start, '(s).')

        print('Start compute_positives')
        start = time.time()
        P = self.compute_positives()
        print('Took', time.time() - start, '(s).')

        if save_model_path is not None:
            save_model_path = os.path.expanduser(save_model_path)
            save_dir = os.path.dirname(save_model_path)
            if save_dir != '' and not os.path.exists(save_dir):
                os.makedirs(save_dir)

        V = len(self.word2idx)
        print('Total number of words =', V)

        # Initialization
        W = np.random.rand(self.nEmbed, V)


        C = np.random.rand(V, self.nEmbed)


        losses = []
        loss_best = 1e100
        epochs_no_improvement = 0


# ------------------------------------------------------------------------------------------------
        idx_1=[]
        idx_neg=[]

        for i,j in zip(self.groups["Words"],self.groups["Group"]):
          for m,n in zip(self.word2idx.keys(),self.word2idx.values()):
            if i==m:
              if(j==1):
                idx_1.append(n)
              else:
                idx_neg.append(n);
              


# ------------------------------------------------------------------------------------------------

        start = time.time()
        for epoch_idx in range(epochs):
            print('Epoch', epoch_idx + 1)
            # print('-----------------------------------------')
            # I iterate through the index of each word directly,
            # because index is the only thing we need, we dont need the word itself
            loss_epoch = 0.0 # accumulate the loss for all words



            for t in range(V): # it is the index of the target word in the vocab
                # Get the current embedding vectors
                wt = W[:, t]
                positive_samples = P[t]
                for p in positive_samples:
                    negative_samples = self.negative_sampling(t, positive_samples)

                    # print('number of positive samples =', len(positive_samples))
                    # print('number of negative samples =', len(negative_samples))

                    # context vector of the postive sample and the negative ones
                    cp = C[p, :]
                    C_neg = C[negative_samples, :]

                    # intermediate values that are helpful
                    sp = sigmoid(-np.dot(cp, wt))
                    s_neg = sigmoid(np.dot(C_neg, wt))
                    # print('S_neg.shape =', s_neg.shape)

                    # Compute partial derivatives
                    dwt = -sp*cp + np.dot(s_neg, C_neg)
                    dcp = -sp*wt
                    dC_neg = np.outer(s_neg, wt)

                    # Gradient descent update
                    wt -= (stepsize/(epoch_idx+1))*dwt
                    cp -= (stepsize/(epoch_idx+1))*dcp
                    C_neg -= stepsize*dC_neg
                    
                    loss = -np.log(sigmoid(np.dot(cp, wt))) \
                            + np.sum(-np.log(sigmoid(-np.dot(C_neg, wt))))

                        
                    loss_epoch += loss


                # -----------------------------------------------------------------------------------------------
                
                # print("##################",word2idx['aberration'])


                

                final_word=''
                for q1,w1 in zip(self.word2idx.keys(),self.word2idx.keys()):
                  if(w1==t):
                    final_word=q1
                    break
                if(final_word in self.groups['Words']):
                    # for a,b in zip(self.groups['Words'],self.groups['Group']):
                    #   if( b==self.groups[self.groups['Words']==word].iloc[0]['Group']):
                    #     same_group_idx.append(self.word2idx[a])
                    #   else:
                    #     diff_group_idx.append(self.word2idx[a])

                    if(self.groups[self.groups['Words']==final_word].iloc[0]['Group'] ==1):
                      same_group_idx=idx_1
                      diff_group_idx=idx_neg
                    else:
                      same_group_idx=idx_neg
                      diff_group_idx=idx_1

                    
                    
                    len(diff_group_idx)

                    # print(same_group_idx[:4],"-----",len(same_group_idx))
                    # print(same_group_idx[:4],'------',len(same_group_idx))
                    # diff_group
                    x1=0
                    w=W[:,t]
                    
                    for z in diff_group_idx:
                      print(z)
                      v = W[:, z]
                      x1+=(w-v)

                    # same_group
                    x2=0
                    w=W[:,t]
                    for z in same_group_idx:
                      v = W[:, z]
                      x2+=(w-v)

                    
                    dwt=2*((0.8)*x2-(0.2)*x1)
                    w -= (stepsize/(epoch_idx+1))*dwt

                    
                    # Loss
                    l1=0
                    for z in diff_group_idx:
                      v = W[:, z]
                      l1+=pow(math.dist(w,v),2)
                    
                    l2=0
                    for z in same_group_idx:
                      v = W[:, z]
                      l2+= pow(math.dist(w,v),2)


                    loss1=(0.8)*l2-(0.2)*l1
                    
                    loss_epoch += 0.5(loss1)




                    # # print('Epoch', epoch_idx, ', step', t,': loss =', loss)
                    # losses[epoch_idx, t] = loss

                if (t+1)%100 == 0:
                    print('\t step ' + str(t + 1) + '/' + str(V) + '\t loss: %.2f'%(loss) + ' accul.loss: %.2f'%(loss_epoch), end='\r')
            # print('\n', end='\r')
            # Done updating for all words
            losses.append(loss_epoch)
            print('\t Loss: %.2f'%loss_epoch, 'Elapse time:', time.time() - start, '(s)--------')
            if loss_epoch < loss_best:
                loss_best = loss_epoch
                epochs_no_improvement = 0
                # Only save the best parameters
                self.W = W
                self.C = C
                if save_model_path is not None:
                    self.save(save_model_path)
            else:
                epochs_no_improvement += 1
                print('\t No improvement for', epochs_no_improvement, 'epochs')
            
            fname = 'losses' + '_nEmbed' + str(self.nEmbed) \
                    + '_negativeRate' + str(self.negativeRate) \
                    + '_winSize' + str(self.winSize) \
                    + '_minCount' + str(self.minCount) \
                    + '_stepsize' + str(stepsize)

            np.save(fname + '.npy', losses)
            # Plot the loss and save for report
            plt.xlabel('epoch')
            plt.xlabel('loss')
            plt.plot(losses, 'r-')
            plt.savefig(fname + '.png')

            if epochs_no_improvement >= patience:
                print('EARLY STOPPING.')
                break
            

    def save(self, path):
        import pickle
        """
        save the data to file
        """
        data = {'word2idx': self.word2idx,
                'W': self.W,
                'C': self.C,
                'negativeRate': self.negativeRate,
                'nEmbed': self.nEmbed,
                'winSize': self.winSize,
                'minCount': self.minCount}

        with open(path, 'wb') as f:
            pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)



    def similarity(self, word1, word2):
        """
        Computes similiarity between the two words. unknown words are mapped to one common vector
        :param word1:
        :param word2:
        :return: a float in [0,1] indicating the similarity (the higher the more similar)
        """
        # Get the indices of the words, if not exist then take the first index
        idx1 = self.word2idx.get(word1, 0)
        idx2 = self.word2idx.get(word2, 0)

        # Get learned embedding vectors
        w1 = self.W[:, idx1]
        w2 = self.W[:, idx2]

        # Calculate cosine similarity score
        norm1 = np.linalg.norm(w1)
        norm2 = np.linalg.norm(w2)
        score = np.dot(w1, w2)/ (norm1 * norm2)

        return score

    @staticmethod
    def load(path):
        """
        Load the parameters for testing
        """
        with open(path, "rb") as f:
            data = pickle.load(f)
        sg = SkipGram(sentences=None,
                      nEmbed=data['nEmbed'],
                      negativeRate=data['negativeRate'],
                      winSize=data['winSize'],
                      minCount=data['minCount'])
        sg.W = data['W']
        sg.C = data['C']
        sg.word2idx = data['word2idx']
        return sg




# Main

In [None]:
# def main():
#     """
#     main function
#     """
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--text', help='path containing training data', required=True)
#     parser.add_argument('--model', help='path to store/read model (when training/testing)', required=True)
#     parser.add_argument('--test', help='enters test mode (for submission to Prof.)', action='store_true')
#     parser.add_argument('--validate', help='enters validation mode (compute cross-validation with ground-truth)', action='store_true')
#     parser.add_argument('--nEmbed', help='embedding dimension', type=int, default=100)
#     parser.add_argument('--negativeRate', help='ratio of negative sampling', type=int, default=5)
#     parser.add_argument('--winSize', help='context window size', type=int, default=5)
#     parser.add_argument('--minCount', help='take into account only words with more appearances than this number', type=int, default=5)
#     parser.add_argument('--stepsize', help='stepsize for gradient descent', type=float, default=0.0001)
#     parser.add_argument('--epochs', help='number of training epochs', type=int, default=100)
#     parser.add_argument('--patience', type=int, default=5, help='patience (in number of epochs) for early stopping: stop training if the loss has not been improved over the last N epochs')

#     opts = parser.parse_args()

#     if not opts.test:
#         if not opts.validate:
#             print('Read text to sentences')
#             start = time.time()
#             sentences = text2sentences(opts.text)
#             print('Took', time.time() - start, '(s). Total', len(sentences), 'sentences.')

#             sg = SkipGram(sentences, opts.nEmbed, opts.negativeRate, opts.winSize, opts.minCount)
#             print('Start training')
#             start = time.time()
#             sg.train(stepsize=opts.stepsize, epochs=opts.epochs, patience=opts.patience, save_model_path=opts.model)
#             print('Total training time:', time.time() - start, '(s)')
#             # sg.save(opts.model) # It is safer to save the model during training
#         else:
#             print('Validation mode')
#             data = pd.read_csv(opts.text, delimiter='\t')
#             pairs = zip(data['word1'], data['word2'])
#             sim_gt = data['similarity'].values

#             sg = SkipGram.load(opts.model)
#             sim_predicted = np.zeros(sim_gt.shape)
#             for idx, (a,b) in enumerate(pairs):
#                 if (idx+1)%100 == 0:
#                     print(idx + 1, '/', len(sim_predicted))
#                 sim_predicted[idx] = sg.similarity(a,b)
#             # Compute cross-correlation
#             corr = np.corrcoef(sim_gt, sim_predicted)
#             print('correlation:', corr)
#     else:
#         pairs = loadPairs(opts.text)
#         sg = SkipGram.load(opts.model)
#         for a,b,_ in pairs:
#             print(sg.similarity(a,b))


# if __name__ == '__main__':
#     main()


# Our work

In [None]:



def text2sentences_new(f):
    """
    Function to load, clean and tokenize the text

    - Input: path to text file used to train model
    - Output: a list of tokenized sentences
    """
    # Initialize list to store processed sentences
    sentences = []
    # Define punctuations to be removed
    table = str.maketrans('', '', '''!"#$%&()*+,./:;<=>?@[\]^_`{|}~''')

    # Process and append each sentence to sentences
    
    for l in f:
        # Convert to lower case and split into words
        sent = l.lower().split()
        # Remove punctuation
        sent_removed_punk = [word.translate(table) for word in sent]
        # Append sent_removed to the list of sentences
        sentences.append(sent_removed_punk)


    return sentences

In [None]:
df

Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [None]:
sen=[]
for i in df[1]:
  # for j in i.split():
    sen.append(i)

In [None]:
sen[:2]

['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .',
 'Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .']

In [None]:
print(text2sentences_new(sen))



In [None]:
len_1=[]
for i in text2sentences_new(sen):
  for j in i:
    len_1.append(j)

print(len(set(len_1)))

11326


# Run


In [None]:
azx=text2sentences_new(sen)
print(azx)



In [None]:
save("/content")

'' was not found in history, as a file, url, nor in the user namespace.


In [None]:
# sent = text2sentences(sen)
sg = SkipGram(text2sentences_new(sen))

sg.train(stepsize=0.3,epochs=100)


Start compute_word2idx_and_unigram
Took 0.02444744110107422 (s).
Start compute_positives
Processing sentence 100 / 4846
Processing sentence 200 / 4846
Processing sentence 300 / 4846
Processing sentence 400 / 4846
Processing sentence 500 / 4846
Processing sentence 600 / 4846
Processing sentence 700 / 4846
Processing sentence 800 / 4846
Processing sentence 900 / 4846
Processing sentence 1000 / 4846
Processing sentence 1100 / 4846
Processing sentence 1200 / 4846
Processing sentence 1300 / 4846
Processing sentence 1400 / 4846
Processing sentence 1500 / 4846
Processing sentence 1600 / 4846
Processing sentence 1700 / 4846
Processing sentence 1800 / 4846
Processing sentence 1900 / 4846
Processing sentence 2000 / 4846
Processing sentence 2100 / 4846
Processing sentence 2200 / 4846
Processing sentence 2300 / 4846
Processing sentence 2400 / 4846
Processing sentence 2500 / 4846
Processing sentence 2600 / 4846
Processing sentence 2700 / 4846
Processing sentence 2800 / 4846
Processing sentence 2900

In [None]:
sg.save("/content/Saved/saved_data")

In [None]:
qwer = np.load("/content/losses_nEmbed50_negativeRate5_winSize5_minCount3_stepsize0.15.npy")
qwer

array([319529.37549375, 258041.11641629, 218779.65567492, 195533.37982512,
       180137.61079492, 170760.74097182, 161807.71587879, 158256.36961674,
       152793.39729218, 150482.15673637, 148559.79164451, 145839.24175999,
       143735.84290593, 142170.65598579, 141057.44998318, 139624.16510649,
       139564.31051328, 138240.20907198, 137707.54144447, 137562.76285305,
       137751.91441283, 136796.91831526, 136263.91647496, 137389.12553184,
       137748.53877577, 136475.33328266, 136287.66891882, 136697.94145322])

In [None]:
k=0
for i in df_group_new['Words']:
  print(i)
  k+=1
  if(k>2):
    break

aberration
able
accident


In [None]:
df_group_new[df_group_new['Words']=='contrary'].iloc[0]

Words    contrary
Group          -1
Name: 50, dtype: object

In [None]:
df_group_new[df_group_new['Words']=='contrary'].iloc[0]['Group']

-1

In [None]:
# for i,j in zip(word2idx.keys(),word2idx.values()):
#   print(i,j)

In [None]:
import pickle
saved_mode_file=pickle.load(open("/content/Saved/saved_data", 'rb'))
saved_mode_file['W']

array([[-0.02040267, -0.33868476,  0.24068728, ...,  0.53641079,
         0.54776101, -0.95159103],
       [-0.31098651,  0.36066004, -2.49935856, ..., -0.1739237 ,
         0.63400896, -0.2297733 ],
       [ 0.23840693,  0.63040172, -0.11801094, ...,  0.04479725,
         0.10099731, -0.15919819],
       ...,
       [ 0.11665943, -0.03641491,  0.1387244 , ..., -0.13084466,
         0.11509145,  0.28522243],
       [ 0.07926916,  0.37198158, -0.61584713, ..., -0.2983312 ,
        -0.29020075,  0.71443053],
       [-0.2835972 , -0.03350819, -1.94155006, ..., -0.18995404,
        -1.14145389, -0.0236671 ]])

In [None]:
(saved_mode_file['W'].T).shape

(3553, 50)

In [None]:
saved_mode_file['W'].shape

(50, 3553)

In [None]:
saved_mode_file.keys()

dict_keys(['word2idx', 'W', 'C', 'negativeRate', 'nEmbed', 'winSize', 'minCount'])

In [None]:
saved_mode_file['word2idx']

{'according': 0,
 'to': 1,
 '': 2,
 'the': 3,
 'company': 4,
 'has': 5,
 'no': 6,
 'plans': 7,
 'move': 8,
 'all': 9,
 'production': 10,
 'russia': 11,
 'although': 12,
 'that': 13,
 'is': 14,
 'where': 15,
 'growing': 16,
 'technopolis': 17,
 'develop': 18,
 'in': 19,
 'stages': 20,
 'an': 21,
 'area': 22,
 'of': 23,
 'less': 24,
 'than': 25,
 '100000': 26,
 'square': 27,
 'meters': 28,
 'order': 29,
 'companies': 30,
 'working': 31,
 'computer': 32,
 'technologies': 33,
 'and': 34,
 'telecommunications': 35,
 'statement': 36,
 'said': 37,
 'international': 38,
 'electronic': 39,
 'industry': 40,
 'elcoteq': 41,
 'laid': 42,
 'off': 43,
 'employees': 44,
 'from': 45,
 'its': 46,
 'tallinn': 47,
 'facility': 48,
 'earlier': 49,
 'layoffs': 50,
 'contracted': 51,
 'office': 52,
 'workers': 53,
 'daily': 54,
 'reported': 55,
 'with': 56,
 'new': 57,
 'plant': 58,
 'would': 59,
 'increase': 60,
 'capacity': 61,
 'meet': 62,
 'expected': 63,
 'demand': 64,
 'improve': 65,
 'use': 66,
 'raw

In [None]:
df

Unnamed: 0,0,1
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test=  train_test_split(df[1],df[0],test_size=0.3,random_state=345,stratify=df[0])

In [None]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((3392,), (1454,), (3392,), (1454,))

In [None]:
sentence_matrix=[]
for i in df[1]:
  word_matrix=np.zeros(50)
  for j in i.split():
    if(j in saved_mode_file['word2idx'].keys()):
      k=saved_mode_file['word2idx'][j]
      word_matrix+=saved_mode_file['W'].T[k]
  
  sentence_matrix.append(word_matrix)

sentence_matrix=np.array(sentence_matrix)

In [None]:
(np.array(sentence_matrix)).shape

(4846, 50)

In [None]:
np.zeros(5)

array([0., 0., 0., 0., 0.])

In [None]:
y=df[0].map({'neutral':0, "negative":-1 , "positive":1})

In [None]:
X_train_matrix,X_test_matrix,y_train_matrix,y_test_matrix=  train_test_split(sentence_matrix,y,test_size=0.3,random_state=345,stratify=y)


In [None]:
import pandas as pd
pd.DataFrame(X_train_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.843070,1.444610,0.200691,0.260546,0.473482,-0.061789,-1.128974,-1.524165,0.262601,1.339871,...,-0.323181,0.242413,-0.955853,1.096937,2.072834,1.552675,0.957518,-2.716916,1.514281,-1.126462
1,0.460971,0.442955,2.332216,-1.412247,1.583451,1.870011,-3.731503,-0.498262,0.589126,0.703824,...,-2.512105,1.162584,-2.026188,-0.061393,3.112351,0.100879,0.156995,-0.270631,0.852235,-2.086012
2,0.075030,2.522282,-0.679949,-0.433289,2.636402,-0.834239,-3.399396,-2.841401,0.354969,-0.244580,...,-1.156686,0.833241,-0.028032,1.894514,2.702017,-1.018218,0.506528,-2.787130,0.386414,1.049767
3,2.005112,0.733643,-0.278851,-0.772641,1.180461,1.202111,-2.828352,-2.171650,0.561005,0.152483,...,-1.374467,2.300581,0.294233,-0.365987,0.600403,-0.390507,1.759091,-0.671893,0.647161,-0.663499
4,-1.994485,1.689477,-0.690166,0.143554,2.151990,-0.400109,-4.898136,-3.319612,0.327303,0.537824,...,-1.195159,1.528469,0.339685,1.561953,2.839891,0.403246,1.620827,-4.012030,0.684657,-0.128666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3387,-0.130024,1.002091,-0.385183,1.037418,-0.256525,-0.530444,-1.128750,0.061314,0.709359,-0.153012,...,0.191237,0.347238,-0.087892,0.091743,0.428606,0.956499,0.818426,-0.472806,0.479059,-0.699789
3388,-0.098416,0.387249,1.363639,-0.880541,0.620059,-0.690006,-3.900721,-1.900213,-1.008606,0.456896,...,-0.534771,1.172749,-0.134310,-0.314026,2.140786,-0.246355,0.600667,-2.439595,1.757805,-1.094506
3389,-0.473285,-0.040210,1.654042,0.256007,1.492365,-0.005569,-0.446907,-0.949112,0.819135,0.270231,...,-0.660858,0.338111,0.068486,0.166129,1.706912,0.287133,0.528402,-1.399061,0.196438,0.774360
3390,-0.184758,0.681731,3.027598,-0.436319,1.576298,1.547002,-6.125069,-0.436404,1.030552,0.335911,...,0.199627,2.553181,-2.020303,-0.497199,3.943545,-0.870383,-0.020742,-3.619509,3.345324,-0.920553


In [None]:
pd.DataFrame(y_train_matrix)

Unnamed: 0,0
4158,0
156,1
1593,0
3878,0
4808,-1
...,...
2840,0
1471,0
3232,0
132,1


In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()
rfc.fit(X_train_matrix,y_train_matrix)


RandomForestClassifier()

In [None]:
rfc.score(X_test_matrix,y_test_matrix)

0.6616231086657497

In [None]:
pd.DataFrame(rfc.predict(X_test_matrix))

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,1
...,...
1449,0
1450,0
1451,0
1452,0


# Word2Vec

In [None]:
from gensim.models import Word2Vec

In [None]:
w2v=Word2Vec(sentences=azx,size=50,window=5,min_count=3)
w2v

<gensim.models.word2vec.Word2Vec at 0x7fda0486d950>

In [None]:
w2v.train(azx,epochs=10,total_examples=len(azx))

(695543, 1119480)

In [None]:
vocab=w2v.wv.vocab

In [None]:
word_vec_dict={}
for word in vocab:
  word_vec_dict[word]=w2v.wv.get_vector(word)

In [None]:
pd.DataFrame(word_vec_dict).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
according,-0.005956,0.985839,-1.027003,-0.084009,-0.008241,0.256304,0.038649,0.207554,-1.004673,-0.478326,...,-0.657952,0.094820,-0.212943,0.320940,-0.329104,-0.002742,0.306103,-0.406307,0.106290,-0.204776
to,0.664694,0.239535,-0.022821,0.327582,-0.443341,-0.771120,0.241658,0.043624,0.882714,0.265189,...,-0.192648,0.210869,-0.125080,0.430193,-1.367137,-0.172332,1.239550,1.050946,-0.298174,2.027730
,1.023271,-0.609809,0.695464,-0.383544,-0.482898,0.470689,0.124387,-0.611734,-0.896939,0.542686,...,-0.767186,0.998048,0.119814,0.415691,-0.232704,0.057336,-0.795796,0.698772,0.942007,-0.288262
the,1.477080,0.291600,0.447553,0.450669,-0.554947,-0.494174,0.236054,-0.716546,1.065318,-1.616416,...,-0.029545,-0.025046,-0.736911,0.689000,-0.674085,1.552348,1.046209,-1.146416,-0.255206,0.537194
company,1.007590,0.314498,-1.327238,-0.041028,0.407674,1.033896,0.468327,0.211558,0.038526,0.447201,...,-0.735693,1.127088,-0.291700,0.082666,-0.143322,0.851487,0.015233,-0.493276,-0.314281,-0.308669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tough,0.273677,-0.111346,-0.051467,-0.086982,-0.134292,0.037731,0.043596,0.048483,-0.062171,-0.138327,...,-0.172574,-0.030088,-0.070228,-0.038523,-0.071068,0.052309,0.050443,-0.118675,0.165790,0.028350
drop,0.156333,-0.136118,0.025808,-0.071010,-0.063351,0.002030,0.004235,0.081826,0.145699,-0.049542,...,-0.249420,0.094137,-0.024887,-0.238489,-0.102134,0.129851,-0.049900,0.018690,0.142776,0.031842
decline,0.229755,-0.138199,-0.053819,-0.052024,-0.112754,0.134431,0.067309,0.053861,-0.074569,-0.118467,...,-0.106212,-0.014578,-0.039501,-0.120080,-0.090197,0.100486,-0.014690,-0.154125,0.199110,-0.043990
151,0.117092,-0.037901,-0.065522,-0.087170,-0.094453,-0.012190,0.027342,0.069829,-0.082722,-0.083545,...,-0.150728,0.053594,0.008842,0.010389,-0.041117,0.085218,0.069559,-0.053906,0.098103,0.066291


In [None]:
w2v_sentence_matrix=[]
for i in df[1]:
  w2v_word_matrix=np.zeros(50)
  for j in i.split():
    if(j in word_vec_dict.keys()):
      k=word_vec_dict[j]
      w2v_word_matrix+=k
  
  w2v_sentence_matrix.append(w2v_word_matrix)

w2v_sentence_matrix=np.array(w2v_sentence_matrix)

In [None]:
pd.DataFrame(w2v_sentence_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,20.110776,2.373242,-8.520920,0.753813,-7.040235,1.462847,1.477619,0.695464,3.572620,-7.123033,...,-5.469552,6.431845,-5.746503,5.567005,-11.431198,4.127975,15.252394,-5.833228,4.697776,9.359825
1,22.254954,-3.227739,-6.612145,-7.059734,-9.320910,1.934128,-1.226302,2.693051,0.509162,-7.602604,...,-7.871751,6.134100,-6.446882,6.110448,-10.821691,7.598785,12.499221,-1.749437,12.199616,5.733531
2,22.652216,-4.176022,-7.441591,-1.881510,-8.566513,3.474516,0.254096,1.865923,3.528879,-10.560802,...,-11.008905,8.855610,-6.265792,3.618499,-8.743490,11.605127,13.132441,-8.097489,8.322693,5.696046
3,32.439018,3.465699,-7.804332,0.859633,-6.933578,-1.933895,3.987377,-2.085441,11.228035,-19.358907,...,-6.866107,6.670974,-10.936302,6.236051,-17.747311,15.394998,20.583669,-9.969729,10.071703,9.306451
4,19.410299,-8.267907,-6.555014,-7.880454,-11.183859,1.852289,-3.838647,7.456189,13.043391,-7.017892,...,-12.556442,22.690464,-0.838455,-1.486377,-10.496302,23.578834,12.708515,-0.441513,9.111563,8.713860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4841,13.455146,-4.234414,-0.171052,-3.947183,-5.592707,2.973323,-0.582120,1.225277,1.741298,-4.624296,...,-4.563014,2.722463,-3.949795,0.838898,-4.487290,6.771473,3.515305,1.376210,7.549398,1.001520
4842,9.960198,-11.375382,6.890025,-6.632468,-5.066894,-5.710932,1.622552,12.368551,8.292749,-6.433425,...,-6.891310,12.644795,4.245826,-5.385955,0.133016,13.126072,7.980428,2.733545,3.452190,11.475687
4843,8.689046,-3.060437,-1.704791,-11.067628,-4.188651,-6.054751,2.952231,7.750974,7.764516,-5.939016,...,-4.449118,10.841503,1.341122,-8.473668,-5.502935,15.981822,-8.592893,4.887146,11.576731,11.476682
4844,28.357082,-5.583897,-1.732846,-17.683826,-8.446217,-8.074578,6.518252,12.345627,31.610121,-15.621520,...,-8.725429,23.365635,-5.335275,-18.917081,-15.120339,43.330185,-13.881932,7.613772,21.203061,24.149261


In [None]:
X_train_w2v,X_test_w2v,y_train_w2v,y_test_w2v=  train_test_split(w2v_sentence_matrix,y,test_size=0.3,random_state=345,stratify=y)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc1=RandomForestClassifier()
rfc1.fit(X_train_w2v,y_train_w2v)


RandomForestClassifier()

In [None]:
rfc.score(X_test_w2v,y_test_w2v)

0.4752407152682256