In [1]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

import time
import os
import numpy as np

import tensorflow as tf

modelfileName = "CS286-Word2Vec.model"

dirNameInput = os.getcwd() + '/Word2VecData_Input'
dirNameOutput = os.getcwd() + '/Word2VecData_Output'

Word2Vec_model = None

X = None
Y = None
x_train = None
y_train = None
x_test = None
y_test = None

model = None


In [2]:
def createDataFiles():
    import time
    start = time.time()
    stride = 2
    messageSize = 5
    
    # For Simulated data
    fileInput = open(dirNameInput + "/Word2VecDataSimulated.txt", "w")
    fileOutput = open(dirNameOutput + "/Word2VecDataSimulated.txt", "w")
    fileNamesSimulated = ['Speed20_40.txt', 'Speed40_60.txt']
    arrSimulated = [1, 2]

    for i in range(len(fileNamesSimulated)):
        with open(fileNamesSimulated[i]) as f:
            content = f.readlines()

        # Remove the extra spaces from the sentences
        content = [x.strip() for x in content]

        cntCntr = 0
        while cntCntr < len(content):
            iteration = 0
            currMessagePackets = ""

            while iteration < messageSize and cntCntr < len(content):
                # Remove the noise from the files
                DATA = content[cntCntr][15:38]

                # this is the whole message
                parts = DATA.split()

                tempo = ""
                for j in range(8):
                    if j >= len(parts) or parts[j] == "  ":
                        tempo += "00"
                    else:
                        tempo += parts[j]

                currMessagePackets += str(tempo) + " "
                cntCntr += 1
                iteration += 1

            if cntCntr < len(content):
                cntCntr -= stride
            else:
                break
            fileInput.write(currMessagePackets + "\n")
            fileOutput.write(str(arrSimulated[i]) + "\n")

    fileInput.close()
    fileOutput.close()
    # To only work with actual data end commenting till here
    end = time.time()

    print("Awesome !!! File processing done !!!")
    print("Total Time for file processing ---> ", end - start)


In [3]:
class IteratingClass:
    def __init__(self, dirName):
        self.dirName = dirName

    def __iter__(self):
        for fName in os.listdir(self.dirName):
            for line in open(os.path.join(self.dirName, fName)):
                yield line.split()

def createWord2VecModels():
    start = time.time()
    dataDirec = IteratingClass(dirNameInput)
    model = Word2Vec(dataDirec, size=200, window=5, min_count=1, workers=8)
    model.save(modelfileName)
    end = time.time()
    print("\n\nTraining Successful for Word2Vec Model!!!")
    print("Total Time for Word2Vec model -> ", (end - start))


In [4]:
def train():
    createDataFiles()
    createWord2VecModels()

In [5]:
train()

Awesome !!! File processing done !!!
('Total Time for file processing ---> ', 0.0526728630065918)


Training Successful for Word2Vec Model!!!
('Total Time for Word2Vec model -> ', 0.05155682563781738)


In [6]:

# Convert the data into Numpy Arrays
def getNumpyArray():
    start = time.time()
    global X
    global Y
    global dirNameInput
    Word2Vec_model = Word2Vec.load(modelfileName)
    X = None
    Y = None
    print("Processing started ")
    result = []
    for fName in os.listdir(dirNameInput):
        for line in open(os.path.join(dirNameInput, fName)):
            parts = line.split()
            currV = Word2Vec_model.wv[parts[0]]
            currV.setflags(write=1)
            for i in range(1, len(parts)):
                currV += Word2Vec_model.wv[parts[i]]
            currV /= len(parts)
            result.append(np.array(currV))
    X = np.array(result)
               
    print("Processing started for output ")
    result = []
    for fName in os.listdir(dirNameOutput):
        for line in open(os.path.join(dirNameOutput, fName)):
            currV = int(line)
            result.append(np.array(currV))

    Y = np.array(result)        
    
    print(X.shape)    
    print(Y.shape)    
    end = time.time()
    print("total time -> ", end -start)





In [7]:
def createDNNModel():
    global model     
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(256, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(256, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dense(3, activation=tf.nn.softmax))

    model.compile(optimizer='adam',
                 loss='sparse_categorical_crossentropy',
                 metrics=['accuracy'])
    

    

    

In [8]:
getNumpyArray()
# Commnet this line for your part
#createDNNModel()
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y,test_size=0.30, random_state=1)


Processing started 
Processing started for output 
(1197, 200)
(1197,)
('total time -> ', 0.04828810691833496)


In [9]:
from sklearn.neighbors import KNeighborsClassifier

In [13]:
x_train, x_test, y_train, y_test = train_test_split(X, Y,test_size=0.30, random_state=1)
#x_train.reshape(200,1)
#LSTMModel(10)

# KNN 

In [14]:
# Actuals and Simulated

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y,test_size=0.30, random_state=1)

In [15]:
print ("\n")
print ("-----------------------------------------------------------")
print ("\n")
print ("X: ",X)
print ("\n")
print ("-----------------------------------------------------------")
print ("\n")
print ("Y: ",Y)
print ("\n")
print ("-----------------------------------------------------------")
print ("\n")



-----------------------------------------------------------


('X: ', array([[-0.00523348, -0.00283842, -0.01179395, ..., -0.00554015,
         0.00026569,  0.00518289],
       [-0.00548416, -0.00319688, -0.01428992, ..., -0.00554803,
         0.00027015,  0.00475326],
       [-0.00721464, -0.00348424, -0.01524161, ..., -0.0056422 ,
        -0.00011218,  0.00468908],
       ...,
       [-0.00660806, -0.00284081, -0.01271116, ..., -0.00500994,
        -0.00050537,  0.00394728],
       [-0.00662199, -0.00284247, -0.01278575, ..., -0.00501054,
        -0.00050947,  0.00396174],
       [-0.00662941, -0.0028517 , -0.01279869, ..., -0.00501879,
        -0.00049444,  0.00399296]], dtype=float32))


-----------------------------------------------------------


('Y: ', array([1, 1, 1, ..., 2, 2, 2]))


-----------------------------------------------------------




# KNN N = 1

In [16]:
knn = KNeighborsClassifier(n_neighbors=1)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [17]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.9972222222222222


# KNN N = 3

In [18]:
knn = KNeighborsClassifier(n_neighbors=3)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [19]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.9944444444444445


In [20]:
score, acc = model.evaluate(x_test, y_test)
print('Test score:', score)
print('Test accuracy:', acc)

AttributeError: 'NoneType' object has no attribute 'evaluate'

In [None]:
def evaluateWordToVec():
    Word2Vec_model = Word2Vec.load(modelfileName)
    print("Loaded successfully")
    #print(model.wv['FEB0FF999E500000'])
    print(len(Word2Vec_model.wv['FEB0FF999E500000']))

    #print(model.wv['000200000000002A'])
    print(len(Word2Vec_model.wv['000200000000002A']))
    
    #print(model.wv['FEB0FF999E500000'] + model.wv['000200000000002A'])
    
    v = Word2Vec_model.wv['FEB0FF999E500000'] + Word2Vec_model.wv['000200000000002A']
    print(type(v))
    #print(v/2)
evaluateWordToVec()