In [1]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

import time
import os
import numpy as np

import tensorflow as tf

modelfileName = "CS286-Word2Vec.model"
dirNameInput = os.getcwd() + '/Word2VecData_Input'
dirNameOutput = os.getcwd() + '/Word2VecData_Output'

from sklearn.model_selection import train_test_split

Word2Vec_model = None

#BOTH ACTUAL AND SIMULATED
X = None
Y = None
#ONLY ACTUAL
X_Actual = None
Y_Actual = None
#ONLY SIMULATED
X_Simulated = None
Y_Simulated = None
x_train = None
y_train = None
x_test = None
y_test = None

model = None
result = []


In [2]:
def createDataFiles():
    import time
    start = time.time()
    stride = 2
    
    fileNamesActual = ['autopark.dat', 'drive.dat', 'idle.dat']
    
    # To only work with simulated data start commenting from here
    fileInput = open("Word2VecData_Input\Word2VecDataActual.txt", "w")
    fileOutput = open("Word2VecData_Output\Word2VecDataActual.txt", "w")
    

    for i in range(len(fileNamesActual)):
        with open(fileNamesActual[i]) as f:
            content = f.readlines()

        # Remove the extra spaces from the sentences
        content = [x.strip() for x in content]
        value = int(0.8 * len(content))
        
        value = len(content)

        cntCntr = 0
        while cntCntr < value:
            iteration = 0
            currMessagePackets = ""

            while iteration < 5 and cntCntr < value:
                parts = content[cntCntr].split(",")

                # this is the whole message
                DATA = parts[3].split(":")[1].replace(" ", "")

                currMessagePackets += str(DATA) + " "

                cntCntr += 1
                iteration += 1

            if cntCntr < value:
                cntCntr -= stride
            else:
                break
            fileInput.write(currMessagePackets + "\n")
            fileOutput.write(str(i) + "\n")
    fileInput.close()
    fileOutput.close()
    # To only work with Simulated data end commenting till here

    # To only work with actual data start commenting from here
    # For Simulated data
    fileInput = open("Word2VecData_Input\Word2VecDataSimulated.txt", "w")
    fileOutput = open("Word2VecData_Output\Word2VecDataSimulated.txt", "w")
    fileNamesSimulated = ['New_DRIVE_Data.txt', 'New_IDLE_Data.txt', 'DRIVE.rtf', 'IDLE.rtf']
    arrSimulated = [1, 2, 1, 2]

    for i in range(len(fileNamesSimulated)):
        with open(fileNamesSimulated[i]) as f:
            content = f.readlines()

        # Remove the extra spaces from the sentences
        content = [x.strip() for x in content]

        cntCntr = 0
        while cntCntr < len(content):
            iteration = 0
            currMessagePackets = ""

            while iteration < 5 and cntCntr < len(content):
                # Remove the noise from the files
                DATA = content[cntCntr][15:38]

                # this is the whole message
                parts = DATA.split()

                tempo = ""
                for j in range(8):
                    if j >= len(parts) or parts[j] == "  ":
                        tempo += "00"
                    else:
                        tempo += parts[j]

                currMessagePackets += str(tempo) + " "
                cntCntr += 1
                iteration += 1

            if cntCntr < len(content):
                cntCntr -= stride
            else:
                break
            fileInput.write(currMessagePackets + "\n")
            fileOutput.write(str(arrSimulated[i]) + "\n")

    fileInput.close()
    # To only work with actual data end commenting till here
    end = time.time()

    print("Awesome !!! File processing done !!!")
    print("Total Time for file processing ---> ", end - start)


In [3]:
class IteratingClass:
    def __init__(self, dirName):
        self.dirName = dirName

    def __iter__(self):
        for fName in os.listdir(self.dirName):
            for line in open(os.path.join(self.dirName, fName)):
                yield line.split()

def createWord2VecModels():
    start = time.time()
    dataDirec = IteratingClass(dirNameInput)
    model = Word2Vec(dataDirec, size=200, window=5, min_count=1, workers=8)
    model.save(modelfileName)
    end = time.time()
    print("\n\nTraining Successful for Word2Vec Model!!!")
    print("Total Time for Word2Vec model -> ", (end - start))


In [4]:
def train():
    createDataFiles()
    createWord2VecModels()

In [5]:
train()

Awesome !!! File processing done !!!
('Total Time for file processing ---> ', 0.9378280639648438)


Training Successful for Word2Vec Model!!!
('Total Time for Word2Vec model -> ', 1.2917981147766113)


In [6]:

# Convert the data into Numpy Arrays
start = time.time()
Word2Vec_model = Word2Vec.load(modelfileName)
x_train = None
def getNumpyArray():
    global X
    global Y
    global X_Actual
    global Y_Actual
    global X_Simulated
    global Y_Simulated
    global dirNameInput
    X = None
    Y = None
    result = []
    for fName in os.listdir(dirNameInput):
        for line in open(os.path.join(dirNameInput, fName)):
            parts = line.split()
            currV = Word2Vec_model.wv[parts[0]]
            currV.setflags(write=1)
            for i in range(1, len(parts)):
                currV += Word2Vec_model.wv[parts[i]]
            currV /= len(parts)
            result.append(np.array(currV))
        if "Actual" in fName: 
            X_Actual = np.array(result)
            result = []
        else:
            X_Simulated = np.array(result)
            result = []
    
    X = np.concatenate((X_Actual, X_Simulated), axis=0)
                        
    result = []
    for fName in os.listdir(dirNameOutput):
        for line in open(os.path.join(dirNameOutput, fName)):
            currV = int(line)
            result.append([currV])
        if "Actual" in fName:
            Y_Actual = np.array(result)
            result = []
        else:
            Y_Simulated = np.array(result)
            result = []
                
    Y = np.concatenate((Y_Actual, Y_Simulated), axis=0)
    
    print(X.shape)
    print(Y.shape)
    end = time.time()
    print("Total time for array conversion -> ", end -start)

In [8]:

getNumpyArray()



(26777, 200)
(26777, 1)
('Total time for array conversion -> ', 1.4609870910644531)


# KNN Actuals+Simulated

In [9]:
from sklearn.neighbors import KNeighborsClassifier

In [11]:
# Actuals and Simulated

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y,test_size=0.30, random_state=1)

In [12]:
print ("\n")
print ("-----------------------------------------------------------")
print ("\n")
print ("X: ",X)
print ("\n")
print ("-----------------------------------------------------------")
print ("\n")
print ("Y: ",Y)
print ("\n")
print ("-----------------------------------------------------------")
print ("\n")



-----------------------------------------------------------


('X: ', array([[ 0.1075408 ,  0.07346245, -0.06177807, ...,  0.07590635,
        -0.10396608, -0.1726506 ],
       [ 0.09373774,  0.05824638, -0.04724154, ...,  0.06501274,
        -0.07302765, -0.13795492],
       [ 0.15305845,  0.08584963, -0.08658248, ...,  0.10051292,
        -0.13710465, -0.23729539],
       ...,
       [ 0.27077964,  0.2344991 , -0.10557387, ...,  0.14585146,
        -0.4501658 , -0.46895775],
       [ 0.26700965,  0.23123837, -0.10410259, ...,  0.14384682,
        -0.44362217, -0.46236187],
       [ 0.27014086,  0.23393063, -0.10531533, ...,  0.14549246,
        -0.4490799 , -0.46783525]], dtype=float32))


-----------------------------------------------------------


('Y: ', array([[0],
       [0],
       [0],
       ...,
       [2],
       [2],
       [2]]))


-----------------------------------------------------------




# KNN N = 1


In [13]:
knn = KNeighborsClassifier(n_neighbors=1)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [14]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.9680109534478466


# KNN N = 3

In [15]:
knn = KNeighborsClassifier(n_neighbors=3)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [16]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.9678864824495892


# KNN N = 5

In [17]:
knn = KNeighborsClassifier(n_neighbors=5)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [18]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.964152352501867


# KNN N = 7

In [19]:
knn = KNeighborsClassifier(n_neighbors=7)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [20]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.962907642519293


# KNN N = 10

In [21]:
knn = KNeighborsClassifier(n_neighbors=10)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [22]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.962285287528006


# KNN Actuals

In [23]:
# Actuals

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_Actual, Y_Actual,test_size=0.30, random_state=1)

# KNN N = 1

In [24]:
knn = KNeighborsClassifier(n_neighbors=1)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [25]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.8539057841383423


# KNN N = 3

In [26]:
knn = KNeighborsClassifier(n_neighbors=3)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [27]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.8556946929039952


# KNN N = 5

In [28]:
knn = KNeighborsClassifier(n_neighbors=5)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [29]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.8449612403100775


# KNN N = 7

In [30]:
knn = KNeighborsClassifier(n_neighbors=7)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [31]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.8389982110912343


# KNN N = 10

In [32]:
knn = KNeighborsClassifier(n_neighbors=10)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [33]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.8294573643410853


# KNN Simulated

In [34]:
# Simulated

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_Simulated, Y_Simulated,test_size=0.30, random_state=1)

# KNN N = 1

In [35]:
knn = KNeighborsClassifier(n_neighbors=1)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [36]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.9949661790152587


# KNN N = 3

In [37]:
knn = KNeighborsClassifier(n_neighbors=3)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [38]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.9957527135441245


# KNN N = 5

In [39]:
knn = KNeighborsClassifier(n_neighbors=5)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [40]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.995123485921032


# KNN N = 7

In [41]:
knn = KNeighborsClassifier(n_neighbors=7)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [42]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.9946515652037125


# KNN N = 10

In [43]:
knn = KNeighborsClassifier(n_neighbors=10)
print knn
knn.fit(x_train,np.ravel(y_train,order='C'))  #train the model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [44]:
y_pred=knn.predict(x_test)
from sklearn import metrics
print metrics.accuracy_score(y_test,y_pred)

0.9935504168633003
