In [3]:
import shutil
import subprocess
from pathlib import Path

import fasttext
import kagglehub
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Datasets download

Requires a [Kaggle API](https://www.kaggle.com/docs/api#authentication) token stored at one of these paths:
```
~/.kaggle/kaggle.json

~/.config/kaggle/kaggle.json
```

Used datasets:
- [Main OSes terminal commands](https://www.kaggle.com/datasets/vaibhavdlights/linuxcmdmacos-commands)
- [Wikipedia sentences](https://www.kaggle.com/datasets/mikeortman/wikipedia-sentences)
- [Wikipedia plaintext 2023](https://www.kaggle.com/datasets/jjinho/wikipedia-20230701)

In [None]:
# Data storage
# Path.mkdir(exist_ok=True) is portable and safe to re-run, unlike shelling out to `mkdir`.
dataDir = Path("data")
dataDir.mkdir(parents=True, exist_ok=True)


def _fetchDataset(handle, destination):
    """Download a Kaggle dataset and move it to `destination`.

    Args:
        handle: Kaggle dataset handle in "<owner>/<dataset>" form.
        destination: Directory the downloaded dataset folder is moved to.

    Returns:
        The destination path as a string.

    If `destination` already exists the download is skipped, so re-running the
    cell neither re-downloads the data nor nests a second copy inside the
    existing directory.
    """
    destination = str(destination)
    if Path(destination).exists():
        return destination
    downloaded = kagglehub.dataset_download(handle)
    shutil.move(downloaded, destination)
    return destination


# CLI commands dataset
# NOTE: pathCommands / pathWiki hold the final location under ./data
# (the kagglehub cache path no longer exists once the folder has been moved).
pathCommands = _fetchDataset("vaibhavdlights/linuxcmdmacos-commands", dataDir / "commands")

# Wikipedia sentences dataset
pathWiki = _fetchDataset("mikeortman/wikipedia-sentences", dataDir / "wikisen")

/home/jakubpiasek/.cache/kagglehub/datasets/vaibhavdlights/linuxcmdmacos-commands/versions/1
/home/jakubpiasek/.cache/kagglehub/datasets/mikeortman/wikipedia-sentences/versions/3


# Data manipulation

Stripping unwanted characters (currently only the bullet `•`) and formatting the text so that it is plain text only, ready for unsupervised learning

General dataset and training/test splits preparation

In [27]:
# Joining data frames
# Load the per-platform command tables and stack them into a single frame.
linuxCommandsDf = pd.read_csv('data/commands/linux_commands.csv')
cmdCommandsDf = pd.read_csv('data/commands/cmd_commands.csv')
macOsCommandsDf = pd.read_csv('data/commands/macos_commands.csv')
vbscriptCommandsDf = pd.read_csv('data/commands/vbscript_commands.csv')

commandsDf = pd.concat([linuxCommandsDf, cmdCommandsDf, macOsCommandsDf, vbscriptCommandsDf], ignore_index=True)


# Data cleaning

# Removing duplicate columns (indexes)
commandsDf = commandsDf.drop(columns=['Unnamed: 0'])

# Removing unwanted parts of strings.
# regex=False makes the literal match explicit, independent of the pandas version default.
commandsDf['description'] = (
    commandsDf['description']
    .str.replace(' •', '', regex=False)
    .str.replace('•', '', regex=False)
)

# Saving the data frame to plain text file (explicit UTF-8: descriptions may contain non-ASCII text)
commandsDf.to_csv('data/commandsDf.txt', sep='\t', index=False, header=False, encoding='utf-8')


# Joining text files
# Stream each part into the combined corpus instead of loading both files into
# memory at once; the Wikipedia file alone is very large.
with open('data/data.txt', 'w', encoding='utf-8') as corpusFile:
    for partPath in ('data/commandsDf.txt', 'data/wikisen/wikisent2.txt'):
        with open(partPath, encoding='utf-8') as partFile:
            shutil.copyfileobj(partFile, corpusFile)

# Training word vectors

Word representation

Data split - 80% training 20% test

CBOW vs skip-gram

In [None]:
# Splitting the data

with open('data/data.txt', 'r', encoding='utf-8') as file:
    data = file.readlines()

# The final line of the corpus may lack a trailing newline. After shuffling it can
# land in the middle of an output file, where writelines() would glue it to the
# following line, so terminate it explicitly.
if data and not data[-1].endswith('\n'):
    data[-1] += '\n'

# 80% training / 20% test; fixed seed keeps the split reproducible
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

# Saving the split data to new files
with open('data/train_data.txt', 'w', encoding='utf-8') as train_file:
    train_file.writelines(train_data)

with open('data/test_data.txt', 'w', encoding='utf-8') as test_file:
    test_file.writelines(test_data)

# TODO keep commandsDf.txt in the train data

Read 127M words
Number of words:  486699
Number of labels: 0
Progress: 100.0% words/sec/thread:   52007 lr:  0.000000 avg.loss:  0.485565 ETA:   0h 0m 0s53s  70494 lr:  0.048440 avg.loss:  1.697119 ETA:   0h20m51s  70493 lr:  0.048086 avg.loss:  1.669816 ETA:   0h20m42s  6.6% words/sec/thread:   69553 lr:  0.046714 avg.loss:  1.605509 ETA:   0h20m23s 1.518650 ETA:   0h19m50s 0.045232 avg.loss:  1.483721 ETA:   0h19m42s 10.2% words/sec/thread:   69605 lr:  0.044902 avg.loss:  1.464940 ETA:   0h19m35s lr:  0.043324 avg.loss:  1.413436 ETA:   0h18m55s 13.6% words/sec/thread:   69489 lr:  0.043212 avg.loss:  1.411198 ETA:   0h18m52s 1.410845 ETA:   0h18m52s 13.6% words/sec/thread:   69462 lr:  0.043187 avg.loss:  1.410705 ETA:   0h18m52s 14.2% words/sec/thread:   69289 lr:  0.042906 avg.loss:  1.405271 ETA:   0h18m48s 18.9% words/sec/thread:   67672 lr:  0.040567 avg.loss:  1.279591 ETA:   0h18m12ss 0.040246 avg.loss:  1.247219 ETA:   0h18m 7s 1.210666 ETA:   0h18m 2s 0.039320 avg.loss:  1

In [16]:
# Create the output directory before training: it is portable (no shell `mkdir`),
# safe to re-run, and a directory problem surfaces before the long training runs
# rather than after them.
Path("result").mkdir(parents=True, exist_ok=True)

# Train one model per architecture on the same training split.
# NOTE: fastText's train_unsupervised uses skip-gram when no model is given,
# so a separate default-parameter model would duplicate modelSkipgram.
modelCbow = fasttext.train_unsupervised('data/train_data.txt', model='cbow')
modelSkipgram = fasttext.train_unsupervised('data/train_data.txt', model='skipgram')

# Saving different model versions to binary file
modelCbow.save_model("result/modelCbow.bin")
modelSkipgram.save_model("result/modelSkipgram.bin")

Read 127M words
Number of words:  486699
Number of labels: 0
Progress: 100.0% words/sec/thread:   99146 lr:  0.000000 avg.loss:  1.046694 ETA:   0h 0m 0s lr:  0.047926 avg.loss:  1.859780 ETA:   0h 9m40s 9m27s words/sec/thread:  149067 lr:  0.046768 avg.loss:  1.750912 ETA:   0h 9m31s32s 16.2% words/sec/thread:  145456 lr:  0.041901 avg.loss:  1.568963 ETA:   0h 8m44s avg.loss:  1.555797 ETA:   0h 8m44s% words/sec/thread:  134246 lr:  0.038837 avg.loss:  1.503370 ETA:   0h 8m47s 0.038825 avg.loss:  1.502928 ETA:   0h 8m47s avg.loss:  1.473101 ETA:   0h 8m46s% words/sec/thread:  123724 lr:  0.034045 avg.loss:  1.394181 ETA:   0h 8m21s words/sec/thread:  122907 lr:  0.033567 avg.loss:  1.385990 ETA:   0h 8m17s 122863 lr:  0.033545 avg.loss:  1.385620 ETA:   0h 8m17s 121384 lr:  0.032575 avg.loss:  1.370889 ETA:   0h 8m 8s 41.5% words/sec/thread:  117372 lr:  0.029262 avg.loss:  1.327538 ETA:   0h 7m34s 0.029032 avg.loss:  1.321859 ETA:   0h 7m32s22s ETA:   0h 7m19s% words/sec/thread:  11

In [4]:
# Load the skip-gram model from its saved binary so the cells below can use it
# without retraining.
modelSkipgram = fasttext.load_model("result/modelSkipgram.bin")

In [5]:
# Query the nearest neighbours of a misspelled word ("enviromnent");
# presumably a check of how the model copes with typos - confirm intent.
modelSkipgram.get_nearest_neighbors("enviromnent")

[(0.7255551218986511, 'environs'),
 (0.7216176390647888, 'environment'),
 (0.7126461863517761, 'encroaches'),
 (0.7096948027610779, 'human-environment'),
 (0.6984639167785645, 'environment;'),
 (0.6963351368904114, 'environs,'),
 (0.6915313601493835, 'environment,"'),
 (0.6869011521339417, 'encroach'),
 (0.6825509071350098, 'environment)'),
 (0.6798412203788757, 'encroachment')]

In [6]:
# Display the vector the skip-gram model returns for the word "bot".
modelSkipgram.get_word_vector("bot")

array([ 0.65084994,  0.7445628 , -0.11910941,  0.20016773, -0.10904255,
       -0.02735315, -0.40722805,  0.7602869 ,  0.04186199,  0.5296395 ,
        0.11413755,  0.21134733, -0.16696438, -0.0735234 ,  0.31618723,
        0.18560089, -0.31940514,  0.3650617 ,  0.04022288, -0.24242678,
        0.66452146,  0.24958292,  0.03491217, -0.3655611 , -0.11557037,
       -0.53072083,  0.4137996 , -0.6410987 ,  0.24087656, -0.2723353 ,
        0.4926008 ,  0.48552412, -0.47121677, -0.37111726, -0.21230687,
       -0.13511966,  0.17259976, -0.37861803,  0.14162818, -0.4307526 ,
        0.7686412 ,  0.1490578 ,  0.8975334 , -0.5342256 ,  0.48184624,
       -0.03560356,  0.7507417 , -0.3205803 ,  0.24960923,  0.36136618,
       -0.60626477, -0.5483774 , -0.55231214,  0.7599971 ,  0.2439419 ,
       -0.02152452, -0.58692247, -0.4081239 , -0.6492689 , -0.82244086,
        0.2619828 , -0.4954007 , -0.13332376,  0.30128834,  0.5696984 ,
       -0.2193577 , -0.9030422 , -0.33348453, -0.67988   , -0.28

In [7]:
# model functionality prototype
x = "mkidr abcd"
y = x.split()

for char in y:
    print(char)
    print(modelSkipgram.get_nearest_neighbors(char)[0])
    print("\n")

mkidr
(0.6492562890052795, 'minjung')


abcd
(0.6941401362419128, 'rea,')




In [13]:
# Convert fasttext to tensorflow:
# copy the skip-gram word vectors into a frozen Keras Embedding layer.

# Get the vocabulary and embedding dimension
words = modelSkipgram.get_words()
embedding_dim = modelSkipgram.get_dimension()

# Create a dictionary mapping words to their indices (row numbers in the matrix)
word_index = {word: idx for idx, word in enumerate(words)}

# Initialize the embedding matrix.
# float32 matches the default Keras weight dtype; the NumPy default (float64)
# would only double the memory of this staging array.
embedding_matrix = np.zeros((len(words), embedding_dim), dtype=np.float32)

# Populate the embedding matrix, one row per vocabulary word
for word, idx in word_index.items():
    embedding_matrix[idx] = modelSkipgram.get_word_vector(word)

# Initialize the TensorFlow embedding layer.
# The weights are loaded with an explicit build() + set_weights() rather than the
# legacy `weights=` constructor argument, which is not accepted by every Keras
# release; this also makes the moment the vectors are installed explicit.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(words),
    output_dim=embedding_dim,
    trainable=False,
)
embedding_layer.build((1,))
embedding_layer.set_weights([embedding_matrix])




def get_word_embedding(word):
    """Return the embedding vector stored for `word` in the Keras layer.

    Args:
        word: Vocabulary token to look up in `word_index`.

    Returns:
        The result of calling `.numpy()` on the first row produced by
        `embedding_layer` for the word's index.

    Raises:
        ValueError: If `word` is not a key of `word_index`.
    """
    if word not in word_index:
        raise ValueError(f"Word '{word}' not in vocabulary.")

    # The layer is called on a batch, so wrap the single index in a one-element
    # tensor and unwrap the first row of the result.
    token_ids = tf.constant([word_index[word]])
    looked_up = embedding_layer(token_ids)
    return looked_up[0].numpy()

# Example usage: look up "bot" through the Keras embedding layer and print the
# resulting vector.
word_vector = get_word_embedding("bot")
print(word_vector)

[ 0.65084994  0.7445628  -0.11910941  0.20016773 -0.10904255 -0.02735315
 -0.40722805  0.7602869   0.04186199  0.5296395   0.11413755  0.21134733
 -0.16696438 -0.0735234   0.31618723  0.18560089 -0.31940514  0.3650617
  0.04022288 -0.24242678  0.66452146  0.24958292  0.03491217 -0.3655611
 -0.11557037 -0.53072083  0.4137996  -0.6410987   0.24087656 -0.2723353
  0.4926008   0.48552412 -0.47121677 -0.37111726 -0.21230687 -0.13511966
  0.17259976 -0.37861803  0.14162818 -0.4307526   0.7686412   0.1490578
  0.8975334  -0.5342256   0.48184624 -0.03560356  0.7507417  -0.3205803
  0.24960923  0.36136618 -0.60626477 -0.5483774  -0.55231214  0.7599971
  0.2439419  -0.02152452 -0.58692247 -0.4081239  -0.6492689  -0.82244086
  0.2619828  -0.4954007  -0.13332376  0.30128834  0.5696984  -0.2193577
 -0.9030422  -0.33348453 -0.67988    -0.2830395  -0.50229293  0.49356088
  1.0049576   0.3194215  -0.33465976 -0.5167149  -0.06780219  0.18769087
 -0.5868039   0.33246017  0.0095676  -0.52469957 -1.003292