# DATA

$$ MSE = ∑ (y_{actual} - βX)^2 $$


$$\partial\beta = -\frac{2}{n}\Sigma X^T(y - \beta X)$$


$$\partial\beta = -\frac{2}{n}\Sigma X^T(y - \beta X) + \alpha \Sigma (abs(\beta))$$

$$\partial\beta = -\frac{2}{n}\Sigma X^T(y - \beta X) + \alpha * {sign} (\beta)$$


$$ = \begin{bmatrix}
-1 \\
0.01332 \\
2.334 \\
\end{bmatrix}$$


In [3]:
import pandas as pd
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec
import json
import nltk
nltk.download('punkt')
from matplotlib import pyplot as plt
import numpy as np


data_filename = "data/mapped_dataset.csv"
word_index_file = "config/word_to_index.json"
classes_file = "config/classes.txt"
max_tokens_file = "config/max_input"
data = pd.read_csv(data_filename)


# Train Word2Vec model on your data
tokenized_data = [word_tokenize(text.lower()) for text in data["X"]]
word2vec_model = Word2Vec(tokenized_data, vector_size=100, window=5, min_count=1, sg=1)


# Create a word-to-index mapping
word_to_index = {word: word2vec_model.wv.key_to_index[word] for word in word2vec_model.wv.index_to_key}

with open(word_index_file, 'w') as json_file:
    json.dump(word_to_index, json_file)

data["X_tokenized"] = data["X"].apply(lambda text: [word_to_index[word] for word in word_tokenize(text.lower())])

X_tokenized = pd.DataFrame({'X_tokenized': data['X_tokenized']})
X_tokenized['X_tokenized'] = X_tokenized['X_tokenized'].apply(lambda x: ",".join(map(str, x)))


max_tokens = max(X_tokenized['X_tokenized'].apply(lambda x: len(x.split(','))))

# Save max_tokens to a file
with open(max_tokens_file, "w") as file:
    file.write(str(max_tokens))

column_names = [f'Token_{i}' for i in range(1, max_tokens + 1)]
X_tokenized[column_names] = X_tokenized['X_tokenized'].str.split(',', expand=True)
X_tokenized = X_tokenized.drop(columns=['X_tokenized'])



X = X_tokenized.fillna(0).astype(int)
X = X
for column in  X.columns:
    X[column] = pd.to_numeric(X[column])

Y_tokenized = data["mapping"].str.split(",", expand=True)
Y_tokenized.columns = [f'Y{i+1}' for i in range(Y_tokenized.shape[1])]

# Save Y_tokenized as a separate DataFrame
Y = Y_tokenized.fillna(0).astype(int)


[nltk_data] Downloading package punkt to /home/amzker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


polynomial feature code

# Training

In [4]:
X

Unnamed: 0,Token_1,Token_2,Token_3,Token_4,Token_5,Token_6,Token_7,Token_8,Token_9,Token_10,Token_11,Token_12
0,6,9,5,0,0,0,0,0,0,0,0,0
1,43,1,25,0,0,0,0,0,0,0,0,0
2,6,5,0,0,0,0,0,0,0,0,0,0
3,22,1,25,0,0,0,0,0,0,0,0,0
4,14,32,0,0,0,0,0,0,0,0,0,0
5,37,36,35,8,9,33,0,0,0,0,0,0
6,14,17,40,0,0,0,0,0,0,0,0,0
7,46,10,49,0,0,0,0,0,0,0,0,0
8,50,51,0,0,0,0,0,0,0,0,0,0
9,21,0,54,0,0,0,0,0,0,0,0,0


In [7]:
Y

Unnamed: 0,Y1,Y2,Y3
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,2,0,0
5,2,0,0
6,2,0,0
7,2,0,0
8,2,0,0
9,3,0,0


In [8]:
class POLY2:

  def __init__(self):
      self.beta = None
      self.c = None
      self.degree = None
      self.mean = None
      self.std = None
      self.mse = []
      self.betas = []
      self.itr = []

  def polyrise(self, X, degree, interactions=False):
      newx = np.asarray(X)

      if newx.ndim == 1:
          newx = newx.reshape(-1, 1)
      X_poly = newx.copy()

      for i in range(2, degree + 1):
          X_poly = np.append(X_poly, newx ** i, axis=1)

      return X_poly

  def normalize(self, X):
      smallvalue = 1e-10

      X = (X - self.mean) / (self.std + smallvalue)
      return X

  def fit(self, X, y, lr=0.01, epochs=100, degree=1, interactions=False, alpha=0.01):
      self.degree = degree

      X_poly = self.polyrise(X, degree, interactions)
      y = np.asarray(y)
      if y.ndim == 1:
        y = y.reshape(-1, 1)

      n_samples, n_features = X_poly.shape
      n_outputs = y.shape[1]
      self.beta = np.zeros((n_features, n_outputs))
      self.c = np.zeros(n_outputs)
      self.mean = np.mean(X_poly, axis=0)
      self.std = np.std(X_poly, axis=0)
      X_norm = self.normalize(X_poly)

      for i in range(epochs):
          self.itr.append(i)
          pred = X_norm.dot(self.beta) + self.c
          error = y - pred
          self.betas.append(self.beta)

          self.mse.append(np.mean(np.absolute(error)))

          # ∂β = −2/n Σ X.T(y−βX) +  α∗sign(β)

          db = -2 / len(X_norm) * X_norm.T.dot(error)
          lasso = alpha * np.sign(self.beta)
          db = db + lasso

          dc = (-2) * np.mean(error, axis=0)
          self.beta = self.beta - (lr * db)
          self.c = self.c - (lr * dc)
      print("LAST MSE: ", np.mean(np.absolute(self.mse[-1])))
      return self

  def predict(self, X):
      if self.beta is None or self.c is None:
          raise RuntimeError("Model has not been trained. Please call model.fit() before model.predict().")
      X_poly = self.polyrise(X, self.degree, interactions=False)  # Ensure interactions are disabled
      X_norm = self.normalize(X_poly)
      return X_norm.dot(self.beta) + self.c


In [9]:
def plot_graphs(x, y_true, model):
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

  ax1.scatter(x, y_true, color="yellow", label="True Data")
  ax1.plot(x, model.predict(x), color="red", label="Model Prediction")
  ax1.set_xlabel("x")
  ax1.set_ylabel("y")
  ax1.set_title("True Data vs. our Model Prediction")
  ax1.legend()

  equation = f"y = {model.c}"
  for i, coeff in enumerate(model.beta):
      equation += f" + {coeff} * x^{i+1}"
  print(equation)

  ax2.plot(model.itr, model.mse)
  ax2.set_xlabel("Iterations")
  ax2.set_ylabel("MSE")
  ax2.set_title("Mean Squared Error our model")

  plt.tight_layout()
  plt.show()

def plot_depth_graph(model):
    betas = np.asarray(model.betas)
    mse = np.asarray(model.mse)

    num_features = betas.shape[1]
    num_iterations = betas.shape[0]

    num_rows = int(np.ceil(num_features / 2))
    fig, axes = plt.subplots(num_rows, 2, figsize=(12, 2 * num_rows))

    for i in range(num_features):
        row = i // 2
        col = i % 2
        axes[row, col].plot(betas[:, i], mse)
        axes[row, col].set_xlabel(f'Beta[{i+1}]')
        axes[row, col].set_ylabel('MSE')

    fig.suptitle('MSE vs. Beta')
    plt.tight_layout()
    plt.show()


In [10]:
modelpoly = POLY2()

modelpoly.fit(X,Y,lr=0.01,epochs=1000,degree=30,alpha=0)
#plot_depth_graph(modelpoly)

LAST MSE:  0.01640590407007313


<__main__.POLY2 at 0x7ff5ebee9280>

In [11]:
import json
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np


def load_word_to_index(filename):
    with open(filename, 'r') as json_file:
        word_to_index = json.load(json_file)
    return word_to_index

def load_max_input_size(filename):
    with open(filename, 'r') as file:
        max_input_size = int(file.read())
    return max_input_size



def make_predictions(input_text, word_to_index, model, max_input_size):
    input_tokens = [word_to_index.get(word, 0) for word in word_tokenize(input_text.lower())]

    # Pad or truncate input tokens to match the model's input size
    input_tokens = input_tokens[:max_input_size] + [0] * (max_input_size - len(input_tokens))


    predicted_output = model.predict(np.array([input_tokens]))

    return predicted_output


word_to_index = load_word_to_index('config/word_to_index.json')
max_input_size = load_max_input_size('config/max_input')

In [40]:
input_sentence = "search image from clipboard"
predictions = make_predictions(input_sentence, word_to_index, modelpoly, max_input_size)
predictions = np.round(predictions)


mappings = pd.read_csv("data/mappings.csv")


mapping_dict = mappings.set_index("mapping")["commands"].to_dict()


predicted_commands = [", ".join([mapping_dict[pred] for pred in prediction if pred in mapping_dict]) for prediction in predictions]

print(predicted_commands)

['clipboard.save_image_from_clipboard_and_copy_path, browser.search_image']
