# DATA

In [20]:
import pandas as pd
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

data_filename = "data/dataset.csv"
word_index_file = "config/word_to_index.json"
classes_file = "config/classes.json"
max_token_file = "config/max_token.txt"

data = pd.read_csv(data_filename)

words = set()
for text in data["X"]:
    words.update(word_tokenize(text.lower()))


word_to_index = {word: index + 1 for index, word in enumerate(words)}  # Start index from 1
with open(word_index_file, 'w') as json_file:
    json.dump(word_to_index, json_file)

data["X_tokenized"] = data["X"].apply(lambda text: [word_to_index[word] for word in word_tokenize(text.lower())])

X_tokenized = pd.DataFrame({'X_tokenized': data['X_tokenized']})
X_tokenized['X_tokenized'] = X_tokenized['X_tokenized'].apply(lambda x: ",".join(map(str, x)))
max_tokens = max(X_tokenized['X_tokenized'].apply(lambda x: len(x.split(','))))


with open(max_token_file, 'w') as max_token_file:
    max_token_file.write(str(max_tokens))

column_names = [f'Token_{i}' for i in range(1, max_tokens + 1)]
X_tokenized[column_names] = X_tokenized['X_tokenized'].str.split(',', expand=True)
X_tokenized = X_tokenized.drop(columns=['X_tokenized'])

X = X_tokenized.fillna(0)
for column in  X.columns:
    X[column] = pd.to_numeric(X[column])

Y = data.drop(columns=["X","Y","X_tokenized"])


classes = list(Y.columns)
class_mapping = {str(index): label for index, label in enumerate(classes)}
with open(classes_file, 'w') as json_file:
    json.dump(class_mapping, json_file)


# Training

In [9]:
class POLY2:

  def __init__(self):
      self.beta = None
      self.c = None
      self.degree = None
      self.mean = None
      self.std = None
      self.mse = []
      self.betas = []
      self.itr = []


  def polyrise(self, X, degree, interactions=True):
      newx = np.asarray(X)

      if newx.ndim == 1:
          newx = newx.reshape(-1, 1)
      X_poly = newx.copy()

      for i in range(2, degree + 1):
          X_poly = np.append(X_poly, newx ** i, axis=1)

      if interactions:
          for i in range(newx.shape[1]):
              for j in range(i + 1, newx.shape[1]):
                  interx = newx[:, i] * newx[:, j]
                  X_poly = np.append(X_poly, interx.reshape(-1, 1), axis=1)
      return X_poly

  def normalize(self, X):
      smallvalue = 1e-10

      X = (X - self.mean) / (self.std + smallvalue)
      return X

  def fit(self, X, y, lr=0.01, epochs=100, degree=1,interactions=True,alpha=0.01):
      self.degree = degree

      X_poly = self.polyrise(X, degree, interactions)
      y = np.asarray(y)
      if y.ndim == 1:
        y = y.reshape(-1,1)

      n_samples, n_features = X_poly.shape
      n_outputs = y.shape[1]
      self.beta = np.zeros((n_features, n_outputs))
      self.c = np.zeros(n_outputs)
      self.mean = np.mean(X_poly, axis=0)
      self.std = np.std(X_poly, axis=0)
      X_norm = self.normalize(X_poly)

      for i in range(epochs):
          self.itr.append(i)
          pred = X_norm.dot(self.beta) + self.c
          error = y - pred
          self.betas.append(self.beta)
          self.mse.append(np.mean(np.absolute(error)))

          #∂β = −2/n Σ X.T(y−βX) +  α∗sign(β)

          db = -2 / len(X_norm) * X_norm.T.dot(error)
          lasso = alpha * np.sign(self.beta)
          db = db + lasso

          dc = (-2) * np.mean(error, axis=0)
          self.beta = self.beta - (lr * db)
          self.c = self.c - (lr * dc)
      return self

  def predict(self, X):
      if self.beta is None or self.c is None:
          raise RuntimeError("Model has not been trained. Please call model.fit() before model.predict().")
      X_poly = self.polyrise(X, self.degree)
      X_norm = self.normalize(X_poly)
      return X_norm.dot(self.beta) + self.c

In [10]:
def plot_graphs(x, y_true, model):
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

  ax1.scatter(x, y_true, color="yellow", label="True Data")
  ax1.plot(x, model.predict(x), color="red", label="Model Prediction")
  ax1.set_xlabel("x")
  ax1.set_ylabel("y")
  ax1.set_title("True Data vs. our Model Prediction")
  ax1.legend()

  equation = f"y = {model.c}"
  for i, coeff in enumerate(model.beta):
      equation += f" + {coeff} * x^{i+1}"
  print(equation)

  ax2.plot(model.itr, model.mse)
  ax2.set_xlabel("Iterations")
  ax2.set_ylabel("MSE")
  ax2.set_title("Mean Squared Error our model")

  plt.tight_layout()
  plt.show()

def plot_depth_graph(model):
    betas = np.asarray(model.betas)
    mse = np.asarray(model.mse)

    num_features = betas.shape[1]
    num_iterations = betas.shape[0]

    num_rows = int(np.ceil(num_features / 2))
    fig, axes = plt.subplots(num_rows, 2, figsize=(12, 2 * num_rows))

    for i in range(num_features):
        row = i // 2
        col = i % 2
        axes[row, col].plot(betas[:, i], mse)
        axes[row, col].set_xlabel(f'Beta[{i+1}]')
        axes[row, col].set_ylabel('MSE')

    fig.suptitle('MSE vs. Beta')
    plt.tight_layout()
    plt.show()


In [12]:
modelpoly = POLY2()

modelpoly.fit(X,Y,lr=0.01,epochs=100,degree=2,interactions=True,alpha=1)
#plot_depth_graph(modelpoly)

<__main__.POLY2 at 0x7effe4288100>

In [26]:
import json
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np

word_index_file = "config/word_to_index.json"
classes_file = "config/classes.json"
max_token_file = "config/max_token.txt"

def load_word_to_index(filename):
    with open(filename, 'r') as json_file:
        word_to_index = json.load(json_file)
    return word_to_index

def load_classes(filename):
    with open(filename, 'r') as json_file:
        classes = json.load(json_file)
    return classes

def load_max_token_input_size(filename):
    with open(filename, 'r') as file:
        max_tokens = file.read()
    return int(max_tokens)

word_to_index = load_word_to_index(word_index_file)
max_tokens = load_max_token_input_size(max_token_file)
classes = load_classes(classes_file)

def make_predictions(input_text, word_to_index, model, max_input_size):
    input_tokens = [word_to_index.get(word, 0) for word in word_tokenize(input_text.lower())]
    input_tokens = input_tokens[:max_input_size] + [0] * (max_input_size - len(input_tokens))
    predicted_output = model.predict(np.array([input_tokens]))

    # Create a dictionary of class names and their scores
    class_scores = {class_name: score for class_name, score in zip(classes, predicted_output[0])}

    return class_scores


In [27]:
input_sentence = "take a screenshot"
predictions = make_predictions(input_sentence, word_to_index, modelpoly, max_tokens)
print(predictions)

{'0': 0.11001999781798301, '1': 0.15575373293895492, '2': 0.18493434685552582, '3': 0.07795655350292555, '4': 0.09870407011169934, '5': -0.003946788382648225, '6': 0.02032262391868296, '7': 0.057098151706865385, '8': 0.0057987153786529105, '9': 0.1395102081904402, '10': 0.03937794323570284, '11': 0.047295046899982915}
