In [None]:

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf

from nltk.corpus import stopwords
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import TimeDistributed, Dense, LSTM, Embedding, Dropout, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.utils import to_categorical
# Ask NLTK to download the 'stopwords' corpus; this must run before the
# corpus is read on the next line.
nltk.download('stopwords')
# English stop words, materialised as a set.
# NOTE(review): `stop_words` is not referenced by any cell visible here.
stop_words = set(stopwords.words('english'))
import pickle

# Lift NumPy's summarisation threshold so arrays are printed in full.
np.set_printoptions(threshold=np.inf)

In [None]:
# Read the two-column CSV (rows pandas cannot parse are skipped) and coerce
# both columns to strings in one chained expression.
df = pd.read_csv(
    "keywords_output.csv",
    names=["paragraphs", "label"],
    on_bad_lines="skip",
).astype({"paragraphs": str, "label": str})


In [None]:

# Integer-encoding text vectorizer: lower-cases, strips punctuation, keeps at
# most 100000 vocabulary entries and emits sequences of length 512.
vectorizer = TextVectorization(
    standardize="lower_and_strip_punctuation",
    max_tokens=100000,
    output_mode="int",
    output_sequence_length=512,
)

# Learn the vocabulary from the raw paragraphs.
vectorizer.adapt(df["paragraphs"])

# Despite its name, `vocab` maps integer id -> token string.
vocab = dict(enumerate(vectorizer.get_vocabulary()))
encoded_sequences = vectorizer(df["paragraphs"]).numpy()

# Build per-token binary labels: 1 when the whitespace token occurs in the
# row's keyword string and contains no digit, 0 otherwise.
sentence_column = []
keyword_column = []

for index, row in df.iterrows():
    sentence = row["paragraphs"]
    keywords = row["label"]
    tokens = sentence.split()

    # Emit exactly one label per token. Previously a token that matched the
    # keywords but contained a digit produced no label at all, which made the
    # label list shorter than the token list and shifted every later label
    # onto the wrong token.
    # NOTE(review): `token in keywords` is a substring test on the label
    # string, so short tokens such as "a" match inside longer keywords.
    # Confirm the label format before switching to whole-word matching.
    # NOTE(review): labels follow `str.split()` tokens, while the model input
    # is tokenised by TextVectorization; verify the two stay aligned.
    new_keywords = [
        1 if (token in keywords and not any(char.isdigit() for char in token)) else 0
        for token in tokens
    ]

    # Keep only sentences that contain at least one keyword token.
    if any(new_keywords):
        sentence_column.append(sentence)
        keyword_column.append(new_keywords)


In [None]:


# Re-fit the vocabulary on the retained sentences only, then encode them.
vectorizer.adapt(sentence_column)
X = pad_sequences(
    vectorizer(sentence_column).numpy(),
    maxlen=512,
    padding="post",
    truncating="post",
    value=0,
)

# Pad/truncate every label sequence to the same 512 steps as the inputs ...
y = pad_sequences(
    keyword_column,
    maxlen=512,
    padding="post",
    truncating="post",
    value=0,
)
# ... and one-hot encode each sequence into the two classes.
y = [to_categorical(label_row, num_classes=2) for label_row in y]
# Load pre-trained word vectors into a {word: float32 vector} dict. Each
# non-empty line is parsed as "<word> <coef> <coef> ...".
embeddings_index = {}
# The context manager closes the file even if a line fails to parse; the
# explicit encoding stops the result depending on the platform default.
with open('embeddings.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs


In [None]:


# Vocabulary as a list: position in the list is the integer id emitted by
# the vectorizer for that token.
word_index = vectorizer.get_vocabulary()

# Dimensionality of the pre-trained vectors.
ed = 100
# One row per vocabulary id (plus one spare row, kept so the shape matches
# the Embedding layer's input_dim); rows with no pre-trained vector stay zero.
embedding_matrix = np.zeros((len(word_index) + 1, ed))

# enumerate() yields (index, token). The previous loop unpacked these the
# other way round, so the lookup key was an int, every lookup missed, and the
# matrix was left entirely zero.
for i, word in enumerate(word_index):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:


# Hold out 20% of the samples for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Per-token tagger: embeddings initialised from `embedding_matrix`, a
# bidirectional LSTM returning one output per time step, and a 2-way softmax
# applied at every step.
model = Sequential(
    [
        Embedding(len(word_index) + 1, 100, weights=[embedding_matrix]),
        Bidirectional(LSTM(128, return_sequences=True, recurrent_dropout=0.1)),
        TimeDistributed(Dense(2, activation="softmax")),
    ]
)
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# 10% of the training split is used for validation during fitting.
history = model.fit(
    X_train,
    np.array(y_train),
    batch_size=64,
    epochs=5,
    validation_split=0.1,
)
model_json = model.to_json()


In [None]:
# Per-token class predictions on the held-out set.
predictions = model.predict(X_test)
yhat_classes = np.argmax(predictions, axis=2)

# y_test holds one-hot label sequences, so the true class ids are recovered
# with argmax over the last axis. The previous code called an undefined
# `label_encoder`, which raised NameError.
testy_inverse = np.argmax(np.asarray(y_test), axis=-1)

# Precision and recall for the keyword class (label 1), over all time steps.
# NOTE(review): padded positions are included in the flattened arrays.
precision = precision_score(testy_inverse.ravel(), yhat_classes.ravel())
recall = recall_score(testy_inverse.ravel(), yhat_classes.ravel())

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

In [None]:

# One figure per training curve: (history key, legend/title text, y-axis label).
for metric_key, curve_title, y_label in (
    ('accuracy', 'Training Accuracy', 'Accuracy'),
    ('loss', 'Training Loss', 'Loss'),
):
    plt.plot(history.history[metric_key], label=curve_title)
    plt.title(curve_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()


In [None]:

# Stack the held-out label sequences into one array and score the model.
y_test = np.array(y_test)
evaluation_results = model.evaluate(X_test, y_test, batch_size=32)

# Re-draw the training curves: history key -> (legend/title text, y-axis label).
curve_specs = {
    'accuracy': ('Training Accuracy', 'Accuracy'),
    'loss': ('Training Loss', 'Loss'),
}
for history_key, (curve_title, y_label) in curve_specs.items():
    plt.plot(history.history[history_key], label=curve_title)
    plt.title(curve_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()

# evaluate() returned the loss followed by the compiled accuracy metric.
loss, accuracy = evaluation_results

print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {accuracy:.4f}")


In [None]:
# Sample paragraph used for a single-example inference run.
example_sentence = "Its a very similar sort of thing You end up integrating u12 It leads to the integral of u12 du Is everybody seeing where this However there is a slightly better method So recommended method And I call this method advanced guessing What advanced guessing means is that youve done enough of these problems that you can see two steps ahead And you know whats going to happen So the advanced guessing leads you to believe that here you had a power 12 here you have the differential of the thing So its going to work out somehow And the advanced guessing allows you to guess that the answer should be something like this 1 x212 So this is your advanced guess And now you just differentiate it and see whether it works Well here it is Its 12 1 x212 2x thats the chain rule here Which sure enough gives you x over square root of 1 x2 So were done And so the answer is square root of 1 x2 c Let me illustrate this further with another example I strongly recommend that you do this but you have to get used to it So heres another example e6x dx My advanced guess is e6x And if I check when I differentiate it I get 6e6x Thats the derivative And so I know that the answer so now I know what the answer is Its 16 e6x c Now OK you could its also OK but slow to use a substitution to use u 6x Then youre going to get du 6dx dot dot dot Its going to work its just a waste of time Well Im going to give you a couple more examples So how about this one x ex2 dx Whats the guess Anybody have a guess Well you could also correct So I dont want you to bother yeah go ahead STUDENT INAUDIBLE PROFESSOR Yeah so youre already one step ahead of me"

# Encode the sentence as a batch of one and pad/truncate it to 512 ids.
encoded_example = vectorizer([example_sentence]).numpy()
padded_example = pad_sequences(
    encoded_example,
    padding="post",
    truncating="post",
    maxlen=512,
)

# Predict, then drop the singleton axes from the model output.
predictions = np.squeeze(model.predict(padded_example))


In [None]:
# id -> token lookup built from the vectorizer's current vocabulary. The
# earlier `vocab` dict was built before the vectorizer was re-adapted, so it
# can disagree with the ids the model was trained on.
inverse_vocab = dict(enumerate(vectorizer.get_vocabulary()))

# Token ids of the example, one per sequence position.
token_ids = padded_example[0]

# Rank sequence positions by the probability of the keyword class (column 1).
# Column 0 is the non-keyword class, so sorting on it surfaced the positions
# least likely to be keywords.
sorted_indices = np.argsort(predictions[:, 1])[::-1]
# Drop padded positions (token id 0); they carry no word.
sorted_indices = sorted_indices[token_ids[sorted_indices] != 0]

top_5_indices = sorted_indices[:5]

# Translate position -> token id -> token string. The positions themselves
# are not vocabulary ids and must not be looked up directly.
tokens_found = [
    inverse_vocab.get(int(token_ids[idx]), 'UNKNOWN') for idx in top_5_indices
]

print("Tokens found:", tokens_found)



