<a href="https://colab.research.google.com/github/fgs2/f20aa-2024/blob/main/cw2/tfidfCopy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# F20AA Applied Text Analytics: Coursework 2 - TF-IDF Notebook
#### Deadline: 11:59pm, Monday 1st April 2024 via Canvas group space

#### Members:
- Francis Sandrino (fgs2)
- Jai Varsani (jv81)
- Ahmed Moussa Abdelfattah (asa30)
- Aamir Nazir (mn2025)

### What is this?
This notebook exists so that experiments can be run in parallel across several Google Colab accounts, which speeds up experimentation. It contains only minimal documentation, intended to help group members understand the code. The full documentation, results, and discussion for all processing notebooks are in the [main file](../amazonCW.ipynb).

### What does this specific notebook deal with?
Experimentation with TF-IDF.

### TODO: Experimental Design

In [1]:
# This is so I don't have to keep uploading on Colab.

import os
import requests
from requests.auth import HTTPBasicAuth

def downloadFileFromRepo(username, repository, branch, filepath, token, timeout=30):
    """Download a single file from a GitHub repository into the local ``data`` directory.

    The file is fetched from ``raw.githubusercontent.com`` using HTTP basic auth
    and written to ``data/<basename of filepath>``. A success or failure message
    is printed; nothing is returned.

    Args:
        username: GitHub account that owns the repository (also used as the
            basic-auth user name).
        repository: Repository name.
        branch: Branch to read the file from.
        filepath: Path of the file inside the repository, '/'-separated.
        token: Access token used as the basic-auth password.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook cell forever.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout, ...).
    """
    # Construct the URL to download the file from GitHub
    url = f"https://raw.githubusercontent.com/{username}/{repository}/{branch}/{filepath}"

    # Send a GET request to download the file
    response = requests.get(url, auth=HTTPBasicAuth(username, token), timeout=timeout)

    # Anything other than 200 means the file was not delivered
    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        return

    # Extract the file name from the repository path
    fileName = filepath.split('/')[-1]

    # Create the 'data' directory if needed (no error if it already exists)
    os.makedirs('data', exist_ok=True)

    # Write the file content to a local file inside 'data'
    localFilepath = os.path.join('data', fileName)
    with open(localFilepath, 'wb') as f:
        f.write(response.content)
    print(f"File '{fileName}' downloaded successfully.")

# Repository coordinates. They are blank in the committed notebook and must be
# filled in locally before this cell is run.
username = ""
repository = ""
branch = ""

# Read the access token from the environment so it never has to be pasted into
# (and accidentally committed with) the notebook. Falls back to an empty token.
repoToken = os.environ.get("GITHUB_TOKEN", "")

# Every input the later cells read from 'data/'. The first entry was an empty
# string before, although 'data/trainLemmatized.csv' is required by the
# train/hold-out split cell further down.
filesToDownload = [
    "cw2/data/trainLemmatized.csv",
    "cw2/data/trainStemmed.csv",
    "cw2/data/testLemmatized.csv",
    "cw2/data/testStemmed.csv",
    "cw2/lemmaTokenizer.json",
    "cw2/stemTokenizer.json",
]

for path_to_file in filesToDownload:
    downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

File 'trainLemmatized.csv' downloaded successfully.
File 'trainStemmed.csv' downloaded successfully.
File 'testLemmatized.csv' downloaded successfully.
File 'testStemmed.csv' downloaded successfully.
File 'lemmaTokenizer.json' downloaded successfully.
File 'stemTokenizer.json' downloaded successfully.


In [2]:
!pip install --upgrade pip
!pip install tensorflow
!pip install pyyaml h5py

import tensorflow as tf
import nltk
import numpy as np
import pandas as pd

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, GlobalAveragePooling1D, Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import StratifiedKFold

# Fetch NLTK resources at runtime (presumably for the WordNetLemmatizer and
# word_tokenize imports above - TODO confirm they are still needed here).
nltk.download('wordnet')
nltk.download('punkt')

# NOTE(review): `seed` is not referenced by any active code in the visible
# cells; the train/hold-out split and the CSV shuffle use their own literals.
seed = 50

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.0
[0m

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Carve a stratified 20% hold-out set out of the lemmatized training data and
# persist both parts as CSV so they can be streamed from disk later.
dataset = pd.read_csv("data/trainLemmatized.csv")
training, testing = train_test_split(
    dataset,
    test_size=0.2,
    stratify=dataset['labels'],
    random_state=42,
)
for frame, target in (
    (training, "data/trainLemmatizedTr.csv"),
    (testing, "data/trainLemmatizedTe.csv"),
):
    frame.to_csv(target, index=False)

In [6]:
# Empirical value; used as `maxlen` whenever token sequences are padded.
MAXLENGTH = 1885

# Number of CSV rows handed to the model per training batch.
trainBatchSize = 512

# Stream the training split from disk in shuffled batches for 10 passes.
lemmatizedDataset = tf.data.experimental.make_csv_dataset(
    "data/trainLemmatizedTr.csv",
    batch_size=trainBatchSize,
    select_columns=["data", "labels"],
    label_name="labels",
    num_epochs=10,
    shuffle=True,
    shuffle_seed=43,
)


def _loadTokenizer(jsonPath):
    """Rebuild a Keras tokenizer from the JSON dump stored at ``jsonPath``."""
    with open(jsonPath, "r") as handle:
        return tf.keras.preprocessing.text.tokenizer_from_json(handle.read())


# Tokenizer fitted on the lemmatized corpus.
lemmaTokenizer = _loadTokenizer("data/lemmaTokenizer.json")
lemmaVocabSize = len(lemmaTokenizer.word_index)
print(f"Lemmatized Tokenizer loaded successfully with {lemmaVocabSize} words.")

# Tokenizer fitted on the stemmed corpus.
stemTokenizer = _loadTokenizer("data/stemTokenizer.json")
stemVocabSize = len(stemTokenizer.word_index)
print(f"Stemmed Tokenizer loaded successfully with {stemVocabSize} words.")

Lemmatized Tokenizer loaded successfully with 77413 words.
Stemmed Tokenizer loaded successfully with 64940 words.


In [11]:
# Load the hold-out split and pull its text and label columns out as plain lists.
toTest = pd.read_csv("data/trainLemmatizedTe.csv")
toTestData, toTestLabels = (toTest[column].tolist() for column in ("data", "labels"))

In [22]:
%%time

# How often (in training batches) the model is scored on the hold-out split.
EVAL_EVERY = 200

# Number of the batch currently being processed, and the batch number at which
# the next hold-out evaluation happens (shown in the status display).
progress = 0
updateAccuracy = EVAL_EVERY

# Hold-out metrics: one entry per evaluation, plus running summaries.
losses = []
accuracies = []
aveAccuracy = 0
maxAccuracy = 0
accuracy = 0

# Keras Tokenizer word indices start at 1 (0 is the padding id) and can be as
# large as len(word_index), so the embedding table needs lemmaVocabSize + 1
# rows. With only lemmaVocabSize rows the highest index is out of range.
model = tf.keras.Sequential([
    Embedding(lemmaVocabSize + 1, 128, input_length = MAXLENGTH),
    Conv1D(128, 5, activation='relu'),
    GlobalAveragePooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(5, activation='softmax')
])

# Alternative architecture kept for experimentation (swap in for the CNN above):
#     Embedding(lemmaVocabSize + 1, 128, input_length = MAXLENGTH),
#     Dropout(0.2),
#     Bidirectional(LSTM(128)),
#     Dropout(0.2),
#     Dense(64, activation='relu'),
#     Dense(5, activation='softmax')

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

# Prepare the hold-out split once: tokenize, pad to MAXLENGTH, and one-hot
# encode the labels after shifting them from 1..5 to 0..4.
toTestDataList = [str(item) for item in toTestData]
tokenizedTestingData = lemmaTokenizer.texts_to_sequences(toTestDataList)
paddedTestingData = tf.keras.preprocessing.sequence.pad_sequences(tokenizedTestingData, maxlen=MAXLENGTH, padding="post")
encodedToTestLabels = [x - 1 for x in toTestLabels]
encodedTestingLabels = tf.keras.utils.to_categorical(encodedToTestLabels, num_classes = 5)

# Iterating the dataset directly streams it batch by batch (it is never fully
# loaded) and counts only batches that were actually fetched.
for progress, batch in enumerate(lemmatizedDataset, start=1):
  # Status display: the next evaluation is at the next multiple of EVAL_EVERY
  # (the current batch itself when it is such a multiple).
  updateAccuracy = -(-progress // EVAL_EVERY) * EVAL_EVERY
  clear_output(wait = True)
  print(f"Batch number: {progress}")
  print(f"Next accuracy update at batch: {updateAccuracy}")
  print(f"Max accuracy: {maxAccuracy}")
  print(f"Latest accuracy: {accuracy}")
  print(f"Average accuracy: {aveAccuracy}")

  # The dataset yields (features, labels); the text arrives as bytes.
  features, labels = batch
  decoded = [raw.decode() for raw in features['data'].numpy().tolist()]
  currentLabels = labels.numpy().tolist()

  # Convert each document to a sequence of word indices, padded to MAXLENGTH.
  tokenizedDocs = lemmaTokenizer.texts_to_sequences(decoded)
  paddedData = tf.keras.preprocessing.sequence.pad_sequences(tokenizedDocs, maxlen=MAXLENGTH, padding="post")

  # Shift labels from 1..5 to 0..4 and one-hot encode them.
  adjustedLabels = [x - 1 for x in currentLabels]
  ohEncodedLabels = tf.keras.utils.to_categorical(adjustedLabels, num_classes = 5)

  model.train_on_batch(paddedData, ohEncodedLabels)

  if progress % EVAL_EVERY == 0:
    loss, accuracy = model.evaluate(paddedTestingData, encodedTestingLabels)
    losses.append(loss)
    accuracies.append(accuracy)
    maxAccuracy = max(maxAccuracy, accuracy)
    # Mean over the evaluations performed so far (not over training batches).
    aveAccuracy = sum(accuracies) / len(accuracies)

print("End of iterator reached.")


Batch number: 1
Next accuracy update at batch: 200
Max accuracy: 0
Latest accuracy: 0
Average accuracy: 0


KeyboardInterrupt: 

In [None]:
%%time

# Number of CSV rows scored per prediction batch.
testBatchSize = 256

# Stream the test file once, in file order, so that predictions line up with
# the row order of the CSV.
# NOTE(review): the text column is 'processed' here but 'data' in the training
# file - confirm this matches the test CSV header.
dataset = tf.data.experimental.make_csv_dataset("data/testLemmatized.csv",
                                                batch_size = testBatchSize,
                                                select_columns = ["processed"],
                                                num_epochs = 1,
                                                shuffle = False)

# One array of predicted labels per batch; concatenated in a later cell.
preds = []
progress = 0

# enumerate() counts only batches that were actually fetched, so the displayed
# number is the true batch count when the dataset is exhausted.
for progress, batch in enumerate(dataset, start=1):
  clear_output(wait = True)
  print(f"Test batch number: {progress}")

  # The text arrives as bytes; decode it to str for the tokenizer.
  decoded = [raw.decode() for raw in batch['processed'].numpy().tolist()]

  # Same input format as in training: index sequences padded to MAXLENGTH.
  tokenizedDocs = lemmaTokenizer.texts_to_sequences(decoded)
  paddedData = tf.keras.preprocessing.sequence.pad_sequences(tokenizedDocs, maxlen=MAXLENGTH, padding="post")

  # argmax gives classes 0..4; add 1 to get back to the original 1..5 labels.
  pred = model.predict(paddedData)
  truePredict = np.argmax(pred, axis = 1) + 1
  preds.append(truePredict)

print("End of iterator reached.")



Test batch number: 95


In [None]:
# Flatten the per-batch prediction arrays into one vector and pair every
# prediction with a 0-based row id.
concatenated = np.concatenate(preds)
results = pd.DataFrame({'id': range(len(concatenated)), 'overall': concatenated})
results

In [None]:
# Write the predictions to disk without the pandas index column.
submissionPath = "SimpleCNN1.csv"
results.to_csv(submissionPath, index=False)

In [None]:
# Persist the trained model in the native Keras format.
modelPath = 'SimpleCNN1.keras'
model.save(modelPath)
# To restore it later:
# loaded_model = tf.keras.saving.load_model('insert-model-name.keras')