## Loading Libraries

In [None]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
parent_folder = 'txt_sentoken'

# Maps each subfolder name (the code below reads the 'neg' and 'pos' keys) to
# the list of review texts read from the .txt files inside that subfolder.
file_contents_dict = {}

# sorted() makes the load order deterministic; os.listdir returns entries in
# an arbitrary, filesystem-dependent order.
for subfolder_name in sorted(os.listdir(parent_folder)):
    subfolder_path = os.path.join(parent_folder, subfolder_name)

    # Ignore stray files that sit next to the sentiment folders
    if not os.path.isdir(subfolder_path):
        continue

    # Review texts collected from this subfolder
    subfolder_contents = []

    for filename in sorted(os.listdir(subfolder_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(subfolder_path, filename)

        # Explicit encoding so decoding does not depend on the platform's
        # locale default; errors='replace' turns an undecodable byte into
        # U+FFFD instead of aborting the whole load.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            subfolder_contents.append(file.read())

    file_contents_dict[subfolder_name] = subfolder_contents

In [None]:
# Checking the Data Format: print a short preview of every loaded folder.
# The preview size lives in one place so the message and the slice cannot
# disagree (the message used to say 20 while only 2 files were shown).
preview_count = 2

for subfolder_name, contents in file_contents_dict.items():
    print(f"First {preview_count} contents of files in folder {subfolder_name}:")
    for i, content in enumerate(contents[:preview_count], start=1):
        print(f"File {i}:")
        print(content)
        print()

In [None]:
# Pull the two sentiment classes out of the loaded corpus
neg_reviews = file_contents_dict['neg']
pos_reviews = file_contents_dict['pos']

# Combined corpus: every negative review first, then every positive review
all_reviews = [*neg_reviews, *pos_reviews]

In [None]:
# Class sizes come from the loaded data instead of being hard-coded, so the
# labels always line up with all_reviews (negatives first, then positives).
num_negative = len(neg_reviews)
num_positive = len(pos_reviews)

# Creating labels for the model to be used for learning:
# 0 = negative review, 1 = positive review
labels = [0] * num_negative + [1] * num_positive

## Data Preprocessing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import re

# Download NLTK resources (only need to do this once).
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize; 'punkt' is kept so older releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Initialize stemmer and set of stopwords (a set gives O(1) membership tests)
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Function to preprocess a list of documents
def preprocess_documents(documents):
    """Clean and tokenize raw review strings.

    For every document: tag-like ``<...>`` spans are removed, every character
    that is not an ASCII letter, digit or whitespace is deleted (not replaced
    by a space), and the text is tokenized with ``word_tokenize``. Tokens are
    then lowercased, filtered against ``stop_words`` and stemmed with
    ``stemmer``.

    Args:
        documents: iterable of raw text strings.

    Returns:
        A list holding one list of stemmed tokens per document, in input order.
    """
    def strip_markup(raw_text):
        # Drop HTML-like tags first, then anything that is not alphanumeric/whitespace
        without_tags = re.sub(r'<.*?>', '', raw_text)
        return re.sub(r'[^a-zA-Z0-9\s]', '', without_tags)

    def to_stems(raw_text):
        lowered = (token.lower() for token in word_tokenize(strip_markup(raw_text)))
        # Stopwords are filtered on the lowercased form, before stemming
        return [stemmer.stem(token) for token in lowered if token not in stop_words]

    return [to_stems(doc) for doc in documents]

# Rebuild the combined corpus (negatives first, then positives) so this cell
# can be re-run on its own, then clean and tokenize every review.
all_reviews = [*neg_reviews, *pos_reviews]
preprocessed_all_reviews = preprocess_documents(all_reviews)

In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the tokenized reviews
model = Word2Vec(
    sentences=preprocessed_all_reviews,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Persist the full trained model
model.save("word2vec_model")

# Export the vectors in plain-text word2vec format; the next cell reloads
# "word_vectors.txt", so this step is required, not optional.
model.wv.save_word2vec_format("word_vectors.txt", binary=False)

In [None]:
from gensim.models import KeyedVectors

# Load the word vectors exported to "word_vectors.txt" (plain-text word2vec
# format) as a KeyedVectors object.
# NOTE: this rebinds the name `model`, which previously held the trained
# Word2Vec object.
model = KeyedVectors.load_word2vec_format("word_vectors.txt", binary=False)

# Function to convert text to word vectors
def text_to_vectors(text):
    """Look up the vector of every known token in ``text``.

    The lookup goes through the module-level ``model`` at call time; tokens
    that ``model`` does not contain are skipped silently.

    Args:
        text: iterable of tokens.

    Returns:
        A list with the vectors of the tokens found in ``model``, in the order
        the tokens appear in ``text``.
    """
    # NOTE(review): `model` is a global that this notebook rebinds later on,
    # so this function is only valid while `model` holds the word vectors.
    return [model[word] for word in text if word in model]

# One list of word vectors per preprocessed review, in corpus order
word_vectors = list(map(text_to_vectors, preprocessed_all_reviews))

In [None]:
import random
from sklearn.model_selection import train_test_split

# zip() silently truncates to the shorter input, which would drop reviews or
# pair them with the wrong labels, so fail loudly on a mismatch.
if len(word_vectors) != len(labels):
    raise ValueError(
        f"Got {len(word_vectors)} reviews but {len(labels)} labels; they must match."
    )

# Shuffle the data and labels in unison. A dedicated, seeded generator keeps
# the order reproducible across kernel restarts without touching the global
# `random` state.
combined_data = list(zip(word_vectors, labels))
random.Random(42).shuffle(combined_data)
word_vectors_shuffled, labels_shuffled = zip(*combined_data)

# Split the shuffled data into training and testing sets. stratify keeps the
# negative/positive ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    word_vectors_shuffled,
    labels_shuffled,
    test_size=0.2,
    random_state=42,
    stratify=labels_shuffled,
)

## ^^^ Above ^^^

The reviews were loaded as two separate groups (all negative reviews followed by all positive ones). We do not want the model to be trained on more of one type of review than the other, so the code above shuffles the features together with their respective labels before splitting.

In [None]:
# Function to average word vectors for each review
def average_word_vectors(word_vectors):
    """Pool each review's word vectors into a single vector by averaging.

    Args:
        word_vectors: sequence of reviews, each a (possibly empty) sequence of
            word vectors that share a common shape.

    Returns:
        A list with one averaged vector per review, in input order. A review
        without word vectors yields a zero vector with the shape and dtype of
        the vectors found in the other reviews.

    Raises:
        ValueError: if there is at least one review but none of them contains
            a word vector, so the vector shape cannot be determined.
    """
    # Take the shape/dtype template from the first review that has vectors.
    # Indexing word_vectors[0][0] directly fails when the first review is empty.
    template = next(
        (review_vectors[0] for review_vectors in word_vectors if len(review_vectors) > 0),
        None,
    )

    averaged_vectors = []
    for review_vectors in word_vectors:
        if len(review_vectors) > 0:
            averaged_vector = np.mean(review_vectors, axis=0)
        elif template is not None:
            # If the review has no word vectors, use zeros
            averaged_vector = np.zeros_like(template)
        else:
            raise ValueError(
                "Cannot average word vectors: no review contains any word vector."
            )
        averaged_vectors.append(averaged_vector)
    return averaged_vectors

# Pool the word vectors of every review, first for the training split and
# then for the testing split
X_train_averaged, X_test_averaged = map(average_word_vectors, (X_train, X_test))


## Averaging (i.e., pooling) the word vectors of each review into a single vector to feed to the model



from sklearn.ensemble import RandomForestClassifier
# accuracy_score was used below without ever being imported (NameError on a
# fresh kernel)
from sklearn.metrics import accuracy_score

# Initialize Random Forest model; the fixed seed makes the reported accuracy
# reproducible between runs.
# NOTE: this rebinds the name `model`, which previously held the word vectors.
model = RandomForestClassifier(random_state=42)

# Train the model on one averaged vector per review
model.fit(X_train_averaged, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test_averaged)

# Evaluate the model: fraction of test reviews whose label was predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

## Trying Using OpenAI Embeddings

Instead of using Word2Vec for the vector representation of words, we can also use other embeddings. However, we still have to pool the per-word vectors into a single vector in the end, because the ML model takes only one input vector per review. We lose a lot of information because of this: the pooled vector does not preserve the order of the words in the sentence, and words with opposite meanings can cancel out some of each other's effects.

In [None]:
%pip install -qU langchain-openai

In [None]:
# Never store a key in the notebook itself: keep a key that is already set in
# the environment, otherwise prompt for it without echoing it to the output.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass
    os.environ['OPENAI_API_KEY'] = getpass('Enter your OpenAI API key: ')

In [None]:
import langchain
from langchain_openai import OpenAIEmbeddings

# Embedding client used by get_embeddings below.
# NOTE(review): presumably authenticates with the OPENAI_API_KEY environment
# variable set in the previous cell — confirm against the langchain-openai docs.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Accumulates one pooled embedding per review, in the order the reviews are
# processed by the loop at the end of this cell
review_embeddings = []

# trying to pool the vectors for each word into one to pass to the ML model.
def average_pooling(embeddings):
    """Return the element-wise mean over the first axis of ``embeddings``.

    Args:
        embeddings: sequence of equally sized vectors (one per word/token).

    Returns:
        A single vector holding the mean of each component.
    """
    stacked = np.asarray(embeddings)
    return stacked.mean(axis=0)

# Function to get embeddings for a single review
def get_embeddings(review):
    """Embed one review and pool the result into a single vector.

    Every item of ``review`` is passed to ``embeddings.embed_documents`` as a
    separate document; the returned vectors are then combined with
    ``average_pooling``.

    Args:
        review: list of tokens belonging to one review.

    Returns:
        The pooled embedding of the review.
    """
    # One embedding per token, averaged into one vector for the whole review
    return average_pooling(embeddings.embed_documents(review))

# Embed every cleaned review and collect the pooled vectors in corpus order
for review_tokens in preprocessed_all_reviews:
    review_embeddings.append(get_embeddings(review_tokens))