# Dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import zipfile
import os

In [5]:
zip_file_path = '/content/drive/MyDrive/SEM7_SLP/sentiment+labelled+sentences.zip'

with zipfile.ZipFile(zip_file_path, 'r') as z:
    z.extractall('/content/extracted_data')

extracted_files = os.listdir('/content/extracted_data')
print("Extracted files:", extracted_files)

Extracted files: ['sentiment labelled sentences', '__MACOSX']


In [6]:
sentence_dir = '/content/extracted_data/sentiment labelled sentences'

In [7]:
data = []
for annot_file in os.listdir(sentence_dir):
    file_path = os.path.join(sentence_dir, annot_file)
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        sentences = file.readlines()
        for sentence in sentences:
            parts = sentence.strip().rsplit('\t', 1)
            if len(parts) == 2:  # Ensure there are exactly two parts: sentence and label
                sentence_text, sentiment_label = parts
                data.append([sentence_text, sentiment_label])

In [8]:
df = pd.DataFrame(data, columns=['sentence', 'sentiment'])
df.head()

Unnamed: 0,sentence,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


# Preprocessing

In [9]:
import numpy as np
import gensim
import nltk
from nltk.corpus import stopwords

In [10]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
stop_words = set(stopwords.words('english')) #create a set from the stopwords module of nltk for enlish stopwords

In [12]:
def preprocess(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    return tokens

In [13]:
df['processed_text'] = df['sentence'].apply(preprocess)

# With In-built Functions

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression #can't use NB bc word2vec vectors can have negetive values while NB does not accept -ve values
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec

In [14]:
X = df['processed_text']
y = df['sentiment']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [18]:
# Vector Space Model: TF-IDF
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)



In [28]:
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(f'TF-IDF Accuracy: {accuracy_tfidf:.4f}')

TF-IDF Accuracy: 0.7911


In [21]:
# Word Embeddings: Word2Vec
model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)

**Gensim Library's Word2Vec Model**

**vector_size=100**: The dimensionality of the word vectors. Each word will be represented as a 100-dimensional vector.

**window=5**: The maximum distance between the current and predicted word within a sentence. This parameter affects how many surrounding words are considered for context.

**min_count=1**: Ignores all words with a total frequency lower than this. In this case, all words are included since min_count is set to 1.

**workers=4**: Number of worker threads used to train the model.

Word2Vec model generates dense word vectors (embeddings) for each word. The **vectorize_text function** averages the vectors of the words in a document. This approach aggregates word-level information into a document-level representation, which can then be used for various machine learning tasks.

**model.wv**
model: This is an instance of the Word2Vec class after training.
wv: Stands for "word vectors". It is an attribute of the trained Word2Vec model that provides access to the vectors for the words in the vocabulary.

In [22]:
def vectorize_text(text, model):
    vector = np.zeros(100)
    count = 0
    for word in text:
        if word in model.wv:
            vector += model.wv[word]
            count += 1
    return vector / count if count != 0 else vector

In [23]:
X_train_w2v = np.array([vectorize_text(text, model) for text in X_train])
X_test_w2v = np.array([vectorize_text(text, model) for text in X_test])

In [29]:
clf_w2v = LogisticRegression(max_iter=1000)
clf_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = clf_w2v.predict(X_test_w2v)
accuracy_w2v = accuracy_score(y_test, y_pred_w2v)
print(f'Word2Vec Accuracy: {accuracy_w2v:.4f}')

Word2Vec Accuracy: 0.5644


# OBSERVATION

The TF-IDF accuracy is significantly higher than the Word2Vec accuracy at 79.1% and 56.4%. Possible reasons for this output may be due to my choice of classifier being Logistic Regression for which the sparse, count-based nature of TF-IDF vectors provides clear and distinct features that it can leverage to differentiate between classes, resulting in relatively higher accuracy. On the other hand, Word2Vec's dense, continuous vectors while capture semantic relationships and contextual information, include negative values and are aggregated into document vectors by averaging. This aggregation can lead to a loss of individual word-specific features and nuances that reduce the distinction of indivdual features leading to an overall lower accuracy.


# Without In-built Functions

In [36]:
from collections import defaultdict
import math

**df = defaultdict(int)** is used to create a dictionary where missing keys automatically get a default value. Here's a detailed explanation of what this line does:

In [33]:
class TF_IDF:
    def __init__(self):
        self.vocabulary = {}
        self.idf = {}

    def fit(self, documents):
        df = defaultdict(int)
        for doc in documents:
            unique_terms = set(doc)
            for term in unique_terms:
                df[term] += 1
        self.vocabulary = {term: idx for idx, term in enumerate(df.keys())}
        total_docs = len(documents)
        self.idf = {term: math.log((total_docs + 1) / (df_count + 1)) + 1 for term, df_count in df.items()}

    def transform(self, documents):
        tfidf_matrix = np.zeros((len(documents), len(self.vocabulary)))
        for i, doc in enumerate(documents):
            tf = defaultdict(int)
            for term in doc:
                if term in self.vocabulary:
                    tf[term] += 1
            doc_len = len(doc)
            for term, count in tf.items():
                if term in self.vocabulary:
                    tf_val = count / doc_len
                    tfidf = tf_val * self.idf.get(term, 0.0)
                    tfidf_matrix[i, self.vocabulary[term]] = tfidf
        return tfidf_matrix

In [37]:
tfidf = TF_IDF()
tfidf.fit(X_train)

In [43]:
X_train_tfidf = tfidf.transform(X_train)

In [46]:
X_test_tfidf = tfidf.transform(X_test)

In [40]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, epochs=1000, verbose=False):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.verbose = verbose

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        self.weights = np.zeros(X.shape[1])
        self.bias = 0

        for epoch in range(self.epochs):
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred = self.sigmoid(linear_model)

            dw = np.dot(X.T, (y_pred - y)) / len(y)
            db = np.sum(y_pred - y) / len(y)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            if self.verbose and epoch % 100 == 0:
                loss = - (y * np.log(y_pred + 1e-15) + (1 - y) * np.log(1 - y_pred + 1e-15)).mean()
                print(f'Epoch {epoch}, Loss: {loss:.4f}')

    def predict_prob(self, X):
        return self.sigmoid(np.dot(X, self.weights) + self.bias)

    def predict(self, X, threshold=0.5):
        return (self.predict_prob(X) >= threshold).astype(int)

In [44]:
clf= LogisticRegression()
clf.fit(X_train_tfidf, y_train)

In [49]:
y_pred= clf.predict(X_test_tfidf)
accuracy= accuracy_score(y_test, y_pred)
print(f'TF-IDF Accuracy: {accuracy_tfidf:.4f}')

TF-IDF Accuracy: 0.7911


# OBSERVATION

Manual implementation of TF-IDF proves the same as previously.