In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install scikit-learn pandas



In [7]:
import pandas as pd
import zipfile
import os

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

In [8]:
zip_file_path = '/content/drive/MyDrive/SEM7_SLP/sentiment+labelled+sentences.zip'

with zipfile.ZipFile(zip_file_path, 'r') as z:
    z.extractall('/content/extracted_data')

extracted_files = os.listdir('/content/extracted_data')
print("Extracted files:", extracted_files)

Extracted files: ['__MACOSX', 'sentiment labelled sentences']


In [9]:
sentence_dir = '/content/extracted_data/sentiment labelled sentences'

In [10]:
data = []
for annot_file in os.listdir(sentence_dir):
    file_path = os.path.join(sentence_dir, annot_file)
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        sentences = file.readlines()
        for sentence in sentences:
            parts = sentence.strip().rsplit('\t', 1)
            if len(parts) == 2:  # Ensure there are exactly two parts: sentence and label
                sentence_text, sentiment_label = parts
                data.append([sentence_text, sentiment_label])

In [11]:
df = pd.DataFrame(data, columns=['sentence', 'sentiment'])
df.head()

Unnamed: 0,sentence,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [12]:
X = df['sentence']
y = df['sentiment']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Using Inbuilt Functions


**CountVectorizer()**: Converts text documents into a matrix of token counts

In [None]:
vectorizer_no_ngrams = CountVectorizer()
vectorizer_with_ngrams = CountVectorizer(ngram_range=(1, 2)) #includes unigrams and bigrams

**sklearn.pipeline.Pipeline[('step_name', transformer_or_estimator)**

*Pipeline* is an object that sequences a series of data processing steps and a final estimator into a single object.

*Transformer/Estimator*: An object that implements fit and optionally transform (for transformers) or predict (for estimators).

In [None]:
pipeline_no_ngrams = Pipeline([
    ('vectorizer', vectorizer_no_ngrams),
    ('classifier', MultinomialNB())
])

In [None]:
pipeline_with_ngrams = Pipeline([
    ('vectorizer', vectorizer_with_ngrams),
    ('classifier', MultinomialNB())
])

In [None]:
pipeline_no_ngrams.fit(X_train, y_train)
y_pred_no_ngrams = pipeline_no_ngrams.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_no_ngrams))
print("Classification Report:\n", classification_report(y_test, y_pred_no_ngrams))

Accuracy: 0.8255555555555556
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.86      0.83       443
           1       0.85      0.79      0.82       457

    accuracy                           0.83       900
   macro avg       0.83      0.83      0.83       900
weighted avg       0.83      0.83      0.83       900



In [None]:
pipeline_with_ngrams.fit(X_train, y_train)
y_pred_with_ngrams = pipeline_with_ngrams.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_with_ngrams))
print("Classification Report:\n", classification_report(y_test, y_pred_with_ngrams))

Accuracy: 0.8388888888888889
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.87      0.84       443
           1       0.86      0.81      0.84       457

    accuracy                           0.84       900
   macro avg       0.84      0.84      0.84       900
weighted avg       0.84      0.84      0.84       900



# OBSERVATION

**Accuracy**: The accuracy improved from 82.56% to 83.89% when including n-grams. This is likely because the n-grams capture additional context from the text, allowing for more accurate predictions rather just induvidual words alone.

**Precision**: Precision improved slightly for both classes. This indicates that the n-gram model is better at predicting the correct class for positive instances while maintaining a good precision for negative instances.

**Recall**: Increased recall with n-grams shows that model is better a predicting true class instances.

**F1-score**: The F1-scores for both classes have improved which suggests a balanced improvement in both precision and recall due to the inclusion of n-grams.

Thus overall, the n-grams based approach enhances the model's ability to capture contextual information from the text leading to more accurate and balanced performance

# Without use of inbuilt functions

In [1]:
import numpy as np
from collections import defaultdict
from math import log

In [2]:
class UnigramFeatureExtractor:
    def __init__(self):
        self.vocabulary = {}

    def fit(self, documents):
        all_words = [word for doc in documents for word in doc]
        self.vocabulary = {word: idx for idx, word in enumerate(set(all_words))} #enumerate() creates an index (idx) for all ele in set
        # {key: value for item in iterable} key: word ; value: idk ; for the (idx, word) pair output by enumerate()

    def transform(self, documents):
        # Transform documents into unigram vectors
        feature_matrix = np.zeros((len(documents), len(self.vocabulary)))
        for i, doc in enumerate(documents):
            for word in doc:
                if word in self.vocabulary:
                    feature_matrix[i, self.vocabulary[word]] += 1
        return feature_matrix

In [15]:
unigram_extractor = UnigramFeatureExtractor()
unigram_extractor.fit(X_train)

In [16]:
X_train_unigram = unigram_extractor.transform(X_train)
X_test_unigram = unigram_extractor.transform(X_test)

**tokens[i:]** for i in range(self.n)] generates a list of sublists starting from index i:

    For i = 0, tokens[i:] gives ['a', 'b', 'c', 'd'].
    For i = 1, tokens[i:] gives ['b', 'c', 'd'].
    For i = 2, tokens[i:] gives ['c', 'd'].

*** (Splat Operator)**: The splat operator * is used to unpack the list of lists into separate arguments. For the example above, it unpacks [['a', 'b', 'c', 'd'], ['b', 'c', 'd'], ['c', 'd']] into three separate lists.

**zip(*[...])**: zip takes multiple iterables (lists in this case) and aggregates them into tuples. The * operator ensures that each inner list is passed as a separate argument to zip.

In [40]:
class NgramFeatureExtractor:
    def __init__(self, n=2):
        self.n = n
        self.vocabulary = {}

    def generate_ngrams(self, tokens):
        ngrams = zip(*[tokens[i:] for i in range(self.n)])
        return [" ".join(ngram) for ngram in ngrams]

    def fit(self, documents):
        all_ngrams = []
        for doc in documents:
            tokens = doc.split() if isinstance(doc, str) else doc

            unigrams = tokens #must be converted into a list for concatenation
            bigrams = self.generate_ngrams(doc) if self.n >= 2 else []
            trigrams = self.generate_ngrams(doc) if self.n >= 3 else []
            all_ngrams.extend(unigrams + bigrams + trigrams) #.extend() adds items to end of current list

        self.vocabulary = {ngram: idx for idx, ngram in enumerate(set(all_ngrams))}

    def transform(self, documents):
        feature_matrix = np.zeros((len(documents), len(self.vocabulary)))
        for i, doc in enumerate(documents):
            tokens = doc.split() if isinstance(doc, str) else doc

            unigrams = tokens
            bigrams = self.generate_ngrams(doc) if self.n >= 2 else []
            trigrams = self.generate_ngrams(doc) if self.n >= 3 else []
            ngrams = unigrams + bigrams + trigrams
            for ngram in ngrams:
                if ngram in self.vocabulary:
                    feature_matrix[i, self.vocabulary[ngram]] += 1
        return feature_matrix

In [41]:
ngram_extractor = NgramFeatureExtractor(n=3)
ngram_extractor.fit(X_train)

In [42]:
X_train_ngram = ngram_extractor.transform(X_train)
X_test_ngram = ngram_extractor.transform(X_test)

In [44]:
class NaiveBayesClassifier:
    def __init__(self):
        self.class_probs = {}
        self.feature_probs = defaultdict(lambda: defaultdict(lambda: 0))
        self.classes = []

    def fit(self, X, y):
        num_docs = len(y)
        self.classes = np.unique(y)
        class_counts = defaultdict(lambda: 0)
        feature_counts = defaultdict(lambda: defaultdict(lambda: 0))

        for i, label in enumerate(y):
            class_counts[label] += 1
            for j in range(X.shape[1]):
                feature_counts[label][j] += X[i, j]

        self.class_probs = {cls: count / num_docs for cls, count in class_counts.items()}
        for cls in self.classes:
            total_features = sum(feature_counts[cls].values())
            for feature in feature_counts[cls]:
                self.feature_probs[cls][feature] = (feature_counts[cls][feature] + 1) / (total_features + len(self.classes))

    def predict(self, X):
        predictions = []
        for i in range(X.shape[0]):
            log_probs = {}
            for cls in self.classes:
                log_prob = log(self.class_probs[cls])
                for j in range(X.shape[1]):
                    if X[i, j] > 0:
                        log_prob += log(self.feature_probs[cls].get(j, 1 / (len(self.classes) + 1)))
                log_probs[cls] = log_prob
            predictions.append(max(log_probs, key=log_probs.get))
        return predictions

In [45]:
nb = NaiveBayesClassifier()
nb.fit(X_train_unigram, y_train)
y_pred_no_ngrams = nb.predict(X_test_unigram)

In [46]:
nb = NaiveBayesClassifier()
nb.fit(X_train_ngram, y_train)
y_pred_ngrams = nb.predict(X_test_ngram)

In [48]:
print("Accuracy:", accuracy_score(y_test, y_pred_no_ngrams))
print("Classification Report:\n", classification_report(y_test, y_pred_no_ngrams))

Accuracy: 0.6033333333333334
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.67      0.62       443
           1       0.63      0.54      0.58       457

    accuracy                           0.60       900
   macro avg       0.61      0.60      0.60       900
weighted avg       0.61      0.60      0.60       900



In [50]:
print("Accuracy:", accuracy_score(y_test, y_pred_ngrams))
print("Classification Report:\n", classification_report(y_test, y_pred_ngrams))

Accuracy: 0.8122222222222222
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.84      0.81       443
           1       0.83      0.79      0.81       457

    accuracy                           0.81       900
   macro avg       0.81      0.81      0.81       900
weighted avg       0.81      0.81      0.81       900



# OBSERVATION

There is a significant impovement in accuracy from the model without n-grams to the model with n-grams, from 60.3% to 81.2%. All other metrics including recall, precision and f1-score also have similar significant increases which indicates that incorporating n-grams helps **capture contextual word relationships more effectively**, leading to better classification performance.