In [3]:
import os
import pandas as pd
import tarfile
import urllib.request
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure the required NLTK resources are available.
# 'stopwords' backs stopwords.words('english'); word_tokenize needs the Punkt
# sentence-tokenizer data. Older NLTK releases load that from 'punkt', while
# recent releases (3.9+) load it from 'punkt_tab', so both are requested to
# keep the notebook runnable on a fresh kernel regardless of NLTK version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Download and extract the IMDb dataset
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset_path = "/content/aclImdb_v1.tar.gz"
extract_root = "/content/"
extracted_dir = os.path.join(extract_root, "aclImdb")

# Skip the network fetch when the archive is already on disk, so re-running the
# notebook does not download it again. The download goes to a temporary name
# first and is renamed only on success, so an interrupted transfer never leaves
# a truncated archive at dataset_path.
if not os.path.exists(dataset_path):
    partial_path = dataset_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, dataset_path)

# Skip extraction when the target directory already exists.
# NOTE(review): delete extracted_dir manually if a previous extraction was interrupted.
if not os.path.isdir(extracted_dir):
    with tarfile.open(dataset_path) as tar:
        # The archive arrives over plain HTTP, so treat it as untrusted: the
        # 'data' extraction filter rejects members that would escape the
        # destination directory. Fall back for Python builds without filters.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=extract_root, filter="data")
        else:
            tar.extractall(path=extract_root)

# Function to read data from the extracted files
def read_imdb_data(data_dir='/content/aclImdb'):
    """Read the review files under ``data_dir`` into train and test DataFrames.

    The directory is expected to contain ``train/pos``, ``train/neg``,
    ``test/pos`` and ``test/neg`` sub-directories, each holding one UTF-8 text
    file per review.

    Parameters
    ----------
    data_dir : str
        Root directory of the extracted dataset.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``. Each frame has a ``'review'`` column with the raw
        file contents and a ``'sentiment'`` column (1 for ``pos``, 0 for
        ``neg``). Positive reviews come first, then negative ones, and the
        index is a fresh 0..n-1 range.

    Raises
    ------
    OSError
        If one of the expected sub-directories or files cannot be read.
    """
    frames = []
    for data_type in ('train', 'test'):
        reviews = []
        sentiment_labels = []

        for sentiment, label in (('pos', 1), ('neg', 0)):
            path = os.path.join(data_dir, data_type, sentiment)

            # os.listdir returns entries in arbitrary order; sorting makes the
            # row order (and everything derived from it) reproducible.
            for file_name in sorted(os.listdir(path)):
                with open(os.path.join(path, file_name), 'r', encoding='utf-8') as file:
                    reviews.append(file.read())
                sentiment_labels.append(label)

        frames.append(pd.DataFrame({
            'review': reviews,
            'sentiment': sentiment_labels
        }))

    train_frame, test_frame = frames
    return train_frame, test_frame

# Load both splits from the default extraction directory; each is a DataFrame
# with 'review' and 'sentiment' columns.
train_data, test_data = read_imdb_data()


In [5]:
# Preprocess the text data
# English stop words kept in a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalise one review into a space-separated string of kept tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphanumeric, or that appear in ``stop_words``, are dropped.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue  # punctuation and mixed-symbol tokens
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Clean the 'review' column of both splits with the same function so train and
# test text go through identical normalisation.
for _frame in (train_data, test_data):
    _frame['review'] = _frame['review'].apply(preprocess)

In [6]:
# Vectorize the text data
# Each vectorizer is fitted on the training reviews only and then reused,
# unchanged, to transform the test reviews.

# TF-IDF weighted features
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['review'])
X_test_tfidf = tfidf_vectorizer.transform(test_data['review'])

# Bag-of-words (raw term count) features
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(train_data['review'])
X_test_bow = bow_vectorizer.transform(test_data['review'])

In [7]:
# Train logistic regression models
# One classifier per feature representation, configured identically so the
# comparison isolates the effect of the features.

# Model on TF-IDF features
model_tfidf = LogisticRegression(max_iter=1000)
model_tfidf.fit(X_train_tfidf, train_data['sentiment'])

# Model on bag-of-words features
model_bow = LogisticRegression(max_iter=1000)
model_bow.fit(X_train_bow, train_data['sentiment'])

In [8]:
# Make predictions and evaluate the models
# TF-IDF model: predict on the test features, then score against the true labels.
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(test_data['sentiment'], y_pred_tfidf)

# Bag-of-words model: same procedure on the count features.
y_pred_bow = model_bow.predict(X_test_bow)
accuracy_bow = accuracy_score(test_data['sentiment'], y_pred_bow)

print("Accuracy with TF-IDF:", accuracy_tfidf)
print("Accuracy with BoW:", accuracy_bow)

Accuracy with TF-IDF: 0.88228
Accuracy with BoW: 0.86164


In [9]:
# Train and evaluate a KNN classifier
# fit() returns the estimator itself, so construction and fitting are chained.
knn_tfidf = KNeighborsClassifier(n_neighbors=3).fit(
    X_train_tfidf, train_data['sentiment']
)

# Score the 3-nearest-neighbour predictions on the TF-IDF test features.
y_pred_knn_tfidf = knn_tfidf.predict(X_test_tfidf)
accuracy_knn_tfidf = accuracy_score(test_data['sentiment'], y_pred_knn_tfidf)

print("Accuracy with KNN (k=3) using TF-IDF:", accuracy_knn_tfidf)

Accuracy with KNN (k=3) using TF-IDF: 0.62864
