# 🧠 Sentiment Analysis using NLTK
This notebook demonstrates how to build a simple sentiment analysis model using the NLTK library with the Movie Reviews dataset.

In [3]:
import nltk
from nltk.corpus import movie_reviews
import random

nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/amiteshsinha/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/amiteshsinha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 📂 Step 2: Load and Prepare Data

In [5]:
# Load labeled movie reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the dataset
random.shuffle(documents)

print("Sample document:", documents[0][:50])  # preview first 50 tokens

Sample document: (['look', 'back', 'at', 'all', 'the', 'times', 'in', 'your', 'life', 'when', 'there', 'was', 'a', 'fork', 'in', 'the', 'path', 'to', 'the', 'future', '.', 'some', 'sort', 'of', 'decision', 'had', 'to', 'be', 'made', ',', 'and', ',', 'for', 'better', 'or', 'worse', ',', 'it', 'irrevocably', 'altered', 'the', 'course', 'of', 'your', 'existence', '.', '>', 'from', 'time', '-', 'to', '-', 'time', ',', 'everyone', 'thinks', 'about', 'the', 'roads', 'not', 'taken', ',', 'and', 'how', 'things', 'might', 'have', 'turned', 'out', 'if', 'the', 'choice', 'had', 'been', 'different', '.', 'perhaps', 'even', 'more', 'dizzying', 'to', 'contemplate', 'is', 'how', 'a', 'seemingly', 'minor', 'action', '--', 'catching', 'the', '10', 'am', 'train', ',', 'for', 'example', '--', 'could', 'have', 'an', 'equally', 'profound', ',', 'yet', 'less', 'obvious', ',', 'impact', '.', 'maybe', 'that', "'", 's', 'where', 'you', 'met', 'your', 'significant', 'other', ',', 'and', ',', 'had', 'you', 'reac

## ✨ Step 3: Feature Extraction Function

In [7]:
# Build a frequency distribution of all words
all_words = nltk.FreqDist(word.lower() for word in movie_reviews.words())

# Select top 2000 most common words as features
word_features = list(all_words)[:2000]

def document_features(document):
    words = set(document)
    return {word: (word in words) for word in word_features}

## 🧠 Step 4: Train a Naive Bayes Classifier

In [9]:
# Create feature sets
featuresets = [(document_features(d), c) for (d, c) in documents]

# Train-test split
train_set, test_set = featuresets[:1900], featuresets[1900:]

# Train classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

## ✅ Step 5: Evaluate the Model

In [11]:
print("Accuracy:", nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(10)

Accuracy: 0.82
Most Informative Features
             outstanding = True              pos : neg    =     11.2 : 1.0
                   mulan = True              pos : neg    =      7.7 : 1.0
                  seagal = True              neg : pos    =      7.0 : 1.0
             wonderfully = True              pos : neg    =      6.6 : 1.0
                   damon = True              pos : neg    =      6.3 : 1.0
                   flynt = True              pos : neg    =      5.7 : 1.0
                  wasted = True              neg : pos    =      5.6 : 1.0
                    lame = True              neg : pos    =      5.5 : 1.0
                   waste = True              neg : pos    =      5.5 : 1.0
              ridiculous = True              neg : pos    =      5.0 : 1.0


## 💬 Step 6: Try Custom Sentiment Predictions

In [13]:
custom_review = "This movie was absolutely fantastic, with amazing acting and a great story!"
custom_tokens = custom_review.lower().split()
features = document_features(custom_tokens)
print("Sentiment:", classifier.classify(features))

Sentiment: neg
