In [29]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews 

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/syedzaidi/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [30]:
# Function to convert sentences into feature vectors by considering every word as a feature.
def word_feats(words):
  return dict([(word, True) for word in words])

# Extract positive and negative sentences
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')
print('Positive review example: ' + ' '.join(movie_reviews.words(posids[0])))
print('Negative review example: ' + ' '.join(movie_reviews.words(negids[5])))

# Convert sentences into feature vectors
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# Create the dataset and split it into training and testing
negcutoff = int(len(negfeats) * 3/4)
poscutoff = int(len(posfeats) * 3/4)
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

# Train and evaluate the Naive Bayes classifier
classifier = NaiveBayesClassifier.train(trainfeats)
print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
classifier.show_most_informative_features()

Positive review example: films adapted from comic books have had plenty of success , whether they ' re about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there ' s never really been a comic book like from hell before . for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid ' 80s with a 12 - part series called the watchmen . to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . in other words , don ' t dismiss this film because of its source . if you can get past the whole comic book thing , you might find another stumbling block in from hell ' s directors , albert and allen hughes . getting the hughes brothers t

In [31]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Select the top N most frequent words
N = 2000
most_common_words = all_words.most_common(N)
top_words = [word for word, freq in most_common_words]

def word_feats(words, top_words):
    return dict([(word, True) for word in words if word in top_words])

negfeats = [(word_feats(movie_reviews.words(fileids=[f]), top_words), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f]), top_words), 'pos') for f in posids]

trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]


KeyboardInterrupt: 

In [None]:
pip install pandas



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd  

In [None]:
df = pd.DataFrame(most_common_words, columns=['Word', 'Frequency'])
df

Unnamed: 0,Word,Frequency
0,",",77717
1,the,76529
2,.,65876
3,a,38106
4,and,35576
...,...,...
1995,villains,70
1996,critic,70
1997,lets,70
1998,visit,70


In [None]:
top_10_percent_cutoff = int(len(df) * 0.1)
bottom_90_percent_cutoff = int(len(df) * 0.9)
filtered_df = df.iloc[top_10_percent_cutoff:bottom_90_percent_cutoff]

filtered_df

Unnamed: 0,Word,Frequency
200,say,802
201,right,798
202,john,798
203,although,795
204,played,791
...,...,...
1795,babe,79
1796,aspects,79
1797,presents,78
1798,kills,78


In [None]:
pip install scikit-learn



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

# Preprocessing function
def preprocess_texts(texts, use_tfidf=True, remove_stopwords=False, use_stemming=False, use_lemmatization=False, ngram_range=(1,1)):
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
    else:
        stop_words = None

    if use_stemming:
        stemmer = PorterStemmer()
        analyzer = CountVectorizer().build_analyzer()
        def stemmed_words(doc):
            return (stemmer.stem(w) for w in analyzer(doc))
        analyzer_function = stemmed_words
    elif use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        analyzer = CountVectorizer().build_analyzer()
        def lemmatized_words(doc):
            return (lemmatizer.lemmatize(w) for w in analyzer(doc))
        analyzer_function = lemmatized_words
    else:
        analyzer_function = 'word'
    
    if use_tfidf:
        vectorizer = TfidfVectorizer(stop_words=stop_words, analyzer=analyzer_function, ngram_range=ngram_range)
    else:
        vectorizer = CountVectorizer(stop_words=stop_words, analyzer=analyzer_function, ngram_range=ngram_range)
    
    return vectorizer

def evaluate_classification(X_train, y_train, X_test, y_test, vectorizer):
    clf = make_pipeline(vectorizer, MultinomialNB())
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/syedzaidi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/syedzaidi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from nltk import FreqDist, word_tokenize
import numpy as np

In [None]:

# Determining the frequency of terms in each document
pos = [FreqDist(movie_reviews.words(f)) for f in posids]
neg = [FreqDist(movie_reviews.words(f)) for f in negids]

# Adding the labels for each document class
pos_df = pd.DataFrame(pos)
pos_df['label'] = 'pos'
neg_df = pd.DataFrame(neg)
neg_df['label'] = 'neg'

# Concat the pos and neg 
full = pd.concat((pos_df, neg_df), axis=0).fillna(0)

# Compute term frequencies
tf_matrix = full.drop('label', axis=1).div(full.drop('label', axis=1).sum(axis=1), axis=0)

# Compute inverse document frequencies
idf_matrix = full.drop('label', axis=1).sum(axis=0).div(len(full))

# Finding TF-IDF weight
tf_idf = tf_matrix*idf_matrix

# Seperate the positive and negative instances again, and replace 0 with nans
pos_tf_idf = tf_idf.loc[full['label']=='pos', :].replace(0, np.nan)
neg_tf_idf = tf_idf.loc[full['label']=='neg', :].replace(0, np.nan)

# Convert the positive and negative TF-IDF weights back into the NLTK format
posfeats_tf_idf = []
for _, row in pos_tf_idf.iterrows():
    posfeats_tf_idf.append((row.dropna().to_dict(), 'pos')) # <- row.dropna is why 0's are replaced above

negfeats_tf_idf = []
for _, row in neg_tf_idf.iterrows():
    negfeats_tf_idf.append((row.dropna().to_dict(), 'neg'))

print(posfeats_tf_idf[0][0])

{'films': 0.0008909512761020882, 'adapted': 2.668213457076566e-05, 'from': 0.02319721577726218, 'comic': 0.0011281902552204176, 'books': 4.5243619489559165e-05, 'have': 0.005685614849187935, 'had': 0.0026902552204176333, 'plenty': 7.77262180974478e-05, 'of': 0.2771009280742459, 'success': 0.00012529002320185614, ',': 1.938417053364269, 'whether': 0.00012587006960556845, 'they': 0.005597447795823666, "'": 0.44351798143851506, 're': 0.0006566125290023201, 'about': 0.00817401392111369, 'superheroes': 6.960556844547564e-06, '(': 0.12178190255220417, 'batman': 0.00011774941995359629, 'superman': 1.508120649651972e-05, 'spawn': 4.756380510440835e-05, ')': 0.12300348027842227, 'or': 0.005477958236658933, 'geared': 8.120649651972158e-06, 'toward': 5.80046403712297e-05, 'kids': 0.0001902552204176334, 'casper': 1.334106728538283e-05, 'the': 2.041957076566125, 'arthouse': 1.740139211136891e-06, 'crowd': 4.6403712296983755e-05, 'ghost': 4.466357308584687e-05, 'world': 0.0012030162412993038, 'but':

In [None]:
import pandas as pd
from nltk.corpus import movie_reviews
from nltk.probability import FreqDist

# Determining the frequency of terms in each document
pos = [FreqDist(movie_reviews.words(f)) for f in movie_reviews.fileids('pos')]
neg = [FreqDist(movie_reviews.words(f)) for f in movie_reviews.fileids('neg')]

# Adding the labels for each document class
pos_df = pd.DataFrame(pos)
pos_df['label'] = 'pos'
neg_df = pd.DataFrame(neg)
neg_df['label'] = 'neg'

# Concatenate the positive and negative documents
full = pd.concat((pos_df, neg_df), axis=0).fillna(0)

# Compute term frequencies
tf_matrix = full.drop('label', axis=1).div(full.drop('label', axis=1).sum(axis=1), axis=0)

# Separate the positive and negative instances
pos_tf = tf_matrix.loc[full['label']=='pos', :].replace(0, np.nan)
neg_tf = tf_matrix.loc[full['label']=='neg', :].replace(0, np.nan)

# Convert the positive and negative TF weights back into the NLTK format
posfeats_tf = []
for _, row in pos_tf.iterrows():
    posfeats_tf.append((row.dropna().to_dict(), 'pos')) 

negfeats_tf = []
for _, row in neg_tf.iterrows():
    negfeats_tf.append((row.dropna().to_dict(), 'neg'))

print(posfeats_tf[0][0])


{'films': 0.001160092807424594, 'adapted': 0.001160092807424594, 'from': 0.009280742459396751, 'comic': 0.00580046403712297, 'books': 0.001160092807424594, 'have': 0.002320185614849188, 'had': 0.0034802784222737818, 'plenty': 0.001160092807424594, 'of': 0.016241299303944315, 'success': 0.001160092807424594, ',': 0.04988399071925754, 'whether': 0.001160092807424594, 'they': 0.002320185614849188, "'": 0.029002320185614848, 're': 0.001160092807424594, 'about': 0.004640371229698376, 'superheroes': 0.001160092807424594, '(': 0.02088167053364269, 'batman': 0.001160092807424594, 'superman': 0.001160092807424594, 'spawn': 0.001160092807424594, ')': 0.02088167053364269, 'or': 0.0034802784222737818, 'geared': 0.001160092807424594, 'toward': 0.001160092807424594, 'kids': 0.001160092807424594, 'casper': 0.001160092807424594, 'the': 0.05336426914153132, 'arthouse': 0.001160092807424594, 'crowd': 0.001160092807424594, 'ghost': 0.001160092807424594, 'world': 0.002320185614849188, 'but': 0.00812064965

In [None]:
import pandas as pd
from nltk.corpus import movie_reviews
from nltk.probability import FreqDist
from nltk.corpus import stopwords

# Download NLTK's stop words list
import nltk
nltk.download('stopwords')

# Get the stop words list
stop_words = set(stopwords.words('english'))

# Determining the frequency of terms in each document after stop words removal
pos = [FreqDist(word for word in movie_reviews.words(f) if word.lower() not in stop_words) for f in movie_reviews.fileids('pos')]
neg = [FreqDist(word for word in movie_reviews.words(f) if word.lower() not in stop_words) for f in movie_reviews.fileids('neg')]

# Adding the labels for each document class
pos_df = pd.DataFrame(pos)
pos_df['label'] = 'pos'
neg_df = pd.DataFrame(neg)
neg_df['label'] = 'neg'

# Concatenate the positive and negative 
full = pd.concat((pos_df, neg_df), axis=0).fillna(0)

# Compute term frequencies after stop words removal
tf_matrix = full.drop('label', axis=1).div(full.drop('label', axis=1).sum(axis=1), axis=0)

pos_tf = tf_matrix.loc[full['label']=='pos', :].replace(0, np.nan)
neg_tf = tf_matrix.loc[full['label']=='neg', :].replace(0, np.nan)

# Convert the positive and negative TF weights back into the NLTK format
posfeats_tf = []
for _, row in pos_tf.iterrows():
    posfeats_tf.append((row.dropna().to_dict(), 'pos')) 

negfeats_tf = []
for _, row in neg_tf.iterrows():
    negfeats_tf.append((row.dropna().to_dict(), 'neg'))

print(posfeats_tf[0][0])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/syedzaidi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'films': 0.0018214936247723133, 'adapted': 0.0018214936247723133, 'comic': 0.009107468123861567, 'books': 0.0018214936247723133, 'plenty': 0.0018214936247723133, 'success': 0.0018214936247723133, ',': 0.07832422586520947, 'whether': 0.0018214936247723133, "'": 0.04553734061930783, 'superheroes': 0.0018214936247723133, '(': 0.03278688524590164, 'batman': 0.0018214936247723133, 'superman': 0.0018214936247723133, 'spawn': 0.0018214936247723133, ')': 0.03278688524590164, 'geared': 0.0018214936247723133, 'toward': 0.0018214936247723133, 'kids': 0.0018214936247723133, 'casper': 0.0018214936247723133, 'arthouse': 0.0018214936247723133, 'crowd': 0.0018214936247723133, 'ghost': 0.0018214936247723133, 'world': 0.0036429872495446266, 'never': 0.0036429872495446266, 'really': 0.0036429872495446266, 'book': 0.00546448087431694, 'like': 0.007285974499089253, 'hell': 0.009107468123861567, '.': 0.04189435336976321, 'starters': 0.0018214936247723133, 'created': 0.0018214936247723133, 'alan': 0.0018214

In [None]:

# Determining the frequency of terms in each document
pos = [FreqDist(movie_reviews.words(f)) for f in movie_reviews.fileids('pos')]
neg = [FreqDist(movie_reviews.words(f)) for f in movie_reviews.fileids('neg')]

# Adding the labels for each document class
pos_df = pd.DataFrame(pos)
pos_df['label'] = 'pos'
neg_df = pd.DataFrame(neg)
neg_df['label'] = 'neg'

full = pd.concat((pos_df, neg_df), axis=0).fillna(0)

# Compute term frequencies
tf_matrix = full.drop('label', axis=1).div(full.drop('label', axis=1).sum(axis=1), axis=0)

# Separate the positive and negative instances 
pos_tf = tf_matrix.loc[full['label']=='pos', :].replace(0, np.nan)
neg_tf = tf_matrix.loc[full['label']=='neg', :].replace(0, np.nan)

posfeats_tf = []
for _, row in pos_tf.iterrows():
    posfeats_tf.append((row.dropna().to_dict(), 'pos')) 

negfeats_tf = []
for _, row in neg_tf.iterrows():
    negfeats_tf.append((row.dropna().to_dict(), 'neg'))

print(posfeats_tf[0][0])


{'films': 0.001160092807424594, 'adapted': 0.001160092807424594, 'from': 0.009280742459396751, 'comic': 0.00580046403712297, 'books': 0.001160092807424594, 'have': 0.002320185614849188, 'had': 0.0034802784222737818, 'plenty': 0.001160092807424594, 'of': 0.016241299303944315, 'success': 0.001160092807424594, ',': 0.04988399071925754, 'whether': 0.001160092807424594, 'they': 0.002320185614849188, "'": 0.029002320185614848, 're': 0.001160092807424594, 'about': 0.004640371229698376, 'superheroes': 0.001160092807424594, '(': 0.02088167053364269, 'batman': 0.001160092807424594, 'superman': 0.001160092807424594, 'spawn': 0.001160092807424594, ')': 0.02088167053364269, 'or': 0.0034802784222737818, 'geared': 0.001160092807424594, 'toward': 0.001160092807424594, 'kids': 0.001160092807424594, 'casper': 0.001160092807424594, 'the': 0.05336426914153132, 'arthouse': 0.001160092807424594, 'crowd': 0.001160092807424594, 'ghost': 0.001160092807424594, 'world': 0.002320185614849188, 'but': 0.00812064965

In [None]:

# Determining the frequency of unigrams in each document
pos = [FreqDist(movie_reviews.words(f)) for f in movie_reviews.fileids('pos')]
neg = [FreqDist(movie_reviews.words(f)) for f in movie_reviews.fileids('neg')]

# Adding the labels for each document class
pos_df = pd.DataFrame(pos)
pos_df['label'] = 'pos'
neg_df = pd.DataFrame(neg)
neg_df['label'] = 'neg'


full = pd.concat((pos_df, neg_df), axis=0).fillna(0)

# Compute term frequencies for unigrams
tf_matrix = full.drop('label', axis=1).div(full.drop('label', axis=1).sum(axis=1), axis=0)

# Separate the positive and negative instances
pos_tf = tf_matrix.loc[full['label']=='pos', :].replace(0, np.nan)
neg_tf = tf_matrix.loc[full['label']=='neg', :].replace(0, np.nan)

posfeats_tf = []
for _, row in pos_tf.iterrows():
    posfeats_tf.append((row.dropna().to_dict(), 'pos')) 

negfeats_tf = []
for _, row in neg_tf.iterrows():
    negfeats_tf.append((row.dropna().to_dict(), 'neg'))

print(posfeats_tf[0][0])


{'films': 0.001160092807424594, 'adapted': 0.001160092807424594, 'from': 0.009280742459396751, 'comic': 0.00580046403712297, 'books': 0.001160092807424594, 'have': 0.002320185614849188, 'had': 0.0034802784222737818, 'plenty': 0.001160092807424594, 'of': 0.016241299303944315, 'success': 0.001160092807424594, ',': 0.04988399071925754, 'whether': 0.001160092807424594, 'they': 0.002320185614849188, "'": 0.029002320185614848, 're': 0.001160092807424594, 'about': 0.004640371229698376, 'superheroes': 0.001160092807424594, '(': 0.02088167053364269, 'batman': 0.001160092807424594, 'superman': 0.001160092807424594, 'spawn': 0.001160092807424594, ')': 0.02088167053364269, 'or': 0.0034802784222737818, 'geared': 0.001160092807424594, 'toward': 0.001160092807424594, 'kids': 0.001160092807424594, 'casper': 0.001160092807424594, 'the': 0.05336426914153132, 'arthouse': 0.001160092807424594, 'crowd': 0.001160092807424594, 'ghost': 0.001160092807424594, 'world': 0.002320185614849188, 'but': 0.00812064965

In [None]:

from nltk.corpus import movie_reviews
from nltk.probability import FreqDist
from nltk import ngrams

# Function to generate unigrams and bigrams from a list of words
def extract_ngrams(words):
    unigrams = list(words)
    bigrams = list(ngrams(words, 2))
    return unigrams + bigrams

# Determining the frequency of unigrams and bigrams in each document
pos = [FreqDist(extract_ngrams(movie_reviews.words(f))) for f in movie_reviews.fileids('pos')]
neg = [FreqDist(extract_ngrams(movie_reviews.words(f))) for f in movie_reviews.fileids('neg')]

# Adding the labels for each document class
pos_df = pd.DataFrame(pos)
pos_df['label'] = 'pos'
neg_df = pd.DataFrame(neg)
neg_df['label'] = 'neg'

full = pd.concat((pos_df, neg_df), axis=0).fillna(0)

# Compute term frequencies for unigrams and bigrams
# Divide each term (column) for each document by the sum of each row
tf_matrix = full.drop('label', axis=1).div(full.drop('label', axis=1).sum(axis=1), axis=0)

pos_tf = tf_matrix.loc[full['label']=='pos', :].replace(0, np.nan)
neg_tf = tf_matrix.loc[full['label']=='neg', :].replace(0, np.nan)

posfeats_tf = []
for _, row in pos_tf.iterrows():
    posfeats_tf.append((row.dropna().to_dict(), 'pos')) 

negfeats_tf = []
for _, row in neg_tf.iterrows():
    negfeats_tf.append((row.dropna().to_dict(), 'neg'))

print(posfeats_tf[0][0])


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x107ecb810>>
Traceback (most recent call last):
  File "/Users/syedzaidi/Library/Python/3.11/lib/python/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x107ecb810>>
Traceback (most recent call last):
  File "/Users/syedzaidi/Library/Python/3.11/lib/python/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


{'films': 0.0005803830528148578, 'adapted': 0.0005803830528148578, 'from': 0.0046430644225188625, 'comic': 0.002901915264074289, 'books': 0.0005803830528148578, 'have': 0.0011607661056297156, 'had': 0.0017411491584445734, 'plenty': 0.0005803830528148578, 'of': 0.00812536273940801, 'success': 0.0005803830528148578, ',': 0.024956471271038887, 'whether': 0.0005803830528148578, 'they': 0.0011607661056297156, "'": 0.014509576320371444, 're': 0.0005803830528148578, 'about': 0.0023215322112594312, 'superheroes': 0.0005803830528148578, '(': 0.01044689495066744, 'batman': 0.0005803830528148578, 'superman': 0.0005803830528148578, 'spawn': 0.0005803830528148578, ')': 0.01044689495066744, 'or': 0.0017411491584445734, 'geared': 0.0005803830528148578, 'toward': 0.0005803830528148578, 'kids': 0.0005803830528148578, 'casper': 0.0005803830528148578, 'the': 0.026697620429483458, 'arthouse': 0.0005803830528148578, 'crowd': 0.0005803830528148578, 'ghost': 0.0005803830528148578, 'world': 0.0011607661056297

In [None]:
pip install --upgrade nltk



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:

import nltk
nltk.download('all', force=True)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/syedzaidi/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/syedzaidi/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/syedzaidi/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/syedzaidi/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/syedzaidi/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |     /Users/syedzaidi/

KeyboardInterrupt: 

In [1]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

from nltk.corpus import movie_reviews
documents = [(movie_reviews.words(file_id), category) for category in movie_reviews.categories() for file_id in movie_reviews.fileids(category)]

reviews = [" ".join(document) for document, category in documents]
labels = [category for document, category in documents]

X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)

# Define pipelines for each classifier
pipelines = {
    'BernoulliNB': Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', BernoulliNB()),
    ]),
    'MultinomialNB': Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]),
    'LogisticRegression': Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(max_iter=1000)),
    ]),
    'SVM': Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC()),
    ]),
}

# Define parameter grids for grid search
param_grids = {
    'BernoulliNB': {
        'vect__binary': [True, False],
        'clf__alpha': [0.1, 0.5, 1.0],
    },
    'MultinomialNB': {
        'vect__binary': [True, False],
        'clf__alpha': [0.1, 0.5, 1.0],
    },
    'LogisticRegression': {
        'clf__C': [0.1, 1.0, 10.0],
    },
    'SVM': {
        'clf__C': [0.1, 1.0, 10.0],
        'clf__kernel': ['linear', 'rbf'],
    },
}

# Perform grid search and evaluate each classifier
for classifier_name, pipeline in pipelines.items():
    print(f"Classifier: {classifier_name}")
    grid_search = GridSearchCV(pipeline, param_grids[classifier_name], cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    print("Best parameters found:")
    print(grid_search.best_params_)
    
    # Evaluate on test set
    y_pred = grid_search.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("="*50)


Classifier: BernoulliNB
Best parameters found:
{'clf__alpha': 0.1, 'vect__binary': True}
Classification Report:
              precision    recall  f1-score   support

         neg       0.74      0.85      0.79       199
         pos       0.82      0.70      0.76       201

    accuracy                           0.78       400
   macro avg       0.78      0.78      0.77       400
weighted avg       0.78      0.78      0.77       400

Classifier: MultinomialNB
Best parameters found:
{'clf__alpha': 1.0, 'vect__binary': True}
Classification Report:
              precision    recall  f1-score   support

         neg       0.83      0.86      0.84       199
         pos       0.86      0.83      0.84       201

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400

Classifier: LogisticRegression
Best parameters found:
{'clf__C': 10.0}
Classification Report:
              precision   