### Imports

In [8]:
import pandas as pd

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

### Fetch dataset and create training/test data

In [5]:
# Read the review corpus from disk; rows that pandas cannot parse are skipped.
df = pd.read_csv(
    'new_IMDB_Dataset.csv',
    on_bad_lines='skip',
)

# Raw review text is the model input, the sentiment label is the target.
X = df['review']
y = df['sentiment']

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Naive Bayes classifier

In [7]:

# Baseline model: token counts fed into multinomial Naive Bayes, both with
# their default settings.
pipe = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)
pipe.fit(X_train, y_train)

# Score the held-out reviews and show per-class precision / recall / F1.
y_pred = pipe.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

    negative       0.83      0.88      0.85      5000
    positive       0.87      0.82      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



### Naive Bayes - Hyperparameter tuning (grid search)

In [19]:

# Hyperparameter grid. Keys follow the '<pipeline step>__<parameter>' convention
# so GridSearchCV can route each value to the matching step of `pipe` below.
# Grid size: 2 * 3 * 2 * 1 * 6 = 72 candidates, each cross-validated on 5 folds
# (360 fits, plus one final refit of the best candidate on all of X_train).
parameters = {
    'vectorize__binary' : [True, False],                 # term presence vs. raw term counts
    'vectorize__ngram_range': [(1, 1), (1, 2), (1, 3)],  # unigrams up to trigrams
    'vectorize__stop_words': ['english', None],          # drop English stop words or keep everything
    'vectorize__analyzer': ['word'],
    'predictor__alpha': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0],  # additive smoothing strength
}

# Step names here must match the prefixes used in `parameters`.
pipe = Pipeline([
    ('vectorize', CountVectorizer()),
    ('predictor', MultinomialNB()),
])

# The candidate fits are independent of each other, so n_jobs=-1 spreads them
# over all CPU cores; the selected parameters and scores are unchanged.
# Lower n_jobs if memory is tight: every worker builds its own n-gram vocabulary.
clf = GridSearchCV(pipe, parameters, scoring='f1_macro', cv=5, n_jobs=-1)
clf.fit(X_train, y_train)

# predict() uses the best candidate, refitted on the full training set.
y_pred = clf.predict(X_test)

print("best params:", clf.best_params_)
print()
report = classification_report(y_test, y_pred)
print(report)

best params: {'predictor__alpha': 0.2, 'vectorize__analyzer': 'word', 'vectorize__binary': True, 'vectorize__ngram_range': (1, 3), 'vectorize__stop_words': None}

              precision    recall  f1-score   support

    negative       0.88      0.91      0.90      5000
    positive       0.90      0.88      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

