In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [2]:
# Read the labelled movie reviews from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="movies_sentiment_data.csv")

# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [3]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(19000, 2)

In [8]:
# Encode the target as an integer label: 1 when sentiment is exactly
# 'positive', 0 for every other value. The comparison is vectorized over the
# whole column rather than calling a Python lambda once per row; the explicit
# int64 cast keeps the dtype the per-row version produced.
df['category'] = (df['sentiment'] == 'positive').astype('int64')


In [9]:
# Display the frame (pandas truncates long frames) to eyeball the 'category' column next to 'sentiment'.
df

Unnamed: 0,review,sentiment,category
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive,1
1,I enjoyed the movie and the story immensely! I...,positive,1
2,I had a hard time sitting through this. Every ...,negative,0
3,It's hard to imagine that anyone could find th...,negative,0
4,This is one military drama I like a lot! Tom B...,positive,1
...,...,...,...
18995,- Bad Stuff: This movie is real crap. Bad stun...,negative,0
18996,"If you've seen the trailer for this movie, you...",positive,1
18997,This has to be the all time best computer anim...,positive,1
18998,I've seen 'NSNA' just after I've seen all Roge...,positive,1


In [10]:
# Count the rows per label value to check how balanced the two classes are.
df['category'].value_counts()

1    9500
0    9500
Name: category, dtype: int64

In [11]:
# Hold out 20% of the reviews for evaluation.
# random_state pins the shuffle so the same rows land in each split on every
# Restart & Run All; without it the train/test membership changes per run.
X_train, X_test, y_train, y_test = train_test_split(
    df.review,
    df.category,
    test_size=0.2,
    random_state=42,
)

# RandomForest Classifier

In [12]:
# Bag-of-words counts feeding a 50-tree random forest that splits on entropy.
pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer()),  # Convert text to numerical features using CountVectorizer
    ('classifier', RandomForestClassifier(
        n_estimators=50,
        criterion='entropy',
        random_state=42,  # seed the forest's internal sampling so it is deterministic for a given training set
    )),
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Print the classification report (per-class precision, recall, F1 and support)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.86      0.84      1882
           1       0.86      0.82      0.84      1918

    accuracy                           0.84      3800
   macro avg       0.84      0.84      0.84      3800
weighted avg       0.84      0.84      0.84      3800



# K Neighbours Classifier

In [13]:
# Bag-of-words counts feeding a 10-neighbour vote measured with Euclidean distance.
pipeline = Pipeline(steps=[
    ('count_vectorizer', CountVectorizer()),
    ('classifier', KNeighborsClassifier(metric='euclidean', n_neighbors=10)),
])

# fit() returns the pipeline itself, so training and prediction can be chained:
# train on the training reviews, then label the held-out reviews.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report per-class precision, recall and F1 on the held-out reviews.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.63      0.67      0.65      1882
           1       0.65      0.61      0.63      1918

    accuracy                           0.64      3800
   macro avg       0.64      0.64      0.64      3800
weighted avg       0.64      0.64      0.64      3800



# Multinomial Naive Bayes Classifier

In [14]:
# Two-stage pipeline: word counts first, then a Multinomial Naive Bayes classifier.
pipeline = Pipeline(
    steps=[
        ('count_vectorizer', CountVectorizer()),
        ('classifier', MultinomialNB()),
    ]
)

# Train on the training reviews and their labels.
pipeline.fit(X=X_train, y=y_train)

# Label the held-out reviews.
y_pred = pipeline.predict(X=X_test)

# Summarise precision, recall and F1 for each class.
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1882
           1       0.88      0.82      0.85      1918

    accuracy                           0.85      3800
   macro avg       0.86      0.85      0.85      3800
weighted avg       0.86      0.85      0.85      3800



### Can you write some observations on why a model like KNN fails to produce good results, unlike RandomForest and MultinomialNB?

1. Machine learning algorithms do not work on text data directly, so we need to convert the text into numeric vectors and feed those to the models during training.
2. In this process, we convert each text into a very high-dimensional numeric vector using the Bag of Words technique.
3. A model like K-Nearest Neighbours (KNN) doesn't work well with high-dimensional data. With a very large number of mostly-zero dimensions, the distances between points become nearly indistinguishable (the "curse of dimensionality"), so the nearest neighbours are no longer a reliable guide to the label. Calculating distances across every dimension in such a space is also expensive, which further impacts the performance of the model.
4. The easy calculation of probabilities for the words in the corpus (Bag of Words), and storing them in a contingency table, is the major reason Multinomial Naive Bayes is a text-classification-friendly algorithm.
5. Random Forest uses bootstrapping (row and column sampling) across many decision trees, which helps it overcome the high variance and overfitting that high-dimensional data causes; it also uses the feature importance of words to classify the categories better.
6. Machine learning is like a trial-and-error scientific method: we keep trying the algorithms available to us and select the one that gives good results and satisfies requirements such as latency, interpretability, etc.