In [None]:
import pandas as pd

# Read the IMDB reviews CSV (the path points into /content/drive/MyDrive)
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/NLP project/IMDB Dataset.csv')

# Quick sanity check: show the first five rows
df.head(n=5)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Count missing values per column
df.isna().sum()

Unnamed: 0,0
review,0
sentiment,0


In [None]:
# List the distinct label values present in the sentiment column
pd.unique(df['sentiment'])

array(['positive', 'negative'], dtype=object)

In [None]:
# Encode the string labels as integers: 'positive' -> 1, 'negative' -> 0.
# The dtype guard makes the cell safe to re-run: calling .map() a second time
# on the already-encoded 0/1 column would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    encoded_sentiment = df['sentiment'].map({
        'positive' : 1,
        'negative' : 0
    })
    # .map() yields NaN for any value that is not a key of the dict;
    # fail loudly instead of passing NaN labels on to the model.
    if encoded_sentiment.isna().any():
        raise ValueError(
            "Unexpected value(s) in 'sentiment'; expected only 'positive' or 'negative'"
        )
    df['sentiment'] = encoded_sentiment

In [None]:
# Text Cleaning


- Convert text to lowercase
- Remove HTML tags
- Remove all punctuation
- Remove digits
- Remove URL links
- Remove stopwords
- Remove emoji and special characters




In [None]:
import re
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords

In [None]:
import re
import string
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords

class TextCleaningTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn compatible transformer that normalises raw review text.

    Every string document goes through these steps, in order:
    lower-casing, HTML-tag removal, URL removal, punctuation removal,
    removal of digits and non-ASCII characters, and stop-word removal.
    Entries that are not ``str`` are turned into an empty string.
    """

    # Patterns and the translation table never change, so they are built once
    # here instead of once per document inside the transform loop.
    _HTML_TAG_RE = re.compile(r'<.*?>')
    _URL_RE = re.compile(r'http\S+|www\S+')
    # ASCII digits plus every character outside the ASCII range.
    _DIGIT_OR_NON_ASCII_RE = re.compile(r'[0-9]|[^\x00-\x7f]')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Load the English stop-word list; nothing is learned from ``X``.

        Returns:
            self, so the transformer can be chained inside a Pipeline.
        """
        self.stop_words = set(stopwords.words('english'))
        return self

    def _clean(self, txt):
        """Return the cleaned version of a single string document."""
        txt = txt.lower()
        # Replace tags with a space rather than nothing: otherwise text such
        # as "great.<br /><br />the" is fused into the single token "greatthe".
        txt = self._HTML_TAG_RE.sub(' ', txt)
        txt = self._URL_RE.sub('', txt)
        txt = txt.translate(self._PUNCT_TABLE)
        txt = self._DIGIT_OR_NON_ASCII_RE.sub('', txt)
        # split()/join also collapses any run of whitespace left by the steps above.
        return ' '.join(w for w in txt.split() if w not in self.stop_words)

    def transform(self, X):
        """Clean every document in ``X``.

        Args:
            X: iterable of documents; non-string entries map to "".

        Returns:
            list of cleaned strings, one per input entry, in input order.
        """
        return [self._clean(txt) if isinstance(txt, str) else "" for txt in X]


In [None]:
import nltk
# Fetch the NLTK stopword corpus that TextCleaningTransformer.fit reads
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are the raw review texts; the target is the sentiment column
X, y = df['review'], df['sentiment']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preview the training reviews (the rendered output is truncated)
X_train

Unnamed: 0,review
39087,That's what I kept asking myself during the ma...
30893,I did not watch the entire movie. I could not ...
45278,A touching love story reminiscent of In the M...
16398,This latter-day Fulci schlocker is a totally a...
13653,"First of all, I firmly believe that Norwegian ..."
...,...
11284,`Shadow Magic' recaptures the joy and amazemen...
44732,I found this movie to be quite enjoyable and f...
38158,Avoid this one! It is a terrible movie. So wha...
860,This production was quite a surprise for me. I...


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score , classification_report

In [None]:
# Chain text cleaning -> TF-IDF features -> logistic regression into one estimator
pipeline = Pipeline(steps=[
    ('cleaning', TextCleaningTransformer()),
    ('tfidf', TfidfVectorizer()),
    ('model', LogisticRegression(random_state=42)),
])

In [None]:
# Candidate hyper-parameter values, keyed as "<pipeline step>__<parameter>"
param_dist = dict(
    # TF-IDF vectoriser settings
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__max_features=[3000, 5000],
    # logistic regression settings
    model__C=[0.01, 0.1, 1, 10],
    model__max_iter=[500, 1000],
)


In [None]:
# Randomised search: 10 sampled candidates, each scored by 5-fold CV accuracy
random_search = RandomizedSearchCV(
    pipeline,
    param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
    verbose=2,
    random_state=42,
)


In [None]:
# Run the search on the training split only (50 fits in total, per the log below)
random_search.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END model__C=10, model__max_iter=1000, tfidf__max_features=3000, tfidf__ngram_range=(1, 2); total time=  46.5s
[CV] END model__C=10, model__max_iter=1000, tfidf__max_features=3000, tfidf__ngram_range=(1, 2); total time=  40.4s
[CV] END model__C=10, model__max_iter=1000, tfidf__max_features=3000, tfidf__ngram_range=(1, 2); total time=  33.9s
[CV] END model__C=10, model__max_iter=1000, tfidf__max_features=3000, tfidf__ngram_range=(1, 2); total time=  36.6s
[CV] END model__C=10, model__max_iter=1000, tfidf__max_features=3000, tfidf__ngram_range=(1, 2); total time=  33.5s
[CV] END model__C=0.1, model__max_iter=1000, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=  35.7s
[CV] END model__C=0.1, model__max_iter=1000, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=  33.8s
[CV] END model__C=0.1, model__max_iter=1000, tfidf__max_features=5000, tfidf__ngram_range=(1, 2); total time=  35.1s
[CV] END

In [None]:
# Report the best score found by the search (scoring='accuracy')
print("Best CV Score:", random_search.best_score_)


Best CV Score: 0.8811499999999999


In [None]:
# Show the parameter combination behind the best score
print("Best Parameters Found:")
random_search.best_params_


Best Parameters Found:


{'tfidf__ngram_range': (1, 2),
 'tfidf__max_features': 3000,
 'model__max_iter': 500,
 'model__C': 1}

In [None]:
# Keep a handle on the search's best estimator for evaluation below
best_model = random_search.best_estimator_


In [None]:
# Display the selected pipeline
best_model

In [None]:
from sklearn.metrics import accuracy_score

# Score the selected pipeline on the held-out test split
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Test Accuracy:", test_accuracy)


Test Accuracy: 0.8862


In [None]:

from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split
test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(test_report)


              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [None]:
# Two hand-written reviews as a smoke test of the fitted pipeline
new_reviews = [
    "This movie was amazing and full of emotions",
    "Worst movie ever, I regret watching it",
]
new_predictions = best_model.predict(new_reviews)

for review, pred in zip(new_reviews, new_predictions):
    # Label 1 is reported as positive; any other value as negative
    if pred == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    print(f"Review: {review}")
    print(f"Prediction: {sentiment}\n")


Review: This movie was amazing and full of emotions
Prediction: Positive

Review: Worst movie ever, I regret watching it
Prediction: Negative



In [None]:

import joblib
# Persist the search's best estimator to disk.
# NOTE(review): the saved object contains a TextCleaningTransformer, a class
# defined in this notebook — presumably that class must be defined/importable
# wherever the file is loaded; confirm before deploying.
joblib.dump(random_search.best_estimator_, "sentiment_model.pkl")


['sentiment_model.pkl']

NameError: name 'df' is not defined