In [None]:
!pip install pandas beautifulsoup4 nltk scikit-learn




In [1]:
# to get dataset from google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from bs4 import BeautifulSoup


# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Make variables
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean the IMdb Dataset
def clean_text(review):
    # Cleaning HTML tags
    review = BeautifulSoup(review, 'html.parser').get_text()
    # Removing Special Characters
    review = re.sub("[^a-zA-Z]", " ", review)
    # Converting Reviews to lowercase
    review = review.lower()
    # Tokenizing reviews
    words = nltk.word_tokenize(review)
    # Removing stopwords and lemmatizing
    foundWords= [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    cleanReviws = ' '.join(foundWords)
    return cleanReviws

# Function to preprocess data and return cleaned data
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    df['cleanReviws'] = df['review'].apply(clean_text)
    return df[['cleanReviws']]

# Function to vectorize the data using TF-IDF
def vectorize_data(df):
    # max_features will take the top specified words from the total vectorized words
    vectorize = TfidfVectorizer(max_features=5000)
    X = vectorize.fit_transform(df['cleanReviws']).toarray()
    return X, vectorize

# File path to dataset (in google drive)
file_path = '/content/drive/My Drive/Colab Notebooks/IMDB Dataset.csv'

# Preprocess the data
df = preprocess_data(file_path)

# Print the first 10 cleaned reviews
print(df['cleanReviws'].head(10))

# Clean Reviews saved in new file
output_path = '/content/drive/My Drive/Colab Notebooks/dataCleaned.csv'
df.to_csv(output_path, index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
  review = BeautifulSoup(review, 'html.parser').get_text()


0    one reviewer mentioned watching oz episode hoo...
1    wonderful little production filming technique ...
2    thought wonderful way spend time hot summer we...
3    basically family little boy jake think zombie ...
4    petter mattei love time money visually stunnin...
5    probably time favorite movie story selflessnes...
6    sure would like see resurrection dated seahunt...
7    show amazing fresh innovative idea first aired...
8    encouraged positive comment film looking forwa...
9    like original gut wrenching laughter like movi...
Name: cleanReviws, dtype: object


In [3]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from bs4 import BeautifulSoup

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Make variables
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean the IMDB Dataset
def clean_text(review):
    # Cleaning HTML tags
    review = BeautifulSoup(review, 'html.parser').get_text()
    # Removing Special Characters
    review = re.sub("[^a-zA-Z]", " ", review)
    # Converting Reviews to lowercase
    review = review.lower()
    # Tokenizing reviews
    words = nltk.word_tokenize(review)
    # Removing stopwords and lemmatizing
    foundWords = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    cleanReviews = ' '.join(foundWords)
    return cleanReviews

# Function to preprocess data and return cleaned data
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    df['cleanReviews'] = df['review'].apply(clean_text)
    return df[['cleanReviews', 'sentiment']]

# File path to dataset (in Google Drive)
file_path = '/content/drive/My Drive/Colab Notebooks/IMDB Dataset.csv'

# Preprocess the data
df = preprocess_data(file_path)

# Print the first 10 cleaned reviews
print(df.head(10))

# Save cleaned reviews and sentiment to a new file
output_path = '/content/drive/My Drive/Colab Notebooks/dataCleaned.csv'
df.to_csv(output_path, index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  review = BeautifulSoup(review, 'html.parser').get_text()


                                        cleanReviews sentiment
0  one reviewer mentioned watching oz episode hoo...  positive
1  wonderful little production filming technique ...  positive
2  thought wonderful way spend time hot summer we...  positive
3  basically family little boy jake think zombie ...  negative
4  petter mattei love time money visually stunnin...  positive
5  probably time favorite movie story selflessnes...  positive
6  sure would like see resurrection dated seahunt...  positive
7  show amazing fresh innovative idea first aired...  negative
8  encouraged positive comment film looking forwa...  negative
9  like original gut wrenching laughter like movi...  positive


In [9]:
# Importing libraries for model training
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Loading the cleaned reviews
file_path = '/content/drive/My Drive/Colab Notebooks/dataCleaned.csv'
df = pd.read_csv(file_path)

# Splitting the dataset into features and target variable
X = df['cleanReviews']
y = df['sentiment']

# Using TF-IDF Vectorizer for feature extraction
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=3000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the evaluation metrics
print(f'Accuracy of Model: {accuracy:.3f}')
print(f'Precision of Model: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 Score: {f1:.3f}')

# Testing the model with sample sentences
sample_sentences = ["The stunning visuals and compelling storyline make this movie a must-watch for any cinema lover.",
                     "The pacing was incredibly slow, making it hard to stay engaged with the story.",
                     "This film masterfully blends humor and drama, leaving the audience both entertained and moved."]

# Vectorizing the sample sentences
sample_tfidf = tfidf_vectorizer.transform(sample_sentences)

# Sentiment Prediction
sample_predictions = model.predict(sample_tfidf)

# Printing Prediction
for sentence, sentiment in zip(sample_sentences, sample_predictions):
    print(f'Sentence: "{sentence}" - Sentiment: {sentiment}')


Accuracy of Model: 0.888
Precision of Model: 0.888
Recall: 0.888
F1 Score: 0.888
Sentence: "The stunning visuals and compelling storyline make this movie a must-watch for any cinema lover." - Sentiment: positive
Sentence: "The pacing was incredibly slow, making it hard to stay engaged with the story." - Sentiment: negative
Sentence: "This film masterfully blends humor and drama, leaving the audience both entertained and moved." - Sentiment: positive
