In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

print("Libraries imported successfully!")

Libraries imported successfully!


In [3]:
# Load the dataset from the CSV file
df = pd.read_csv('IMDB Dataset.csv')

# Display the first 5 rows to see what the data looks like
print("Data Head:")
print(df.head())

# Get a summary of the dataset
print("\nData Info:")
df.info()

# Check the balance between positive and negative reviews
print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())

Data Head:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

Sentiment Distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [4]:
# Using a sample of 10,000 reviews for quicker training
df_sample = df.sample(n=10000, random_state=42)

# Define our features (X) and target (y)
X = df_sample['review']  # The text of the review
y = df_sample['sentiment'] # The label (positive/negative)

# Split the data into 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (8000,)
Testing data shape: (2000,)


In [5]:
# Create the pipeline
model_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', LogisticRegression())
])

print("Model pipeline created:")
print(model_pipeline)

Model pipeline created:
Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('classifier', LogisticRegression())])


In [6]:
print("Training the model...")
# The pipeline handles both vectorizing the text and training the classifier
model_pipeline.fit(X_train, y_train)

print("Model training complete! ✅")

Training the model...
Model training complete! ✅


In [9]:
print("Evaluating the model on the test set...")

# Make predictions on the test data
predictions = model_pipeline.predict(X_test)

# Calculate and print the overall accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"\nOverall Accuracy: {accuracy:.4f} ({accuracy:.2%})✅")

# Print a detailed classification report
# This shows precision, recall, and f1-score for each class (positive/negative)
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Evaluating the model on the test set...

Overall Accuracy: 0.8600 (86.00%)✅

Classification Report:
              precision    recall  f1-score   support

    negative       0.88      0.83      0.85       992
    positive       0.84      0.89      0.87      1008

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000



In [10]:
# Example 1: A positive review
my_positive_review = "This movie was fantastic! I really enjoyed the acting and the plot was amazing."
prediction = model_pipeline.predict([my_positive_review])
print(f"Review: '{my_positive_review}'")
print(f"Predicted Sentiment: {prediction[0]} 😊")

print("-" * 30)

# Example 2: A negative review
my_negative_review = "It was a complete waste of time. The script was boring and predictable."
prediction = model_pipeline.predict([my_negative_review])
print(f"Review: '{my_negative_review}'")
print(f"Predicted Sentiment: {prediction[0]} 😠")

Review: 'This movie was fantastic! I really enjoyed the acting and the plot was amazing.'
Predicted Sentiment: positive 😊
------------------------------
Review: 'It was a complete waste of time. The script was boring and predictable.'
Predicted Sentiment: negative 😠
