<a href="https://colab.research.google.com/github/atomicamit24/Minor-Projects-using-AIML/blob/main/Sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# --- 1. Load the Dataset ---
print("Loading the dataset...")
# Assuming the user has uploaded the 'IMDB Dataset' file
url_csv = '/content/IMDB Dataset.csv' # Updated filename to match uploaded file
df = pd.read_csv(url_csv)

print("Dataset loaded successfully! Here's a preview:")
print(df.head())

# --- 2. Prepare the Data ---
# Convert sentiment labels from text ('positive'/'negative') to numbers (1/0)
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Separate the reviews (features, X) from the sentiments (labels, y)
X = df['review']
y = df['sentiment']

# Split the data into a training set (80%) and a testing set (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nData split into {len(X_train)} training samples and {len(X_test)} testing samples.")

# --- 3. Feature Extraction (Text to Numbers) ---
# Create and configure the TF-IDF Vectorizer
# This will convert text into numerical vectors, ignoring common English stop words
# and focusing on the top 5000 most frequent words.
print("Converting text to numbers using TF-IDF...")
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Learn the vocabulary from the training data and transform it into vectors
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data using the same learned vocabulary
X_test_tfidf = vectorizer.transform(X_test)
print("Text converted successfully!")

# --- 4. Train the Machine Learning Model ---
# Create the Logistic Regression model
model = LogisticRegression(max_iter=1000) # Increased max_iter for convergence

# Train the model on the training data
print("\nTraining the model...")
model.fit(X_train_tfidf, y_train)
print("Model training complete!")

# --- 5. Evaluate the Model ---
# Make predictions on the unseen test data
print("\nMaking predictions on the test set...")
y_pred = model.predict(X_test_tfidf)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Show a detailed report of the model's performance
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

# --- 6. Test with a New Review ---
print("--- Testing with a new review ---")
# Get input from the user
my_review = input("Please enter the movie review you want to analyze: ")

# Convert the new review into a numerical vector using the same vectorizer
my_review_tfidf = vectorizer.transform([my_review])

# Use the trained model to predict the sentiment
prediction = model.predict(my_review_tfidf)

# Print the final result
if prediction[0] == 1:
    print("\nPrediction for your review: POSITIVE 👍")
else:
    print("\nPrediction for your review: NEGATIVE 👎")

Loading the dataset...
Dataset loaded successfully! Here's a preview:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Data split into 40000 training samples and 10000 testing samples.
Converting text to numbers using TF-IDF...
Text converted successfully!

Training the model...
Model training complete!

Making predictions on the test set...
Accuracy: 88.98%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.90      0.88      0.89      5000
    Positive       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg 

In [21]:
# Download the dataset
!wget -O /content/imdb_dataset.csv https://raw.githubusercontent.com/AnkitMadaan/imdb_dataset/main/imdb_dataset.csv

print("Download complete. The file should now be in the /content/ directory.")

--2025-07-30 10:12:24--  https://raw.githubusercontent.com/AnkitMadaan/imdb_dataset/main/imdb_dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-07-30 10:12:25 ERROR 404: Not Found.

Download complete. The file should now be in the /content/ directory.


In [22]:
import os

# List files in the /content/ directory
files_in_content = os.listdir('/content/')
print("Files in /content/ directory:")
for file in files_in_content:
    print(file)

Files in /content/ directory:
.config
imdb_dataset.csv
IMDB Dataset.csv
.ipynb_checkpoints
sample_data
