<a href="https://colab.research.google.com/github/arzhrd/Flipkart-Product-Sentimental-Analysis/blob/main/Untitled7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# FLIPKART SENTIMENT ANALYSIS PROJECT


# Step 1: Import All Required Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS
import warnings
warnings.filterwarnings('ignore')

# Fetch each NLTK resource listed below (nltk.download is called once per name)
for _resource in ('stopwords', 'vader_lexicon', 'punkt', 'wordnet'):
    nltk.download(_resource)

print("✓ All libraries imported successfully!")


# Step 2: Data Ingestion


# Load the Flipkart review CSV into a DataFrame
data = pd.read_csv("/kaggle/input/flipkart-product-customer-reviews-dataset/Dataset-SA.csv")

# Banner followed by the basic size figures
print("\n" + "=" * 50)
print("DATASET OVERVIEW")
print("=" * 50)
print("Dataset Shape:", data.shape)
print("Total Reviews:", len(data))

# Preview of the first rows
print("\nFirst 5 rows:", data.head(), sep="\n")

# Column inventory and per-column count of missing cells
print("\nColumn Names:", data.columns.tolist())
print("\nMissing Values:", data.isna().sum(), sep="\n")

# Number of rows per value of the 'Sentiment' column
print("\nSentiment Distribution:", data['Sentiment'].value_counts(), sep="\n")


# Step 3: Data Cleaning Pipeline using NLTK


print("\n" + "=" * 50)
print("DATA CLEANING & PREPROCESSING")
print("=" * 50)

# Module-level resources read by clean():
#   stopword - NLTK's English stop-word list, held as a set for fast membership tests
#   stemmer  - English Snowball stemmer
stopword = set(stopwords.words('english'))
stemmer = nltk.SnowballStemmer("english")

def clean(text):
    """Normalize one raw review into a string of stemmed, space-separated tokens.

    Processing order: lowercase; remove ``[bracketed]`` spans, URLs and
    HTML-like tags; strip ASCII punctuation; remove every token that
    contains a digit; drop stop words; stem the remaining tokens.

    Args:
        text: The raw review. Any value is accepted because it is passed
            through ``str()`` first.

    Returns:
        The surviving stems joined by single spaces, with no leading or
        trailing whitespace. An empty string when no token survives.

    Note:
        Reads the module-level ``stopword`` set and ``stemmer`` object.
    """
    text = str(text).lower()
    # Raw-string patterns: escapes such as \S or \d inside a normal string
    # literal are invalid escape sequences and trigger warnings on current
    # Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() with no argument treats any whitespace run (spaces, tabs,
    # newlines) as one separator. This keeps words on adjacent lines apart
    # instead of gluing them together, and it never yields empty tokens, so
    # the result carries no stray or doubled spaces.
    tokens = [stemmer.stem(word) for word in text.split() if word not in stopword]
    return " ".join(tokens)


# Replace missing reviews with "" before cleaning: clean() calls str() on its
# input, which would turn a NaN into the literal token "nan" and keep that row
# as if it held real text.
data["cleaned_review"] = data["Review"].fillna("").apply(clean)
data_original_size = len(data)
# Keep only rows with usable text; strip() makes whitespace-only results count
# as empty too.
data = data[data["cleaned_review"].str.strip().str.len() > 0].reset_index(drop=True)
print(f"Original dataset size: {data_original_size}")
print(f"After cleaning: {len(data)}")
# Report the measured number of dropped rows rather than a fixed percentage.
print(f"✓ Removed {data_original_size - len(data)} reviews with no usable text")

# Step 4: TF-IDF Feature Extraction

print("\n" + "=" * 50)
print("TF-IDF FEATURE EXTRACTION")
print("=" * 50)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Map the 'Sentiment' labels to integer codes, stored in a new column
le = LabelEncoder()
data['sentiment_encoded'] = le.fit_transform(data['Sentiment'])

# Features are the cleaned review strings; targets are the encoded labels
X, y = data['cleaned_review'], data['sentiment_encoded']

# 80/20 split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Unigrams and bigrams, limited by min_df=5, max_df=0.8 and max_features=5000
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_df=0.8,
    min_df=5,
    max_features=5000,
)

# The vectorizer is fitted on the training split only, then applied to the test split
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("TF-IDF Feature Matrix Shape:", X_train_tfidf.shape)
print("Training samples:", X_train_tfidf.shape[0])
print("Testing samples:", X_test_tfidf.shape[0])
print("Number of TF-IDF features:", X_train_tfidf.shape[1])
