In [None]:
# Final data-quality report: record counts, missing values, inferred dtypes,
# headline findings and the preprocessing to-do list.
banner = "=" * 80

print(banner)
print("DATA QUALITY SUMMARY")
print(banner)
print(f"\nTotal Records: {len(df)}")
print(f"Duplicate Records: {df.duplicated().sum()}")
print("\nMissing Values:")
print(df.isnull().sum())

# Columns that are expected to be numeric; the printed dtype shows what pandas inferred.
print("\nData type mismatches:")
for numeric_col in ('Ratings', 'Up Votes', 'Down Votes'):
    print(f"- {numeric_col} should be numeric: {df[numeric_col].dtype}")

print("\n" + banner)
print("CRITICAL FINDINGS:")
print(banner)

# Share of each sentiment class as a percentage of all rows.
total_rows = len(df)
positive_pct = (df['Sentiment'] == 1).sum() / total_rows * 100
negative_pct = (df['Sentiment'] == 0).sum() / total_rows * 100
print(f"1. Class Imbalance: {positive_pct:.2f}% positive vs {negative_pct:.2f}% negative")

# Remaining findings are plain null counts for the named columns.
null_findings = [
    ("2. Missing Review Text", 'Review text'),
    ("3. Missing Ratings", 'Ratings'),
]
for finding, column in null_findings:
    print(f"{finding}: {df[column].isnull().sum()} records")

print("\nAction Items for Preprocessing:")
action_items = [
    "Handle missing values",
    "Clean and normalize text",
    "Remove special characters and stopwords",
    "Apply lemmatization/stemming",
    "Consider class imbalance during model training",
]
for item in action_items:
    print(f"- {item}")

## Data Quality Summary

In [None]:
# Analyze upvotes and downvotes by sentiment.
# Prints overall summary statistics for each vote column, then draws one
# box plot panel per vote column, split into negative (0) and positive (1) reviews.
vote_columns = ['Up Votes', 'Down Votes']
sentiment_names = {0: 'Negative', 1: 'Positive'}

for position, vote_col in enumerate(vote_columns):
    # A blank line separates the second statistics section from the first.
    section_title = f"{vote_col} Statistics:"
    print(section_title if position == 0 else "\n" + section_title)
    print(f"Mean: {df[vote_col].mean():.2f}")
    print(f"Median: {df[vote_col].median():.2f}")
    print(f"Max: {df[vote_col].max():.2f}")

# Visualize votes by sentiment
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, vote_col in zip(axes, vote_columns):
    # Drop missing votes per group: NaN values would otherwise propagate into
    # the quartile computation and can leave a box empty.
    vote_groups = [
        df.loc[df['Sentiment'] == sentiment_value, vote_col].dropna()
        for sentiment_value in sentiment_names
    ]
    ax.boxplot(vote_groups)
    # Name the ticks explicitly rather than via boxplot(labels=...), which is
    # deprecated since Matplotlib 3.9; boxes sit at positions 1..N by default.
    ax.set_xticks(range(1, len(vote_groups) + 1))
    ax.set_xticklabels(list(sentiment_names.values()))
    ax.set_ylabel(vote_col)
    ax.set_title(f'{vote_col} Distribution by Sentiment')
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Votes Analysis

In [None]:
# Display sample positive and negative reviews
def print_sample_reviews(reviews, sentiment_value, heading, n_samples=3, max_chars=200):
    """Print the first few reviews that carry the given sentiment label.

    Args:
        reviews: DataFrame with 'Sentiment', 'Review Title', 'Review text'
            and 'Ratings' columns.
        sentiment_value: Value of the 'Sentiment' column to select.
        heading: Banner text printed between two separator lines.
        n_samples: Maximum number of reviews to print.
        max_chars: Number of leading characters of the review text to show.
    """
    print("="*80)
    print(heading)
    print("="*80)
    columns = ['Review Title', 'Review text', 'Ratings']
    samples = reviews.loc[reviews['Sentiment'] == sentiment_value, columns].head(n_samples)
    for _, row in samples.iterrows():
        # A missing rating cannot be converted with int(); show a placeholder instead.
        rating = row['Ratings']
        rating_label = f"{int(rating)} stars" if pd.notna(rating) else "N/A"
        # A missing review text is NaN (a float) and cannot be sliced.
        text = row['Review text']
        preview = f"{str(text)[:max_chars]}..." if pd.notna(text) else "(no review text)"
        print(f"\nTitle: {row['Review Title']}")
        print(f"Rating: {rating_label}")
        print(f"Review: {preview}")


print_sample_reviews(df, 1, "SAMPLE POSITIVE REVIEWS (Rating >= 3):")
print()  # blank line between the two sections
print_sample_reviews(df, 0, "SAMPLE NEGATIVE REVIEWS (Rating < 3):")

## Sample Reviews Analysis

In [None]:
# Analyze review text lengths
# Missing review text is treated as an empty string, so it counts as length 0.
review_text = df['Review text'].fillna('')
df['review_length'] = review_text.apply(len)
df['review_word_count'] = review_text.apply(lambda text: len(str(text).split()))

char_lengths = df['review_length']
word_counts = df['review_word_count']
print("Review Text Statistics:")
print(f"Average review length (characters): {char_lengths.mean():.2f}")
print(f"Average review length (words): {word_counts.mean():.2f}")
print(f"Max review length (characters): {char_lengths.max()}")
print(f"Max review length (words): {word_counts.max()}")

# Visualize review lengths by sentiment
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

is_negative = df['Sentiment'] == 0
is_positive = df['Sentiment'] == 1

# One panel per length feature: (column, x-axis label, panel title)
panels = [
    ('review_length', 'Review Length (Characters)', 'Review Length Distribution by Sentiment'),
    ('review_word_count', 'Review Word Count', 'Word Count Distribution by Sentiment'),
]
for ax, (feature, x_label, panel_title) in zip(axes, panels):
    ax.hist(
        [df.loc[is_negative, feature], df.loc[is_positive, feature]],
        bins=30,
        label=['Negative', 'Positive'],
        color=['#FF6B6B', '#4ECDC4'],
        alpha=0.7,
    )
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Review Text Analysis

In [None]:
# Create binary sentiment labels
# Ratings >= 3: Positive (1)
# Ratings < 3: Negative (0)
# NOTE: a missing rating compares False against the threshold, so such rows
# are labelled Negative (0); the count is reported below so this is visible.
df['Sentiment'] = (df['Ratings'] >= 3).astype(int)

print("Sentiment Distribution:")
print(df['Sentiment'].value_counts())
print(f"\nPositive Reviews: {(df['Sentiment']==1).sum()} ({(df['Sentiment']==1).sum()/len(df)*100:.2f}%)")
print(f"Negative Reviews: {(df['Sentiment']==0).sum()} ({(df['Sentiment']==0).sum()/len(df)*100:.2f}%)")

unrated_reviews = int(df['Ratings'].isnull().sum())
if unrated_reviews:
    print(f"\nNote: {unrated_reviews} reviews have no rating and were labelled Negative (0).")

# Visualize sentiment distribution
plt.figure(figsize=(8, 5))
sentiment_labels = {0: 'Negative', 1: 'Positive'}
colors = ['#FF6B6B', '#4ECDC4']  # positional: index 0 -> Negative, index 1 -> Positive
# value_counts() orders by frequency; reindex to a fixed 0, 1 order so every bar
# gets the colour that belongs to its label (and an absent class shows as 0).
counts = df['Sentiment'].value_counts().reindex(list(sentiment_labels), fill_value=0)
plt.bar([sentiment_labels[i] for i in counts.index], counts.values, color=colors, edgecolor='black')
plt.ylabel('Count')
plt.title('Sentiment Distribution of Reviews')
plt.grid(axis='y', alpha=0.3)
for i, v in enumerate(counts.values):
    # Anchor the label's bottom edge at the bar top so placement does not
    # depend on the magnitude of the counts.
    plt.text(i, v, str(v), ha='center', va='bottom', fontweight='bold')
plt.tight_layout()
plt.show()

## Create Sentiment Labels

In [None]:
# Analyze rating distribution
# Count each distinct rating once and reuse the result for the printout and both charts.
rating_counts = df['Ratings'].value_counts().sort_index()

print("Rating Distribution:")
print(rating_counts)
print("\nRating Statistics:")
print(f"Mean: {df['Ratings'].mean():.2f}")
print(f"Median: {df['Ratings'].median():.2f}")
print(f"Std Dev: {df['Ratings'].std():.2f}")

# Visualize rating distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart of exact counts per rating value. Ratings are discrete, so a
# histogram with bins=5 places bars at fractional positions and can show a
# spurious gap when the lowest or highest rating is absent from the data.
axes[0].bar(rating_counts.index, rating_counts.values, edgecolor='black', color='skyblue')
axes[0].set_xticks(list(rating_counts.index))
axes[0].set_xlabel('Rating')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Ratings')
axes[0].grid(axis='y', alpha=0.3)

# Pie chart
axes[1].pie(rating_counts.values, labels=[f'{int(r)} stars' for r in rating_counts.index],
            autopct='%1.1f%%', startangle=90)
axes[1].set_title('Percentage Distribution of Ratings')

plt.tight_layout()
plt.show()

## Rating Distribution Analysis

In [None]:
# Get basic information
print("Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare; wrapping it in print() would add a stray "None" line to the output.
df.info()
print("\n" + "="*50)
print("\nStatistical Summary:")
print(df.describe())
print("\n" + "="*50)
print("\nMissing Values:")
print(df.isnull().sum())

## Basic Data Information

In [None]:
# Load the badminton reviews dataset
# The CSV location defaults to the original local path but can be overridden
# with the REVIEWS_DATA_PATH environment variable, so the notebook can run on
# other machines without editing this cell.
DEFAULT_DATA_PATH = r'C:\Users\admin\Documents\Innomatics\Sentiment\reviews_data_dump\reviews_badminton\data.csv'
data_path = os.environ.get('REVIEWS_DATA_PATH', DEFAULT_DATA_PATH)

# Fail early with an actionable message instead of a bare read error.
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Reviews CSV not found at {data_path!r}. "
        "Set the REVIEWS_DATA_PATH environment variable to the location of data.csv."
    )

df = pd.read_csv(data_path)

print(f"Dataset shape: {df.shape}")
print("\nColumn names and types:")
print(df.dtypes)
print("\nFirst 5 rows:")
df.head()

## Load the Dataset

In [None]:
# Import necessary libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Plot defaults: seaborn white-grid theme and a 12x6 inch default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# 1. Exploratory Data Analysis (EDA)
## Sentiment Analysis of Flipkart YONEX Badminton Reviews

In this notebook, we will:
- Load the badminton reviews dataset
- Explore data structure and basic statistics
- Analyze rating distribution
- Create binary sentiment labels from the ratings
- Examine review characteristics (text length, sample reviews, up/down votes)
- Identify data quality issues