In [None]:
# Persist the preprocessed reviews as CSV; index=False keeps the row index out of the file
output_path = r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\data\preprocessed_reviews.csv'
df.to_csv(output_path, index=False)
print(f"Preprocessed data saved to: {output_path}")

# Summarise what was written: record count, class balance and column names
sentiment_labels = df['Sentiment']
final_summary = [
    "\nFinal Dataset Info:",
    f"Total records: {len(df)}",
    f"Positive reviews: {(sentiment_labels == 1).sum()}",
    f"Negative reviews: {(sentiment_labels == 0).sum()}",
    f"Columns: {df.columns.tolist()}",
]
for summary_line in final_summary:
    print(summary_line)

## Save Preprocessed Data

In [None]:
# Minimum number of words a cleaned review must contain to be kept
MIN_WORDS = 3

# Treat a missing cleaned text as an empty string so that the length and
# word-count statistics below are defined for every row.
cleaned_series = df['cleaned_text'].fillna('')

# Check for empty texts after cleaning (whitespace-only text counts as empty)
empty_texts = cleaned_series.str.strip().eq('').sum()
print(f"Empty texts after cleaning: {empty_texts}")

# Calculate text statistics: character length and whitespace-separated word count
df['cleaned_length'] = cleaned_series.str.len()
df['cleaned_word_count'] = cleaned_series.str.split().str.len()

print("\nCleaned Text Statistics:")
print(f"Average length (characters): {df['cleaned_length'].mean():.2f}")
print(f"Average word count: {df['cleaned_word_count'].mean():.2f}")
print(f"Min word count: {df['cleaned_word_count'].min()}")
print(f"Max word count: {df['cleaned_word_count'].max()}")

# One mask drives both the report and the filter, so the printed number is
# always exactly the number of rows removed below (a row whose word count
# cannot be computed fails the comparison and is reported as too short).
keep_mask = df['cleaned_word_count'] >= MIN_WORDS
print(f"\nTexts with less than {MIN_WORDS} words: {(~keep_mask).sum()}")

# Keep only texts with at least MIN_WORDS words
df = df[keep_mask].reset_index(drop=True)
print(f"Dataset shape after removing very short texts: {df.shape}")

## Verify Text Quality

In [None]:
# Run the preprocessor over the 'Review text' column and store the result in
# 'cleaned_text', with stopword removal and normalization switched on.
print("Applying text cleaning pipeline...")
df = preprocessor.process_dataframe(
    df,
    text_column='Review text',
    output_column='cleaned_text',
    remove_stopwords=True,
    normalize=True
)
print("Text cleaning completed!")

# Show up to three before/after samples (first 150 characters of each).
# head(3) is used instead of range(3) + iloc so a frame with fewer than
# three rows prints what it has rather than raising IndexError.
print("\nCLEANED TEXT SAMPLES (after lemmatization and stopword removal):")
print("="*80)
sample_pairs = zip(df['Review text'].head(3), df['cleaned_text'].head(3))
for original_text, cleaned_text in sample_pairs:
    print(f"\nOriginal: {original_text[:150]}...")
    print(f"Cleaned: {cleaned_text[:150]}...")

## Apply Text Cleaning Pipeline

In [None]:
# Initialize the preprocessor with lemmatization enabled
preprocessor = TextPreprocessor(use_lemmatization=True)

# Preview up to three raw reviews (first 200 characters each).
# Iterating over head(3) avoids an IndexError when the frame holds fewer
# than three rows, which range(3) + iloc would raise.
print("ORIGINAL TEXT SAMPLES:")
print("="*80)
for review_number, review_text in enumerate(df['Review text'].head(3), start=1):
    print(f"\nReview {review_number}:")
    print(review_text[:200])

## Initialize Text Preprocessor

In [None]:
# Columns whose null counts are reported before and after the clean-up
audit_columns = ['Review text', 'Ratings', 'Sentiment']

# Null counts per audited column before any rows are touched
print("Missing values before handling:")
print(df[audit_columns].isna().sum())

# Delegate to the project helper, applied to the review text column only.
# NOTE(review): strategy='drop' presumably removes rows whose review text is
# missing -- confirm against preprocessing.handle_missing_values.
df = handle_missing_values(df, text_column='Review text', strategy='drop')

# Report the new shape and re-check the same columns
print(f"\nDataset shape after handling missing values: {df.shape}")
print("Missing values after handling:")
print(df[audit_columns].isna().sum())

## Handle Missing Values

In [None]:
# Load the badminton reviews dataset
data_path = r'C:\Users\admin\Documents\Innomatics\Sentiment\reviews_data_dump\reviews_badminton\data.csv'
df = pd.read_csv(data_path)

# A missing rating compares as False against any threshold, so an unrated
# review would be silently labelled negative. Remove such rows (and say so)
# before deriving labels; when no rating is missing this is a no-op.
missing_ratings = df['Ratings'].isna().sum()
if missing_ratings:
    print(f"Dropping {missing_ratings} reviews with missing ratings")
    df = df.dropna(subset=['Ratings']).reset_index(drop=True)

# Create sentiment labels (ratings >= 3 = positive, < 3 = negative)
df['Sentiment'] = (df['Ratings'] >= 3).astype(int)

# Report dataset size and class balance
print(f"Dataset loaded with {len(df)} reviews")
print(f"Positive reviews: {(df['Sentiment']==1).sum()}")
print(f"Negative reviews: {(df['Sentiment']==0).sum()}")

## Load the Dataset

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import sys
# Put the project's src directory on the module search path so the local
# `preprocessing` module imported below can be found.
sys.path.append(r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\src')

# Project-local text-cleaning helpers
from preprocessing import TextPreprocessor, create_binary_labels, handle_missing_values
import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation and pandas warnings raised by later cells).
warnings.filterwarnings('ignore')

# 2. Text Preprocessing
## Cleaning and Normalizing Review Text

In this notebook, we will:
- Load the badminton reviews dataset and create binary sentiment labels
- Remove special characters and punctuation
- Remove HTML tags and URLs
- Normalize text (lowercase, whitespace)
- Remove stopwords
- Apply lemmatization
- Handle missing values