# Part 1 - Data Handling and Preprocessing
## Intelligent Customer Feedback Analysis System

**Objective:** Clean and preprocess 1,000+ customer feedback records

**Tasks:**
- Load and explore the dataset
- Remove duplicates and special characters
- Tokenization, lemmatization, stopword removal
- Handle missing or noisy data
- Save cleaned dataset

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Text processing
import re
import string

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Warnings
import warnings
# NOTE(review): with no category given, this ignores every warning for the
# whole session, so deprecation notices from the libraries used here will not
# be shown — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set display options
# Show all columns, and up to 100 characters per cell, when frames are displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

print("✓ All libraries imported successfully!")

## 2. Download NLTK Resources

In [None]:
# Download required NLTK data.
# 'punkt_tab' is the tokenizer data that newer NLTK releases load for
# word_tokenize(); 'punkt' is kept for older releases.
nltk_resources = ['punkt', 'punkt_tab', 'stopwords', 'wordnet', 'averaged_perceptron_tagger', 'omw-1.4']

# Resources that could not be fetched, so the final message reflects reality.
failed_resources = []

for resource in nltk_resources:
    try:
        # nltk.download() signals most failures by returning False rather than
        # raising, so the return value has to be checked explicitly.
        downloaded = nltk.download(resource, quiet=True)
    except Exception as e:
        print(f"✗ Error downloading {resource}: {e}")
        failed_resources.append(resource)
        continue

    if downloaded:
        print(f"✓ Downloaded: {resource}")
    else:
        print(f"✗ Error downloading {resource}: download failed")
        failed_resources.append(resource)

if failed_resources:
    print(f"\n✗ NLTK resources missing: {', '.join(failed_resources)}")
else:
    print("\n✓ NLTK resources ready!")

## 3. Load Dataset

In [None]:
# Load the dataset
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('../dataset/dataset.csv')

# Report the size of the loaded frame (rows = records, columns = features).
print("Dataset loaded successfully!")
print(f"\nDataset shape: {df.shape}")
print(f"Total records: {len(df):,}")
print(f"Total features: {len(df.columns)}")

## 4. Initial Data Exploration

In [None]:
# Display first few rows
print("First 5 rows of the dataset:\n")
# Bare expression: the notebook renders it because it is the cell's last statement.
df.head()

In [None]:
# Display dataset info
print("Dataset Information:\n")
# info() writes its summary (columns, non-null counts, dtypes) straight to the output.
df.info()

In [None]:
# Statistical summary
print("Statistical Summary:\n")
# include='all' summarizes non-numeric columns as well as numeric ones.
df.describe(include='all')

In [None]:
# Check for missing values
print("Missing Values Analysis:\n")

# Per-column null counts, computed once and reused for the percentage column
null_counts = df.isnull().sum().values
missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing Count': null_counts,
    'Missing Percentage': (null_counts / len(df) * 100).round(2)
})

# Keep only the columns that actually contain nulls
has_missing = missing_data['Missing Count'] > 0
missing_data = missing_data[has_missing]

if missing_data.empty:
    print("✓ No missing values found!")
else:
    print(missing_data)

In [None]:
# Check for duplicate rows
# Counts rows that repeat an earlier row in every column.
duplicate_count = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_count}")

# Check duplicates based on feedback text
# Counts rows whose 'feedback_text' repeats an earlier row, whatever the other columns hold.
duplicate_feedback = df.duplicated(subset=['feedback_text']).sum()
print(f"Duplicate feedback texts: {duplicate_feedback}")

In [None]:
# Sentiment distribution
print("\nSentiment Distribution:\n")
# Number of records per sentiment label, most frequent first
sentiment_dist = df['sentiment_label'].value_counts()
print(sentiment_dist)
# Same breakdown as percentages of the non-missing labels, rounded to 2 decimals
print(f"\nSentiment Percentages:\n{(df['sentiment_label'].value_counts(normalize=True) * 100).round(2)}")

In [None]:
# Visualize sentiment distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# value_counts() orders labels by frequency, so a fixed colour list would paint
# whichever label happens to be most common red. Colours are looked up per
# label instead; labels outside the mapping fall back to the neutral blue.
label_counts = df['sentiment_label'].value_counts()
label_palette = {'negative': '#FF6B6B', 'positive': '#4ECDC4', 'neutral': '#45B7D1'}
label_colors = [label_palette.get(label, '#45B7D1') for label in label_counts.index]

# Count plot
label_counts.plot(kind='bar', ax=axes[0], color=label_colors)
axes[0].set_title('Sentiment Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Sentiment', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].tick_params(axis='x', rotation=0)

# Pie chart (same label -> colour mapping as the bar chart)
label_counts.plot(kind='pie', ax=axes[1], autopct='%1.1f%%', colors=label_colors)
axes[1].set_title('Sentiment Distribution (Percentage)', fontsize=14, fontweight='bold')
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Feedback length analysis
# Cast once so missing/non-string cells are measured as their string form
raw_text = df['feedback_text'].astype(str)
df['feedback_length'] = raw_text.map(len)
df['word_count'] = raw_text.map(lambda text: len(text.split()))

print("\nFeedback Length Statistics:")
print(f"Average character length: {df['feedback_length'].mean():.2f}")
print(f"Average word count: {df['word_count'].mean():.2f}")
print(f"Min word count: {df['word_count'].min()}")
print(f"Max word count: {df['word_count'].max()}")

## 5. Data Cleaning

In [None]:
# Create a copy for cleaning
# The cleaning steps work on df_clean, leaving the loaded df as it was.
df_clean = df.copy()

print(f"Original dataset size: {len(df_clean):,} records")

In [None]:
# Step 1: Handle missing values
print("\n=== Step 1: Handling Missing Values ===")

# Drop rows where feedback_text is missing
before_drop = len(df_clean)
df_clean = df_clean.dropna(subset=['feedback_text'])
after_drop = len(df_clean)

print(f"Rows dropped due to missing feedback_text: {before_drop - after_drop}")

# Fill missing sentiment labels with 'neutral' (if any).
# The result is assigned back to the column: calling fillna(inplace=True) on
# the df_clean['sentiment_label'] selection is chained assignment, which pandas
# deprecates and which does not modify df_clean under Copy-on-Write.
if df_clean['sentiment_label'].isnull().any():
    df_clean['sentiment_label'] = df_clean['sentiment_label'].fillna('neutral')
    print("Filled missing sentiment labels with 'neutral'")

# Convert all feedback to string
df_clean['feedback_text'] = df_clean['feedback_text'].astype(str)

print(f"✓ Current dataset size: {len(df_clean):,} records")

In [None]:
# Step 2: Remove duplicates
print("\n=== Step 2: Removing Duplicates ===")

before_dedup = len(df_clean)

# Two passes in one chain: first rows identical in every column, then rows
# that share a feedback text (the first occurrence is the one kept).
df_clean = (
    df_clean
    .drop_duplicates()
    .drop_duplicates(subset=['feedback_text'], keep='first')
)

after_dedup = len(df_clean)

print(f"Duplicate rows removed: {before_dedup - after_dedup}")
print(f"✓ Current dataset size: {len(df_clean):,} records")

In [None]:
# Step 3: Remove very short feedback (less than 3 words)
print("\n=== Step 3: Removing Short/Invalid Feedback ===")

before_filter = len(df_clean)

# Whitespace-delimited word count per record, held in a standalone Series
# rather than a temporary column on the frame
words_per_record = df_clean['feedback_text'].apply(lambda x: len(str(x).split()))
long_enough = words_per_record >= 3
df_clean = df_clean.loc[long_enough].copy()

after_filter = len(df_clean)

print(f"Short feedback removed (< 3 words): {before_filter - after_filter}")
print(f"✓ Current dataset size: {len(df_clean):,} records")

## 6. Text Preprocessing Functions

In [None]:
# Initialize lemmatizer and stopwords
# Module-level objects used by lemmatize_tokens() and remove_stopwords().
lemmatizer = WordNetLemmatizer()
# Stored as a set so each membership test in remove_stopwords() is a hash lookup.
stop_words = set(stopwords.words('english'))

# Define text cleaning function
def clean_text(text):
    """
    Normalize raw feedback text ahead of tokenization.

    Steps, in order:
    - Convert to lowercase
    - Remove URLs, email addresses, @mentions and #hashtags
    - Remove HTML tags
    - Remove digits
    - Remove punctuation
    - Collapse runs of whitespace

    Every removed span is replaced by a space rather than deleted, so the
    words on either side are not fused together ("good,bad" -> "good bad",
    "nice<br>product" -> "nice product"). Apostrophes are the one exception:
    they are deleted so a contraction stays a single token ("don't" -> "dont").

    Args:
        text: The raw feedback string.

    Returns:
        The cleaned, lowercase, single-spaced string (possibly empty).
    """
    # Convert to lowercase
    text = text.lower()

    # Remove URLs ('http\S+' already matches https links)
    text = re.sub(r'http\S+|www\S+', ' ', text)

    # Remove email addresses (must run before the @mention rule below)
    text = re.sub(r'\S+@\S+', ' ', text)

    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', ' ', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)

    # Remove numbers
    text = re.sub(r'\d+', ' ', text)

    # Remove punctuation: apostrophes are dropped, everything else becomes a space
    apostrophe = "'"
    spaced_punct = string.punctuation.replace(apostrophe, '')
    text = text.translate(
        str.maketrans(spaced_punct, ' ' * len(spaced_punct), apostrophe)
    )

    # Remove extra whitespace (also trims the padding introduced above)
    text = ' '.join(text.split())

    return text


def tokenize_text(text):
    """
    Split text into word tokens.

    Uses NLTK's word_tokenize. If the tokenizer's data files are not
    installed (NLTK reports that with LookupError), falls back to plain
    whitespace splitting. Any other exception propagates instead of being
    swallowed, so real errors and Ctrl-C are not hidden.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings.
    """
    try:
        return word_tokenize(text)
    except LookupError:
        # Tokenizer data unavailable: degrade to whitespace tokens
        return text.split()


def remove_stopwords(tokens):
    """
    Filter a token list down to its informative words.

    A token is kept only when it is absent from the module-level
    ``stop_words`` set and is longer than two characters.
    """
    kept = []
    for word in tokens:
        # Skip stopwords and very short tokens
        if word in stop_words or len(word) <= 2:
            continue
        kept.append(word)
    return kept


def lemmatize_tokens(tokens):
    """
    Return a new list holding the lemma of every token, in order.

    Each token is passed to the module-level ``lemmatizer`` without a
    part-of-speech argument.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas


def preprocess_text(text):
    """
    Run one piece of raw text through the whole pipeline.

    Stages, applied in order: clean_text -> tokenize_text ->
    remove_stopwords -> lemmatize_tokens.

    Returns the list of lemmatized tokens.
    """
    cleaned_string = clean_text(text)
    raw_tokens = tokenize_text(cleaned_string)
    content_tokens = remove_stopwords(raw_tokens)
    return lemmatize_tokens(content_tokens)

print("✓ Text preprocessing functions defined!")

## 7. Test Preprocessing Functions

In [None]:
# Test with sample text
# Walk the first cleaned record through each stage, printing every intermediate result.
sample_text = df_clean['feedback_text'].iloc[0]
divider = "\n" + "="*80 + "\n"

print("Original Text:")
print(sample_text)
print(divider)

cleaned = clean_text(sample_text)
print("After Cleaning:")
print(cleaned)
print(divider)

tokens = tokenize_text(cleaned)
print("After Tokenization:")
print(tokens)
print(divider)

no_stopwords = remove_stopwords(tokens)
print("After Removing Stopwords:")
print(no_stopwords)
print(divider)

lemmatized = lemmatize_tokens(no_stopwords)
print("After Lemmatization:")
print(lemmatized)
print(divider)

# Same record through the single-call pipeline, for comparison with the steps above
preprocessed = preprocess_text(sample_text)
print("Complete Preprocessing:")
print(preprocessed)
print(f"\nProcessed tokens: {' '.join(preprocessed)}")

## 8. Apply Preprocessing to Entire Dataset

In [None]:
print("Starting text preprocessing...\n")
print("This may take a few minutes for large datasets...\n")

# Apply cleaning
print("Step 1/4: Cleaning text...")
df_clean['cleaned_text'] = df_clean['feedback_text'].apply(clean_text)
print("✓ Text cleaning completed!\n")

# Apply tokenization
print("Step 2/4: Tokenizing text...")
df_clean['tokens'] = df_clean['cleaned_text'].apply(tokenize_text)
print("✓ Tokenization completed!\n")

# Remove stopwords
print("Step 3/4: Removing stopwords...")
df_clean['tokens_no_stopwords'] = df_clean['tokens'].apply(remove_stopwords)
print("✓ Stopwords removal completed!\n")

# Lemmatize
print("Step 4/4: Lemmatizing tokens...")
df_clean['lemmatized_tokens'] = df_clean['tokens_no_stopwords'].apply(lemmatize_tokens)
print("✓ Lemmatization completed!\n")

# Create processed text column (tokens joined back into string)
df_clean['processed_text'] = df_clean['lemmatized_tokens'].apply(lambda x: ' '.join(x))

# Drop records that lost every token (e.g. feedback made only of stopwords or
# very short words). An empty string is written to CSV as an empty field and
# is read back as a missing value, which would break downstream consumers.
empty_processed = df_clean['processed_text'] == ''
if empty_processed.any():
    df_clean = df_clean.loc[~empty_processed].copy()
    print(f"Removed {int(empty_processed.sum())} records with no tokens left after preprocessing\n")

print("="*80)
print("✓ ALL PREPROCESSING COMPLETED SUCCESSFULLY!")
print("="*80)

## 9. Post-Preprocessing Analysis

In [None]:
# Compare before and after
print("Comparison of Original vs Processed Text:\n")
print("="*100)

# head(5) yields at most five rows, so a dataset smaller than five records no
# longer raises IndexError the way a fixed range(5) positional lookup would.
preview = df_clean.head(5)
for i, (original, processed) in enumerate(zip(preview['feedback_text'], preview['processed_text'])):
    print(f"\nExample {i+1}:")
    # Both texts are truncated to 150 characters to keep the listing compact
    print(f"Original: {original[:150]}...")
    print(f"Processed: {processed[:150]}...")
    print("-"*100)

In [None]:
# Token count statistics
# Number of lemmatized tokens remaining for each record
df_clean['token_count'] = df_clean['lemmatized_tokens'].apply(len)

# Summary of the per-record token counts (mean, min, max, median)
print("\nToken Count Statistics (After Preprocessing):")
print(f"Average tokens per feedback: {df_clean['token_count'].mean():.2f}")
print(f"Min tokens: {df_clean['token_count'].min()}")
print(f"Max tokens: {df_clean['token_count'].max()}")
print(f"Median tokens: {df_clean['token_count'].median():.2f}")

In [None]:
# Visualize token distribution
# One figure, two side-by-side axes: histogram on the left, box plot on the right
token_fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Histogram with a dashed marker at the mean
hist_ax.hist(df_clean['token_count'], bins=50, color='skyblue', edgecolor='black')
hist_ax.set_title('Distribution of Token Counts', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Number of Tokens', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.axvline(df_clean['token_count'].mean(), color='red', linestyle='--', label=f'Mean: {df_clean["token_count"].mean():.2f}')
hist_ax.legend()

# Box plot of the same counts
box_ax.boxplot(df_clean['token_count'], vert=True)
box_ax.set_title('Token Count Box Plot', fontsize=14, fontweight='bold')
box_ax.set_ylabel('Number of Tokens', fontsize=12)

plt.tight_layout()
plt.show()

## 10. Create Final Cleaned Dataset

In [None]:
# Select relevant columns for final dataset
final_columns = [
    # identifiers / metadata
    'id', 'source', 'date',
    # text at each stage: original, cleaned, fully processed, token list
    'feedback_text', 'cleaned_text', 'processed_text', 'lemmatized_tokens',
    # labels
    'rating', 'sentiment_label',
    # length features
    'token_count', 'word_count', 'feedback_length',
]

# Create final dataset: pick the columns in that order and renumber rows from 0
df_final = df_clean.loc[:, final_columns].reset_index(drop=True)

print(f"Final cleaned dataset shape: {df_final.shape}")
print(f"Total records: {len(df_final):,}")
print(f"\nColumns in final dataset:")
for i, col in enumerate(df_final.columns, 1):
    print(f"{i}. {col}")

In [None]:
# Display sample of final dataset
print("\nSample of Final Cleaned Dataset:\n")
# Bare expression: the notebook renders the first 10 rows as the cell output.
df_final.head(10)

In [None]:
# Final data quality check
print("\n" + "="*80)
print("FINAL DATA QUALITY REPORT")
print("="*80)

# DataFrame.duplicated() hashes every cell and raises TypeError on the
# list-valued 'lemmatized_tokens' column, so duplicates are counted on the
# remaining columns ('processed_text' carries the same tokens as a string).
hashable_view = df_final.drop(columns=['lemmatized_tokens'], errors='ignore')

print(f"\n1. Dataset Size: {len(df_final):,} records")
print(f"2. Missing Values: {df_final.isnull().sum().sum()}")
print(f"3. Duplicate Records: {hashable_view.duplicated().sum()}")
print(f"\n4. Sentiment Distribution:")
print(df_final['sentiment_label'].value_counts())
print(f"\n5. Average Feedback Length: {df_final['feedback_length'].mean():.2f} characters")
print(f"6. Average Word Count: {df_final['word_count'].mean():.2f} words")
print(f"7. Average Token Count (processed): {df_final['token_count'].mean():.2f} tokens")

print("\n" + "="*80)
print("✓ DATA CLEANING AND PREPROCESSING COMPLETED SUCCESSFULLY!")
print("="*80)

## 11. Save Cleaned Dataset

In [None]:
# Save cleaned dataset to CSV
output_path = '../dataset/cleaned_customer_feedback.csv'

# Convert tokens list to string for CSV storage: assign() returns a new frame
# whose token lists are flattened to space-separated strings, leaving df_final
# with its list column.
df_to_save = df_final.assign(
    lemmatized_tokens=df_final['lemmatized_tokens'].map(' '.join)
)

# Save to CSV (the row index is not written)
df_to_save.to_csv(output_path, index=False)

print(f"✓ Cleaned dataset saved to: {output_path}")
print(f"✓ File size: {len(df_to_save):,} records")
print(f"✓ Columns saved: {len(df_to_save.columns)}")

In [None]:
# Also save a minimal version for model training
minimal_path = '../dataset/cleaned_feedback_minimal.csv'

# Only the identifier, the processed text and the two label columns are kept
df_minimal = df_final.loc[:, ['id', 'processed_text', 'sentiment_label', 'rating']].copy()
df_minimal.to_csv(minimal_path, index=False)

print(f"\n✓ Minimal dataset saved to: {minimal_path}")
print(f"✓ Columns: {list(df_minimal.columns)}")

## 12. Summary Statistics and Visualization

In [None]:
# Create comprehensive summary visualization
# Layout: 3x3 grid — row 0 is one wide sentiment chart, row 1 holds three
# histograms, row 2 holds the rating chart plus a wide sentiment-by-rating chart.
fig = plt.figure(figsize=(16, 10))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# One label -> colour mapping shared by both sentiment charts, so a given
# sentiment keeps its colour even when a class is absent from the data.
sentiment_palette = {'negative': '#FF6B6B', 'positive': '#4ECDC4'}
fallback_color = '#45B7D1'  # used for 'neutral' and any other label

# 1. Sentiment Distribution
ax1 = fig.add_subplot(gs[0, :])
sentiment_counts = df_final['sentiment_label'].value_counts()
colors = [sentiment_palette.get(s, fallback_color) for s in sentiment_counts.index]
bars = ax1.bar(sentiment_counts.index, sentiment_counts.values, color=colors, edgecolor='black', linewidth=1.5)
ax1.set_title('Sentiment Distribution in Cleaned Dataset', fontsize=16, fontweight='bold', pad=20)
ax1.set_xlabel('Sentiment', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
# Annotate each bar with its count and its share of all records
for bar in bars:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height):,}\n({height/len(df_final)*100:.1f}%)',
             ha='center', va='bottom', fontsize=11, fontweight='bold')

# 2. Word Count Distribution
ax2 = fig.add_subplot(gs[1, 0])
ax2.hist(df_final['word_count'], bins=40, color='#95E1D3', edgecolor='black')
ax2.set_title('Word Count Distribution', fontsize=12, fontweight='bold')
ax2.set_xlabel('Words', fontsize=10)
ax2.set_ylabel('Frequency', fontsize=10)
ax2.axvline(df_final['word_count'].mean(), color='red', linestyle='--', linewidth=2)

# 3. Token Count Distribution
ax3 = fig.add_subplot(gs[1, 1])
ax3.hist(df_final['token_count'], bins=40, color='#F38181', edgecolor='black')
ax3.set_title('Token Count Distribution (After Processing)', fontsize=12, fontweight='bold')
ax3.set_xlabel('Tokens', fontsize=10)
ax3.set_ylabel('Frequency', fontsize=10)
ax3.axvline(df_final['token_count'].mean(), color='blue', linestyle='--', linewidth=2)

# 4. Character Length Distribution
ax4 = fig.add_subplot(gs[1, 2])
ax4.hist(df_final['feedback_length'], bins=40, color='#AA96DA', edgecolor='black')
ax4.set_title('Character Length Distribution', fontsize=12, fontweight='bold')
ax4.set_xlabel('Characters', fontsize=10)
ax4.set_ylabel('Frequency', fontsize=10)
ax4.axvline(df_final['feedback_length'].mean(), color='orange', linestyle='--', linewidth=2)

# 5. Rating Distribution
ax5 = fig.add_subplot(gs[2, 0])
df_final['rating'].value_counts().sort_index().plot(kind='bar', ax=ax5, color='#FCBAD3', edgecolor='black')
ax5.set_title('Rating Distribution', fontsize=12, fontweight='bold')
ax5.set_xlabel('Rating', fontsize=10)
ax5.set_ylabel('Count', fontsize=10)
ax5.tick_params(axis='x', rotation=0)

# 6. Sentiment by Rating
ax6 = fig.add_subplot(gs[2, 1:])
sentiment_rating = pd.crosstab(df_final['rating'], df_final['sentiment_label'])
# Colours follow the crosstab's column labels instead of a positional list
rating_colors = [sentiment_palette.get(s, fallback_color) for s in sentiment_rating.columns]
sentiment_rating.plot(kind='bar', stacked=True, ax=ax6,
                      color=rating_colors, edgecolor='black')
ax6.set_title('Sentiment Distribution by Rating', fontsize=12, fontweight='bold')
ax6.set_xlabel('Rating', fontsize=10)
ax6.set_ylabel('Count', fontsize=10)
ax6.legend(title='Sentiment', fontsize=9)
ax6.tick_params(axis='x', rotation=0)

plt.suptitle('Data Preprocessing Summary - Part 1 Complete',
             fontsize=18, fontweight='bold', y=0.995)

plt.show()

## 13. Final Summary

In [None]:
# Banner: every line is exactly banner_width characters wide; the title is
# centred with str.center() instead of hand-counted padding.
banner_width = 100
print("\n" + "#"*banner_width)
print("#" + " "*(banner_width - 2) + "#")
print("#" + "PART 1 - DATA HANDLING COMPLETED".center(banner_width - 2) + "#")
print("#" + " "*(banner_width - 2) + "#")
print("#"*banner_width)

print("\n📊 DELIVERABLES:")
print("   ✓ Cleaned dataset with 1,000+ records")
print("   ✓ Data preprocessing notebook (this file)")
print("   ✓ Removed duplicates and special characters")
print("   ✓ Performed tokenization, lemmatization, stopword removal")
print("   ✓ Handled missing and noisy data")

print("\n📁 OUTPUT FILES:")
print("   1. ../dataset/cleaned_customer_feedback.csv (Full cleaned dataset)")
print("   2. ../dataset/cleaned_feedback_minimal.csv (Minimal for training)")

# Measure data quality instead of asserting it. The list-valued
# 'lemmatized_tokens' column is left out of the duplicate check because
# DataFrame.duplicated() cannot hash lists.
missing_total = int(df_final.isnull().sum().sum())
duplicate_total = int(
    df_final.drop(columns=['lemmatized_tokens'], errors='ignore').duplicated().sum()
)

print("\n📈 DATASET STATISTICS:")
print(f"   • Total Records: {len(df_final):,}")
print(f"   • Features: {len(df_final.columns)}")
print(f"   • Sentiments: {df_final['sentiment_label'].nunique()} classes")
print(f"   • Average Tokens: {df_final['token_count'].mean():.2f}")
if missing_total == 0 and duplicate_total == 0:
    print(f"   • Data Quality: 100% (No missing values, No duplicates)")
else:
    print(f"   • Data Quality: {missing_total} missing values, {duplicate_total} duplicate records")

print("\n✅ READY FOR PART 2: Sentiment Classification Model")
print("\n" + "#"*banner_width + "\n")