# Customer Feedback Analysis Project

## Business Impact: Product Improvement through Customer Sentiment Analysis

In this project, we'll analyze Amazon product reviews to help businesses understand:
1. Overall customer satisfaction
2. Key product features that customers love/hate
3. Common pain points and improvement areas
4. Sentiment trends over time

This analysis can directly impact business decisions by:
- Identifying priority areas for product improvement
- Understanding customer preferences
- Tracking the impact of product changes
- Improving customer satisfaction

In [None]:
import functools
import gzip
import itertools
import json
import re
import warnings
import zlib
from collections import Counter
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
warnings.filterwarnings('ignore')

# Set up plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases removed the old names, so prefer the new name and only
# fall back to the legacy one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('husl')

# Initialize sentiment analyzer.
# NOTE: this needs the NLTK "vader_lexicon" resource; if it raises LookupError,
# run nltk.download('vader_lexicon') once and re-run this cell.
sia = SentimentIntensityAnalyzer()

## 1. Data Collection and Preparation

We'll use the Amazon Product Reviews dataset, which contains millions of reviews across different product categories. For this project, we'll focus on a specific product category to keep the analysis manageable.

In [None]:
def _iter_gzip_lines(chunks):
    """Yield newline-delimited records from gzip-compressed byte chunks.

    The data is inflated incrementally, so a caller that stops iterating
    early never pays for fetching or decompressing the rest of the archive.
    """
    # MAX_WBITS | 16 tells zlib to expect a gzip header and trailer.
    decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
    pending = b""
    for chunk in chunks:
        while chunk:
            if decompressor.eof:
                # A gzip file may hold several concatenated members; start a
                # fresh decompressor for the next one.
                decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
            pending += decompressor.decompress(chunk)
            chunk = decompressor.unused_data
        # Everything before the last newline is complete; keep the tail until
        # the next chunk arrives.
        *complete, pending = pending.split(b"\n")
        yield from complete
    pending += decompressor.flush()
    if pending:
        yield pending


def load_amazon_reviews(category='Electronics', max_reviews=1000, timeout=30):
    """Load Amazon reviews for a specific category.

    The category's gzipped JSON-lines file is streamed from the SNAP server
    and only as much of it as is needed for ``max_reviews`` records is
    downloaded and decompressed.

    Args:
        category: Category name inserted into the dataset URL.
        max_reviews: Maximum number of reviews to read (values below zero are
            treated as zero).
        timeout: Seconds to wait for the server to connect/send data before
            giving up; without it a stalled server would hang the notebook.

    Returns:
        A DataFrame with the columns ``review_id`` (the reviewer's ID),
        ``product``, ``review_text``, ``rating``, ``date`` and ``category``.
        If the download fails or the payload is corrupt, a two-row sample
        DataFrame with the same columns is returned instead and a message is
        printed.
    """
    # URL for the Amazon review dataset
    url = f"http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_{category}_5.json.gz"
    columns = ['review_id', 'product', 'review_text', 'rating', 'date', 'category']

    try:
        reviews = []
        # stream=True defers the body download; iter_content then hands over
        # the archive piece by piece.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for bad status codes

            lines = _iter_gzip_lines(response.iter_content(chunk_size=64 * 1024))
            records = (line for line in lines if line.strip())  # skip blank lines
            for line in itertools.islice(records, max(max_reviews, 0)):
                review = json.loads(line)
                reviews.append({
                    'review_id': review.get('reviewerID', ''),
                    'product': review.get('asin', ''),
                    'review_text': review.get('reviewText', ''),
                    'rating': review.get('overall', 0),
                    'date': review.get('reviewTime', ''),
                    'category': category
                })

        # Passing the columns explicitly keeps the schema even when no
        # reviews were read.
        return pd.DataFrame(reviews, columns=columns)

    except (requests.exceptions.RequestException, zlib.error, json.JSONDecodeError) as e:
        # Corrupt archives and malformed records take the same fallback path
        # as network failures instead of crashing the notebook.
        print(f"Error downloading data: {e}")
        print("\nUsing sample data instead...")

        # Fallback to sample data if download fails
        return pd.DataFrame([
            {
                'review_id': 'A1B2C3',
                'product': 'Sample Product',
                'review_text': 'This is a great product with amazing features. The battery life is excellent!',
                'rating': 5,
                'date': '2024-01-15',
                'category': 'Electronics'
            },
            {
                'review_id': 'D4E5F6',
                'product': 'Sample Product',
                'review_text': 'Good product but the camera quality could be better.',
                'rating': 3,
                'date': '2024-01-20',
                'category': 'Electronics'
            }
        ])

# Fetch the reviews (the loader prints a notice and returns sample rows if
# the download fails)
print("Loading Amazon reviews...")
df = load_amazon_reviews(category='Electronics', max_reviews=1000)

# Summarise what was loaded and preview the first few rows
print(
    "\nDataset Information:",
    f"Number of reviews: {len(df)}",
    "\nSample reviews:",
    sep="\n",
)
print(df[['review_text', 'rating']].head())

# Count of reviews per rating value, ordered by rating
print("\nRating Distribution:")
rating_counts = df['rating'].value_counts()
print(rating_counts.sort_index())

### Data Cleaning and Preprocessing

Let's clean and preprocess the review data to prepare it for analysis:

In [None]:
def clean_review_text(text):
    """Normalise a review to lowercase ASCII letters separated by single spaces.

    Anything that is not a string (e.g. NaN or None) yields an empty string.
    """
    if isinstance(text, str):
        # Lowercase first, then blank out every character that is neither a
        # letter nor whitespace (digits, punctuation, symbols).
        letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
        # split()/join collapses whitespace runs and trims both ends.
        return ' '.join(letters_only.split())
    return ""

# Parse the date strings; values that cannot be parsed become NaT
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Store the normalised review text in its own column
df['cleaned_text'] = df['review_text'].map(clean_review_text)

# Keep only rows that have review text, a rating and a valid date
df = df.dropna(subset=['review_text', 'rating', 'date'])

print(
    "\nCleaned Dataset Information:",
    f"Number of reviews after cleaning: {len(df)}",
    "\nSample of cleaned reviews:",
    sep="\n",
)
print(df[['cleaned_text', 'rating']].head())

## 2. Text Preprocessing Exercise

Let's create a more comprehensive preprocessing function that performs the following steps:
1. Removing stopwords
2. Lemmatization
3. Handling contractions
4. Removing URLs and email addresses

In [None]:
# TODO: Implement the advanced preprocessing function
def advanced_preprocess(text):
    """Implement advanced text preprocessing.

    Exercise placeholder. The finished function should take a raw review
    string and return the preprocessed text: stopwords removed, words
    lemmatized, contractions handled, and URLs and email addresses removed.
    Until it is implemented it returns None.
    """
    # Your code here
    pass

# Try the function on a sentence that contains a possessive, a URL and an
# email address
sample_text = "The product's battery life is amazing! Check it out at www.example.com or email us at test@example.com"
processed_text = advanced_preprocess(sample_text)
print(f"Processed text: {processed_text}")

## 3. Sentiment Analysis Exercise

Create a function that:
1. Analyzes sentiment of reviews
2. Categorizes reviews as Positive, Negative, or Neutral
3. Calculates confidence scores

In [None]:
# TODO: Implement the sentiment analysis function
def analyze_sentiment_with_confidence(text):
    """Analyze sentiment with confidence scores.

    Exercise placeholder. The finished function should analyze the sentiment
    of ``text``, categorize it as Positive, Negative, or Neutral, and report
    a confidence score. Until it is implemented it returns None.
    """
    # Your code here
    pass

# One clearly positive, one lukewarm and one clearly negative review
test_reviews = [
    "This product is absolutely amazing! Best purchase ever.",
    "The quality is okay, nothing special.",
    "Terrible product, would not recommend."
]

# Show each review followed by the analysis it produced
for review in test_reviews:
    result = analyze_sentiment_with_confidence(review)
    print(f"\nReview: {review}\nAnalysis: {result}")

## 4. Feature Extraction Exercise

Create a function that:
1. Identifies product features mentioned in reviews
2. Extracts sentiment for each feature
3. Calculates feature importance scores

In [None]:
# TODO: Implement the feature extraction function
def extract_features_with_sentiment(text):
    """Extract features and their sentiment.

    Exercise placeholder. The finished function should identify the product
    features mentioned in ``text``, extract the sentiment for each feature,
    and calculate feature importance scores. Until it is implemented it
    returns None.
    """
    # Your code here
    pass

# A review that mentions three different features with mixed opinions
test_review = "The battery life is excellent, but the camera quality needs improvement. The screen is beautiful though."
features = extract_features_with_sentiment(test_review)
print("Extracted features with sentiment:", features, sep="\n")

## 5. Visualization Exercise

Create functions to visualize:
1. Sentiment distribution
2. Feature importance
3. Sentiment trends over time

In [None]:
# TODO: Implement visualization functions
def plot_sentiment_distribution(sentiments):
    """Plot sentiment distribution.

    Exercise placeholder: should visualize how the values in ``sentiments``
    are distributed. Currently does nothing and returns None.
    """
    # Your code here
    pass

def plot_feature_importance(features):
    """Plot feature importance.

    Exercise placeholder: should visualize the importance of each entry in
    ``features``. Currently does nothing and returns None.
    """
    # Your code here
    pass

def plot_sentiment_trends(dates, sentiments):
    """Plot sentiment trends over time.

    Exercise placeholder: should plot ``sentiments`` against ``dates``.
    Currently does nothing and returns None.
    """
    # Your code here
    pass

# Test your implementations
# (placeholder output: replace this line with calls to the three plotting
# functions once they are implemented)
print("Implement and test your visualization functions here")

## 6. Business Insights Exercise

Create a function that generates business insights by:
1. Identifying top positive and negative features
2. Calculating improvement priorities
3. Generating actionable recommendations

In [None]:
# TODO: Implement the business insights function
def generate_business_insights(reviews_data):
    """Generate business insights from review data.

    Exercise placeholder. The finished function should identify the top
    positive and negative features, calculate improvement priorities, and
    generate actionable recommendations. The test below passes a dict whose
    'reviews' entry is a list of {'text': ..., 'rating': ...} dicts. Until it
    is implemented it returns None.
    """
    # Your code here
    pass

# Three short reviews: mixed, positive and negative
sample_data = {
    'reviews': [
        {'text': 'Great battery life but poor camera', 'rating': 4},
        {'text': 'Amazing screen quality', 'rating': 5},
        {'text': 'Battery drains too fast', 'rating': 2}
    ]
}

insights = generate_business_insights(sample_data)
print("Generated insights:", insights, sep="\n")

## 7. Main Project Exercise

Now it's your turn! Create a complete customer feedback analysis system that:

1. Loads and processes real Amazon review data
2. Implements all the functions from previous exercises
3. Generates comprehensive business insights
4. Creates visualizations for key findings
5. Produces a detailed analysis report

In [None]:
# TODO: Implement the complete analysis system

def analyze_customer_feedback(category='Electronics', max_reviews=1000):
    """Complete customer feedback analysis system.

    Exercise placeholder. The finished function should load and process the
    review data for ``category`` (at most ``max_reviews`` reviews), apply the
    functions from the previous exercises, create visualizations, and
    generate business insights. Until it is implemented it returns None.
    """
    # Your code here
    pass

# Run the analysis
# Uses the defaults category='Electronics' and max_reviews=1000; `results`
# stays None until analyze_customer_feedback is implemented.
results = analyze_customer_feedback()
print("Analysis complete! Check the results above.")

## Solutions

Here are the solutions to all exercises:

### Exercise 2: Advanced Preprocessing
```python
# Contraction suffixes and the words they are expanded to.
_CONTRACTIONS = {
    "'s": " is",
    "'re": " are",
    "'t": " not",
    "'d": " would",
    "'ll": " will",
    "'ve": " have"
}


@functools.lru_cache(maxsize=1)
def _preprocessing_resources():
    """Build the stopword set and the lemmatizer once and reuse them."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def advanced_preprocess(text):
    """Return ``text`` lowercased, stripped of noise, and lemmatized.

    Steps: remove URLs and email addresses, expand suffix contractions,
    drop everything that is not a letter, tokenize, remove English
    stopwords, and lemmatize the remaining tokens.

    Args:
        text: Raw review text. Non-string values (e.g. NaN from pandas)
            produce an empty string, matching ``clean_review_text``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Handle contractions. A suffix is only expanded when it directly follows
    # a letter and ends the word, so quoted words such as 'terrible' are not
    # turned into "noterrible".
    for contraction, expansion in _CONTRACTIONS.items():
        text = re.sub(rf"(?<=[a-z]){re.escape(contraction)}\b", expansion, text)

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize, drop stopwords, then lemmatize what is left
    stop_words, lemmatizer = _preprocessing_resources()
    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]

    return ' '.join(tokens)
```

### Exercise 3: Sentiment Analysis
```python
def analyze_sentiment_with_confidence(text):
    """Label ``text`` as Positive, Negative or Neutral using ``sia``.

    Returns a dict with the label under 'sentiment', the absolute compound
    score under 'confidence', and the full polarity scores under 'scores'.
    """
    polarity = sia.polarity_scores(text)
    compound = polarity['compound']

    # Compound scores strictly between -0.05 and 0.05 count as neutral.
    if compound >= 0.05:
        label = 'Positive'
    elif compound <= -0.05:
        label = 'Negative'
    else:
        label = 'Neutral'

    # The further the compound score is from zero, the higher the confidence.
    return {
        'sentiment': label,
        'confidence': abs(compound),
        'scores': polarity
    }
```

### Exercise 4: Feature Extraction
```python
def extract_features_with_sentiment(text):
    """Find the product features mentioned in ``text`` and score each one.

    The text is split into clauses (at sentence punctuation and at "but").
    A feature is considered mentioned in a clause when one of its keywords
    appears as a whole token after preprocessing. Its sentiment is computed
    on the original wording of all clauses that mention it, so the opinion
    words around the keyword are what gets scored.

    Args:
        text: Review text. Non-string or blank input yields an empty dict.

    Returns:
        A dict mapping each mentioned feature name to the result of
        ``analyze_sentiment_with_confidence`` for that feature's clauses.
    """
    # Define feature keywords
    features = {
        'battery': ['battery', 'life', 'charge'],
        'camera': ['camera', 'photo', 'picture'],
        'screen': ['screen', 'display', 'resolution'],
        'performance': ['performance', 'speed', 'fast', 'slow'],
        'software': ['software', 'app', 'interface']
    }

    if not isinstance(text, str) or not text.strip():
        return {}

    # Split into clauses so that "battery is great, but the camera is poor"
    # does not give both features the same score.
    clauses = [
        clause.strip()
        for clause in re.split(r'[.!?;\n]+|\bbut\b', text, flags=re.IGNORECASE)
        if clause.strip()
    ]
    # Preprocess each clause once; whole-token matching avoids hits such as
    # 'app' inside 'happy'.
    clause_tokens = [set(advanced_preprocess(clause).split()) for clause in clauses]

    # Extract features and their sentiment
    feature_sentiments = {}
    for feature, keywords in features.items():
        mentions = [
            clause
            for clause, tokens in zip(clauses, clause_tokens)
            if tokens.intersection(keywords)
        ]
        if mentions:
            feature_sentiments[feature] = analyze_sentiment_with_confidence('. '.join(mentions))

    return feature_sentiments
```

### Exercise 5: Visualizations
```python
def plot_sentiment_distribution(sentiments):
    """Display a 20-bin histogram of the given sentiment scores."""
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(sentiments, bins=20, ax=ax)
    # Set the labels after plotting so they replace seaborn's defaults.
    ax.set_title('Distribution of Review Sentiments')
    ax.set_xlabel('Sentiment Score')
    ax.set_ylabel('Number of Reviews')
    plt.show()

def plot_feature_importance(features):
    """Display a bar chart of each feature's 'confidence' value.

    ``features`` maps a feature name to a dict that has a 'confidence' key.
    """
    names = list(features)
    confidences = [details['confidence'] for details in features.values()]

    _, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=names, y=confidences, ax=ax)
    ax.set_title('Feature Importance')
    plt.xticks(rotation=45)
    plt.show()

def plot_sentiment_trends(dates, sentiments):
    """Display a line chart of ``sentiments`` plotted against ``dates``.

    The points are connected in the order given.
    """
    _, ax = plt.subplots(figsize=(12, 6))
    ax.plot(dates, sentiments)
    ax.set_title('Sentiment Trends Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Average Sentiment')
    plt.xticks(rotation=45)
    plt.show()
```

### Exercise 6: Business Insights
```python
def generate_business_insights(reviews_data):
    """Summarise overall and per-feature sentiment and suggest actions.

    Args:
        reviews_data: Dict whose 'reviews' entry is a list of review dicts.
            Each review's text is read from its 'text' key, or from
            'review_text' (the column name used by ``load_amazon_reviews``)
            when 'text' is absent.

    Returns:
        A multi-line string: the average compound sentiment over all
        reviews, one line per detected feature with its average compound
        sentiment, then a "Recommendations:" section that suggests improving
        features averaging below 0 and highlighting those averaging above
        0.5. If there are no reviews, a short message saying so.
    """
    texts = [
        review['text'] if 'text' in review else review['review_text']
        for review in reviews_data['reviews']
    ]
    if not texts:
        # Averaging an empty list would only produce "nan".
        return "No reviews available for analysis."

    insights = []

    # Analyze overall sentiment
    compound_scores = [
        analyze_sentiment_with_confidence(text)['scores']['compound']
        for text in texts
    ]
    insights.append(f"Overall customer satisfaction: {np.mean(compound_scores):.2f}")

    # Collect the compound score of every feature mention, grouped by feature
    feature_scores = {}
    for text in texts:
        for feature, sentiment in extract_features_with_sentiment(text).items():
            feature_scores.setdefault(feature, []).append(sentiment['scores']['compound'])

    # Average once; the values feed both the summary and the recommendations
    feature_averages = {
        feature: np.mean(scores) for feature, scores in feature_scores.items()
    }

    # Generate feature insights
    for feature, avg_sent in feature_averages.items():
        insights.append(f"{feature.capitalize()}: {avg_sent:.2f}")

    # Generate recommendations
    insights.append("\nRecommendations:")
    for feature, avg_sent in feature_averages.items():
        if avg_sent < 0:
            insights.append(f"- Improve {feature} quality")
        elif avg_sent > 0.5:
            insights.append(f"- Highlight {feature} in marketing")

    return '\n'.join(insights)
```

### Exercise 7: Complete Analysis System
```python
def analyze_customer_feedback(category='Electronics', max_reviews=1000):
    """Run the full pipeline: load, preprocess, score, plot, and summarise.

    Args:
        category: Product category passed to ``load_amazon_reviews``.
        max_reviews: Maximum number of reviews to load.

    Returns:
        A dict with 'dataframe' (the reviews plus the added columns
        'processed_text', 'sentiment', 'confidence', 'compound' and
        'features') and 'insights' (the text report from
        ``generate_business_insights``). When no reviews were loaded, the
        DataFrame is returned as-is with a short message as the insights.
    """
    # Load data
    df = load_amazon_reviews(category, max_reviews)
    if df.empty:
        return {
            'dataframe': df,
            'insights': "No reviews available for analysis."
        }

    # Missing review bodies would break the string-based steps below
    df['review_text'] = df['review_text'].fillna('').astype(str)

    # Preprocess
    df['processed_text'] = df['review_text'].apply(advanced_preprocess)

    # Analyze sentiment
    sentiment_scores = df['review_text'].apply(analyze_sentiment_with_confidence)
    df['sentiment'] = sentiment_scores.apply(lambda x: x['sentiment'])
    df['confidence'] = sentiment_scores.apply(lambda x: x['confidence'])
    df['compound'] = sentiment_scores.apply(lambda x: x['scores']['compound'])

    # Extract features from the original wording; the extractor does its own
    # preprocessing, so it should not be fed already-processed text.
    df['features'] = df['review_text'].apply(extract_features_with_sentiment)

    # Plot the signed compound score. 'confidence' is its absolute value and
    # would hide whether reviews are positive or negative.
    plot_sentiment_distribution(df['compound'])

    # Generate insights. generate_business_insights reads each review's text
    # from the 'text' key, so map the DataFrame columns onto that schema.
    reviews = [
        {'text': text, 'rating': rating}
        for text, rating in zip(df['review_text'], df['rating'])
    ]
    insights = generate_business_insights({'reviews': reviews})

    return {
        'dataframe': df,
        'insights': insights
    }
```