In [6]:
import os
import warnings
from collections import defaultdict
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob

warnings.filterwarnings('ignore')

In [2]:
class CustomerJourneyAnalyzer:
    """Compute per-user, per-category and overall metrics from a review DataFrame.

    The input frame is expected to provide the columns read by the methods
    below: ``user_id``, ``product_id``, ``category``, ``rating``,
    ``normalized_review``, ``discount_percentage`` and ``actual_price``.

    Call :meth:`analyze_customer_journeys` first; it fills ``processed_df``,
    ``user_patterns``, ``category_insights`` and ``journey_factors``.
    None of the analysis methods modify the frame they are given.
    """

    def __init__(self):
        self.journey_data = None
        self.user_patterns = None
        self.category_insights = None
        self.journey_factors = None  # set by analyze_customer_journeys
        self.processed_df = None  # preprocessed copy of the last analysed frame

    @staticmethod
    def _level(value, high, medium):
        """Bucket ``value`` into 'High' / 'Medium' / 'Low' using strict ``>`` thresholds."""
        if value > high:
            return 'High'
        if value > medium:
            return 'Medium'
        # NaN compares False against both thresholds and therefore lands here.
        return 'Low'

    def preprocess_data(self, df):
        """Return a copy of ``df`` with the derived columns used by the analyses.

        Added columns:
            sentiment_score: TextBlob polarity of ``str(normalized_review)``.
            has_rating / has_review: 1 when the value is present, else 0.
            price_sensitivity: ``discount_percentage / actual_price``; missing
                and infinite results (e.g. a zero ``actual_price``) become 0.
        """
        # Work on a copy so the caller's frame is left untouched.
        processed_df = df.copy()

        # Missing reviews are scored on the literal text of str(value).
        processed_df['sentiment_score'] = processed_df['normalized_review'].apply(
            lambda x: TextBlob(str(x)).sentiment.polarity
        )

        processed_df['has_rating'] = processed_df['rating'].notna().astype(int)
        processed_df['has_review'] = processed_df['normalized_review'].notna().astype(int)

        # Division by a zero price yields +/-inf, which fillna alone would keep
        # and which would then poison every downstream mean.
        ratio = processed_df['discount_percentage'] / processed_df['actual_price']
        processed_df['price_sensitivity'] = (
            ratio.replace([np.inf, -np.inf], np.nan).fillna(0)
        )

        return processed_df

    def analyze_user_patterns(self, df):
        """Aggregate behaviour metrics per distinct ``user_id`` value.

        Returns:
            dict mapping each ``user_id`` to a dict with the keys
            ``total_interactions``, ``avg_rating``, ``rating_consistency``,
            ``sentiment_consistency``, ``price_sensitivity``,
            ``preferred_categories`` and ``discount_preference``.

        NOTE(review): every distinct ``user_id`` string is treated as one user.
        If the column holds several comma-joined ids per row, they are grouped
        as a single "user" -- confirm that this is intended.
        """
        # Named aggregation ties each output column to its source explicitly
        # instead of relying on a positional rename of MultiIndex columns.
        user_metrics = df.groupby('user_id').agg(
            total_interactions=('product_id', 'count'),
            avg_rating=('rating', 'mean'),
            rating_std=('rating', 'std'),
            avg_sentiment=('sentiment_score', 'mean'),
            sentiment_std=('sentiment_score', 'std'),
            price_sensitivity=('price_sensitivity', 'mean'),
            categories=('category', lambda values: list(values.unique())),
            avg_discount=('discount_percentage', 'mean'),
        ).reset_index()

        user_patterns = {}
        for record in user_metrics.to_dict('records'):
            rating_std = record['rating_std']
            sentiment_std = record['sentiment_std']
            user_patterns[record['user_id']] = {
                'total_interactions': record['total_interactions'],
                'avg_rating': record['avg_rating'],
                # std is NaN for a single observation; treat that as fully consistent.
                'rating_consistency': 1 - (rating_std / 5 if pd.notna(rating_std) else 0),
                'sentiment_consistency': 1 - (sentiment_std if pd.notna(sentiment_std) else 0),
                'price_sensitivity': record['price_sensitivity'],
                'preferred_categories': record['categories'],
                'discount_preference': record['avg_discount'],
            }

        return user_patterns

    def analyze_category_patterns(self, df):
        """Compute rating, sentiment, discount and review metrics per category.

        Categories are reported in order of first appearance. Rows whose
        ``category`` is missing are skipped (groupby drops NaN keys).

        Returns:
            dict mapping category to a dict with the keys ``avg_rating``,
            ``review_sentiment``, ``price_sensitivity`` (correlation between
            discount percentage and rating), ``avg_discount``, ``review_rate``
            and ``common_terms``.
        """
        category_insights = {}

        # One groupby pass instead of re-filtering the whole frame per category.
        for category, category_data in df.groupby('category', sort=False):
            positive_reviews = category_data.loc[
                category_data['sentiment_score'] > 0, 'normalized_review'
            ]
            category_insights[category] = {
                'avg_rating': category_data['rating'].mean(),
                'review_sentiment': category_data['sentiment_score'].mean(),
                'price_sensitivity': (
                    category_data['discount_percentage'].corr(category_data['rating'])
                ),
                'avg_discount': category_data['discount_percentage'].mean(),
                'review_rate': category_data['has_review'].mean(),
                'common_terms': self._extract_common_terms(positive_reviews),
            }

        return category_insights

    def analyze_journey_factors(self, df):
        """Collect rating drivers, review patterns and price impact into one dict."""
        return {
            'rating_drivers': self._analyze_rating_drivers(df),
            'review_patterns': self._analyze_review_patterns(df),
            'price_impact': self._analyze_price_impact(df),
        }

    def _analyze_rating_drivers(self, df):
        """Correlate rating with discount, price and sentiment; add per-category mean/std."""
        rating = df['rating']
        rating_drivers = {
            'discount_correlation': df['discount_percentage'].corr(rating),
            'price_correlation': df['actual_price'].corr(rating),
            'sentiment_correlation': df['sentiment_score'].corr(rating),
        }

        rating_drivers['category_performance'] = (
            df.groupby('category')['rating'].agg(['mean', 'std']).to_dict('index')
        )

        return rating_drivers

    def _analyze_review_patterns(self, df):
        """Summarise review rate, sentiment sign shares and length/rating correlation.

        ``df`` is not modified.
        """
        sentiment = df['sentiment_score']
        review_patterns = {
            'review_rate': df['has_review'].mean(),
            'sentiment_distribution': {
                'positive': (sentiment > 0).mean(),
                'neutral': (sentiment == 0).mean(),
                'negative': (sentiment < 0).mean(),
            },
        }

        # Keep the length as a local Series rather than writing a new column
        # into the caller's frame.
        review_length = df['normalized_review'].str.len()
        review_patterns['length_correlation'] = review_length.corr(df['rating'])

        return review_patterns

    def _analyze_price_impact(self, df):
        """Compare ratings above/below the median discount and across price quartiles.

        ``df`` is not modified.

        Raises:
            ValueError: propagated from ``pd.qcut`` when the quartile edges of
                ``actual_price`` are not unique.
        """
        discount = df['discount_percentage']
        median_discount = discount.median()  # computed once for both halves

        price_impact = {
            'discount_satisfaction': {
                'high_discount': df.loc[discount > median_discount, 'rating'].mean(),
                'low_discount': df.loc[discount <= median_discount, 'rating'].mean(),
            },
            'price_range_performance': {},
        }

        # Group by a local Series so no 'price_quartile' column leaks into df.
        price_quartile = pd.qcut(
            df['actual_price'], 4, labels=['budget', 'mid_low', 'mid_high', 'premium']
        )
        # observed=False keeps all four labels in the result on every pandas version.
        price_impact['price_range_performance'] = (
            df.groupby(price_quartile, observed=False)
            .agg({
                'rating': 'mean',
                'sentiment_score': 'mean',
                'has_review': 'mean',
            })
            .to_dict('index')
        )

        return price_impact

    def _extract_common_terms(self, reviews, top_n=5):
        """Return up to ``top_n`` vocabulary terms (uni/bigrams) found in ``reviews``.

        Best effort: returns ``[]`` for empty input or when vectorisation fails
        (for example when only stop words remain).
        """
        if reviews.empty:
            return []

        tfidf = TfidfVectorizer(
            stop_words='english',
            max_features=top_n,
            ngram_range=(1, 2)
        )

        try:
            tfidf.fit_transform(reviews.fillna(''))
            return list(tfidf.get_feature_names_out())
        except Exception:
            # Deliberately broad to stay best-effort, but unlike a bare
            # ``except`` this lets KeyboardInterrupt/SystemExit through.
            return []

    def analyze_customer_journeys(self, df):
        """Run preprocessing and all analyses, store the results and return them.

        Returns:
            dict with the keys ``user_patterns``, ``category_insights`` and
            ``journey_factors``.
        """
        self.processed_df = self.preprocess_data(df)

        self.user_patterns = self.analyze_user_patterns(self.processed_df)
        self.category_insights = self.analyze_category_patterns(self.processed_df)
        self.journey_factors = self.analyze_journey_factors(self.processed_df)

        return {
            'user_patterns': self.user_patterns,
            'category_insights': self.category_insights,
            'journey_factors': self.journey_factors
        }

    def get_user_journey_summary(self, user_id):
        """Return a labelled summary for ``user_id``.

        Returns ``None`` when the user is unknown or when no analysis has been
        run yet (``user_patterns`` is still ``None``).
        """
        if not self.user_patterns or user_id not in self.user_patterns:
            return None

        user_data = self.user_patterns[user_id]

        return {
            'interaction_level': self._level(user_data['total_interactions'], 5, 2),
            'satisfaction_level': self._level(user_data['avg_rating'], 4, 3),
            'price_sensitivity': self._level(user_data['price_sensitivity'], 0.5, 0.2),
            'preferred_categories': user_data['preferred_categories'],
            'consistency': user_data['rating_consistency']
        }



In [3]:
def analyze_customer_behavior(df, top_n=3):
    """Run the full journey analysis on ``df`` and build a headline summary.

    Args:
        df: review DataFrame passed to
            ``CustomerJourneyAnalyzer.analyze_customer_journeys``.
        top_n: number of best-rated categories to include in the summary
            (defaults to 3).

    Returns:
        ``(analyzer, summary)`` where ``analyzer`` is the populated
        ``CustomerJourneyAnalyzer`` and ``summary`` is a dict with the keys
        ``overall_satisfaction``, ``top_performing_categories`` (list of
        ``(category, metrics)`` tuples, best first) and ``key_factors``.
    """
    analyzer = CustomerJourneyAnalyzer()
    analysis_results = analyzer.analyze_customer_journeys(df)

    # Summaries are taken from the processed frame because the raw input has
    # no 'sentiment_score' column.
    processed = analyzer.processed_df
    ratings = processed['rating']
    sentiment = processed['sentiment_score']
    journey_factors = analysis_results['journey_factors']

    def _rating_key(item):
        # NaN makes sorted() ordering undefined; rank such categories last.
        avg_rating = item[1]['avg_rating']
        return avg_rating if pd.notna(avg_rating) else float('-inf')

    ranked_categories = sorted(
        analysis_results['category_insights'].items(),
        key=_rating_key,
        reverse=True
    )

    summary = {
        'overall_satisfaction': {
            'average_rating': ratings.mean(),
            'rating_distribution': ratings.value_counts().to_dict(),
            'sentiment_distribution': {
                'positive': (sentiment > 0).mean(),
                'negative': (sentiment < 0).mean(),
                'neutral': (sentiment == 0).mean()
            }
        },
        'top_performing_categories': ranked_categories[:top_n],
        'key_factors': {
            'price_sensitivity': journey_factors['price_impact'],
            'review_patterns': journey_factors['review_patterns']
        }
    }

    return analyzer, summary

In [4]:
# Load DataFrame
# The dataset location can be overridden with the CUSTOMER_JOURNEY_DATA
# environment variable so the notebook is not tied to one machine's layout;
# without it the original local path is used.
DATA_PATH = os.environ.get(
    'CUSTOMER_JOURNEY_DATA',
    '/Users/anithasmac/Projects/CustomerJourneyMapping/Featured_Amazon_Data.csv'
)
df = pd.read_csv(DATA_PATH)

In [5]:
# Run the full analysis; `analyzer` keeps the per-user results for lookups below.
analyzer, summary = analyze_customer_behavior(df)

# Headline figures, emitted as one multi-line print.
print(
    "\nOverall Customer Journey Insights:",
    f"Average Rating: {summary['overall_satisfaction']['average_rating']:.2f}",
    "\nTop Performing Categories:",
    sep="\n",
)
for category, metrics in summary['top_performing_categories']:
    print(f"{category}: {metrics['avg_rating']:.2f} average rating")

# Example lookup: journey summary for the user_id found in the first row.
user_id = df['user_id'].iat[0]
user_journey = analyzer.get_user_journey_summary(user_id)
print(f"\nUser Journey Summary for {user_id}:", user_journey, sep="\n")



Overall Customer Journey Insights:
Average Rating: 4.11

Top Performing Categories:
Computers&Accessories|Tablets: 4.60 average rating
Electronics|HomeAudio|MediaStreamingDevices|StreamingClients: 4.50 average rating
Electronics|Cameras&Photography|Accessories|Film: 4.50 average rating

User Journey Summary for AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBBSNLYT3ONILA,AHCTC6ULH4XB6YHDY6PCH2R772LQ,AGYHHIERNXKA6P5T7CZLXKVPT7IQ,AG4OGOFWXJZTQ2HKYIOCOY3KXF2Q,AENGU523SXMOS7JPDTW52PNNVWGQ,AEQJHCVTNINBS4FKTBGQRQTGTE5Q,AFC3FFC5PKFF5PMA52S3VCHOZ5FQ:
{'interaction_level': 'High', 'satisfaction_level': 'High', 'price_sensitivity': 'Low', 'preferred_categories': ['Computers&Accessories|Accessories&Peripherals|Cables&Accessories|Cables|USBCables'], 'consistency': 1.0}
