In [1]:
import pandas as pd
import numpy as np
import json
import re
from datetime import datetime, timedelta
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the uploaded sample CSV into a DataFrame
file_path = 'sample.csv'
df = pd.read_csv(file_path)

# Cell output: (rows, columns) paired with a preview of the leading rows
(df.shape, df.head())

In [3]:
class CustomerSupportNBA:
    """
    Next-Best-Action system for customer support using Twitter data
    """
    
    def __init__(self):
        self.data = None
        self.conversations = {}
        self.customer_profiles = {}
        self.conversation_flows = {}
        self.resolution_patterns = {}
        
    def load_and_clean_data(self, file_path='sample.csv', sample_data=None):
        """
        Task 1: Data Pipeline - Load and normalize CST data
        """
        print("=== TASK 1: DATA PIPELINE ===")
        
        if sample_data is not None:
            # Use provided sample data for demonstration
            self.data = sample_data.copy()
        else:
            # In production, this would connect to the actual dataset
            self.data = pd.read_csv(file_path)
        
        print(f"Loaded {len(self.data)} records")
        
        # Data cleaning and normalization
        self.data['created_at'] = pd.to_datetime(self.data['created_at'])
        self.data['text'] = self.data['text'].fillna('')
        self.data['inbound'] = self.data['inbound'].astype(bool)
        
        # Create interaction table with deduplication
        self.data = self.data.drop_duplicates(subset=['tweet_id'])
        self.data = self.data.sort_values(['author_id', 'created_at'])
        
        print(f"After cleaning: {len(self.data)} unique records")
        print("Data pipeline completed with idempotent processing")
        
        return self.data
    
    def build_conversations(self):
        """
        Build conversation threads from tweet data
        """
        print("\n=== BUILDING CONVERSATION THREADS ===")
        
        # Group by conversation threads
        conversations = defaultdict(list)
        
        for _, row in self.data.iterrows():
            # Find conversation root
            conv_id = self._find_conversation_root(row)
            conversations[conv_id].append(row.to_dict())  # Convert to dict
        
        # Sort conversations by timestamp
        for conv_id in conversations:
            conversations[conv_id] = sorted(conversations[conv_id], 
                                          key=lambda x: x['created_at'])
        
        self.conversations = dict(conversations)
        print(f"Built {len(self.conversations)} conversation threads")
        
        return self.conversations
    
    def _find_conversation_root(self, row):
        """
        Find the root of a conversation thread
        """
        if pd.isna(row['in_response_to_tweet_id']):
            return row['tweet_id']
        else:
            # Follow the chain back to find root
            current = row['in_response_to_tweet_id']
            visited = set()
            
            while current and current not in visited:
                visited.add(current)
                parent_row = self.data[self.data['tweet_id'] == current]
                if len(parent_row) == 0:
                    break
                parent_row = parent_row.iloc[0]
                if pd.isna(parent_row['in_response_to_tweet_id']):
                    return current
                current = parent_row['in_response_to_tweet_id']
            
            return row['tweet_id']
    
    def analyze_user_behavior(self):
        """
        Task 2: Observe user behavior and identify conversation flows
        """
        print("\n=== TASK 2: USER BEHAVIOR ANALYSIS ===")
        
        # Analyze sentiment patterns
        self._analyze_sentiment()
        
        # Identify conversation flows
        self._identify_conversation_flows()
        
        # Tag resolved vs open issues
        self._tag_issue_status()
        
        # Create customer cohorts
        self._create_customer_cohorts()
        
        # Generate visualizations
        self._create_visualizations()
        
        return self.customer_profiles
    
    def _analyze_sentiment(self):
        """
        Analyze sentiment of customer messages
        """
        print("Analyzing sentiment patterns...")
        
        sentiment_keywords = {
            'frustrated': ['wtf', 'frustrated', 'hate', 'angry', 'terrible', 'awful', 'worst', 'üò°'],
            'urgent': ['urgent', 'immediately', 'asap', 'emergency', 'critical'],
            'polite': ['please', 'thank', 'appreciate', 'sorry', 'help'],
            'technical': ['error', 'bug', 'crash', 'freeze', 'broken', 'issue', 'problem', 'update', 'version']
        }
        
        for conv_id, messages in self.conversations.items():
            customer_messages = [msg for msg in messages if msg['inbound']]
            
            sentiment_scores = defaultdict(int)
            for msg in customer_messages:
                text = str(msg['text']).lower()
                for sentiment, keywords in sentiment_keywords.items():
                    for keyword in keywords:
                        if keyword in text:
                            sentiment_scores[sentiment] += 1
            
            # Assign primary sentiment
            primary_sentiment = max(sentiment_scores.items(), 
                                  key=lambda x: x[1])[0] if sentiment_scores else 'neutral'
            
            self.customer_profiles[conv_id] = {
                'sentiment': primary_sentiment,
                'sentiment_scores': dict(sentiment_scores),
                'message_count': len(customer_messages)
            }
    
    def _identify_conversation_flows(self):
        """
        Identify different conversation flow patterns
        """
        print("Identifying conversation flows...")
        
        flow_patterns = {
            'quick_resolution': [],
            'escalated': [],
            'abandoned': [],
            'multi_turn': []
        }
        
        for conv_id, messages in self.conversations.items():
            customer_msgs = [msg for msg in messages if msg['inbound']]
            support_msgs = [msg for msg in messages if not msg['inbound']]
            
            total_turns = len(customer_msgs) + len(support_msgs)
            
            # Classify conversation flow
            if total_turns <= 4 and len(support_msgs) > 0:
                flow_type = 'quick_resolution'
            elif total_turns > 8:
                flow_type = 'multi_turn'
            elif len(support_msgs) == 0:
                flow_type = 'abandoned'
            else:
                flow_type = 'escalated'
            
            flow_patterns[flow_type].append(conv_id)
            self.customer_profiles[conv_id]['conversation_flow'] = flow_type
            self.customer_profiles[conv_id]['turn_count'] = total_turns
        
        self.conversation_flows = flow_patterns
        
        for flow, convs in flow_patterns.items():
            print(f"{flow}: {len(convs)} conversations")
    
    def _tag_issue_status(self):
        """
        Tag issues as resolved or open
        """
        print("Tagging issue resolution status...")
        
        resolution_keywords = [
            'resolved', 'fixed', 'working', 'solved', 'thanks', 'brilliant',
            'great', 'perfect', 'sorted', 'helped', 'love', 'üòÄ'
        ]
        
        resolved_count = 0
        open_count = 0
        
        for conv_id, messages in self.conversations.items():
            # Check last few customer messages for resolution indicators
            customer_msgs = [msg for msg in messages if msg['inbound']]
            
            is_resolved = False
            if customer_msgs:
                last_msgs = customer_msgs[-2:] if len(customer_msgs) >= 2 else customer_msgs
                for msg in last_msgs:
                    text = str(msg['text']).lower()
                    if any(keyword in text for keyword in resolution_keywords):
                        is_resolved = True
                        break
            
            self.customer_profiles[conv_id]['is_resolved'] = is_resolved
            
            if is_resolved:
                resolved_count += 1
            else:
                open_count += 1
        
        print(f"Resolved issues: {resolved_count}")
        print(f"Open issues: {open_count}")
        
        return resolved_count, open_count
    
    def _create_customer_cohorts(self):
        """
        Create customer cohorts based on behavior patterns
        """
        print("Creating customer cohorts...")
        
        cohorts = {
            'frustrated_high_priority': [],
            'polite_technical': [],
            'quick_resolvers': [],
            'persistent_customers': []
        }
        
        for conv_id, profile in self.customer_profiles.items():
            sentiment = profile['sentiment']
            flow = profile['conversation_flow']
            turn_count = profile['turn_count']
            
            # Assign to cohorts
            if sentiment == 'frustrated' and profile['sentiment_scores'].get('urgent', 0) > 0:
                cohorts['frustrated_high_priority'].append(conv_id)
            elif sentiment == 'polite' and profile['sentiment_scores'].get('technical', 0) > 0:
                cohorts['polite_technical'].append(conv_id)
            elif flow == 'quick_resolution':
                cohorts['quick_resolvers'].append(conv_id)
            elif turn_count > 6:
                cohorts['persistent_customers'].append(conv_id)
        
        for cohort, customers in cohorts.items():
            print(f"{cohort}: {len(customers)} customers")
        
        return cohorts
    
    def _create_visualizations(self):
        """
        Create visualizations for behavior analysis
        """
        print("Creating behavior analysis visualizations...")
        
        # Sentiment distribution
        sentiments = [profile['sentiment'] for profile in self.customer_profiles.values()]
        sentiment_counts = Counter(sentiments)
        
        # Conversation flow distribution
        flows = [profile['conversation_flow'] for profile in self.customer_profiles.values()]
        flow_counts = Counter(flows)
        
        print("\nSentiment Distribution:")
        for sentiment, count in sentiment_counts.items():
            print(f"  {sentiment}: {count}")
        
        print("\nConversation Flow Distribution:")
        for flow, count in flow_counts.items():
            print(f"  {flow}: {count}")
    
    def build_nba_engine(self):
        """
        Task 3: Next-Best-Action Engine
        """
        print("\n=== TASK 3: NEXT-BEST-ACTION ENGINE ===")
        
        # Define action rules based on customer profiles
        self.action_rules = {
            'frustrated_high_priority': {
                'channel': 'scheduling_phone_call',
                'priority': 'immediate',
                'message_template': 'We understand your frustration and want to resolve this immediately. A specialist will call you within the next hour.'
            },
            'polite_technical': {
                'channel': 'email_reply',
                'priority': 'high',
                'message_template': 'Thank you for your patience. We\'ve prepared detailed technical steps to resolve your issue.'
            },
            'quick_resolvers': {
                'channel': 'twitter_dm_reply',
                'priority': 'normal',
                'message_template': 'Hi! We have a quick solution for your issue. Let\'s get this sorted right away.'
            },
            'persistent_customers': {
                'channel': 'scheduling_phone_call',
                'priority': 'high',
                'message_template': 'We appreciate your patience with this ongoing issue. Let\'s schedule a call to resolve this comprehensively.'
            }
        }
        
        print("NBA Engine rules configured")
        return self.action_rules
    
    def generate_nba_recommendation(self, conversation_id):
        """
        Generate NBA recommendation for a specific conversation
        """
        if conversation_id not in self.customer_profiles:
            return None
        
        profile = self.customer_profiles[conversation_id]
        
        # Skip if already resolved
        if profile.get('is_resolved', False):
            return {
                'customer_id': str(conversation_id),
                'status': 'already_resolved',
                'action': None,
                'channel': None,
                'send_time': None,
                'message': None,
                'reasoning': 'Issue already resolved by customer'
            }
        
        # Determine customer cohort
        cohort = self._determine_cohort(profile)
        
        # Get action rule
        action_rule = self.action_rules.get(cohort, self.action_rules['quick_resolvers'])
        
        # Calculate send time (business hours + urgency)
        send_time = self._calculate_send_time(profile, action_rule['priority'])
        
        # Generate personalized message
        message = self._generate_message(profile, action_rule['message_template'])
        
        recommendation = {
            'customer_id': str(conversation_id),
            'channel': action_rule['channel'],
            'send_time': send_time.isoformat(),
            'message': message,
            'reasoning': self._generate_reasoning(profile, cohort, action_rule),
            'status': 'pending_action'
        }
        
        return recommendation
    
    def _determine_cohort(self, profile):
        """
        Determine which cohort a customer belongs to
        """
        sentiment = profile['sentiment']
        flow = profile['conversation_flow']
        turn_count = profile['turn_count']
        sentiment_scores = profile.get('sentiment_scores', {})
        
        if sentiment == 'frustrated' and sentiment_scores.get('urgent', 0) > 0:
            return 'frustrated_high_priority'
        elif sentiment == 'polite' and sentiment_scores.get('technical', 0) > 0:
            return 'polite_technical'
        elif flow == 'quick_resolution':
            return 'quick_resolvers'
        elif turn_count > 6:
            return 'persistent_customers'
        else:
            return 'quick_resolvers'  # default
    
    def _calculate_send_time(self, profile, priority):
        """
        Calculate optimal send time based on priority and business hours
        """
        now = datetime.now()
        
        if priority == 'immediate':
            return now + timedelta(minutes=30)
        elif priority == 'high':
            return now + timedelta(hours=2)
        else:
            # Next business day morning
            next_day = now + timedelta(days=1)
            return next_day.replace(hour=9, minute=0, second=0, microsecond=0)
    
    def _generate_message(self, profile, template):
        """
        Generate personalized message based on customer profile
        """
        # Simple template personalization
        if profile['sentiment'] == 'frustrated':
            return template + " We sincerely apologize for the inconvenience."
        elif profile['sentiment'] == 'polite':
            return template + " We appreciate your patience and understanding."
        else:
            return template
    
    def _generate_reasoning(self, profile, cohort, action_rule):
        """
        Generate reasoning for the NBA recommendation
        """
        reasoning = f"Customer classified as '{cohort}' based on "
        
        factors = []
        if profile['sentiment'] != 'neutral':
            factors.append(f"sentiment: {profile['sentiment']}")
        
        factors.append(f"conversation flow: {profile['conversation_flow']}")
        factors.append(f"interaction count: {profile['turn_count']}")
        
        reasoning += ", ".join(factors)
        reasoning += f". {action_rule['channel']} selected for optimal resolution efficiency"
        
        if action_rule['channel'] == 'scheduling_phone_call':
            reasoning += " - direct communication needed for complex issue"
        elif action_rule['channel'] == 'email_reply':
            reasoning += " - detailed written response appropriate for technical query"
        else:
            reasoning += " - quick Twitter response suitable for simple issue"
        
        return reasoning
    
    def run_evaluation(self, sample_size=1000):
        """
        Task 4: Evaluation - Run NBA system on sample customers
        """
        print(f"\n=== TASK 4: EVALUATION ===")
        print(f"Running NBA evaluation on {sample_size} customers...")
        
        # Get sample of conversations
        conv_ids = list(self.customer_profiles.keys())
        sample_ids = conv_ids[:min(sample_size, len(conv_ids))]
        
        results = []
        already_resolved = 0
        
        for conv_id in sample_ids:
            recommendation = self.generate_nba_recommendation(conv_id)
            
            # if recommendation and recommendation.get('status') == 'already_resolved':
            #     already_resolved += 1
            #     continue
            
            if not recommendation:
                continue
            
            # Add conversation log
            chat_log = self._generate_chat_log(conv_id)
            
            # Predict issue status after action
            predicted_status = self._predict_issue_status(recommendation)
            
            result = {
                'customer_id': recommendation['customer_id'],
                'channel': recommendation['channel'],
                'send_time': recommendation['send_time'],
                'message': recommendation['message'],
                'reasoning': recommendation['reasoning'],
                'chat_log': chat_log,
                'issue_status': predicted_status
            }
            
            results.append(result)
        
        print(f"Already resolved issues excluded: {already_resolved}")
        print(f"Active recommendations generated: {len(results)}")
        
        # Generate predictions
        if results:  # Only generate predictions if we have results
            self._generate_predictions(results)
        
        return results
    
    def _generate_chat_log(self, conv_id):
        """
        Generate readable chat log for a conversation
        """
        if conv_id not in self.conversations:
            return "No conversation found"
        
        messages = self.conversations[conv_id]
        chat_log = []
        
        for msg in messages:
            if msg['inbound']:
                role = "Customer"
            else:
                role = "Support_agent"
            
            # Clean message text
            text = str(msg['text']).replace('@', '').replace('https://t.co/', '[link]')
            chat_log.append(f"{role}: {text}")
        
        return "\n".join(chat_log)
    
    def _predict_issue_status(self, recommendation):
        """
        Predict issue status after taking the recommended action
        """
        channel = recommendation['channel']
        
        # Simple prediction logic based on channel effectiveness
        if channel == 'scheduling_phone_call':
            return 'resolved'  # Phone calls have highest resolution rate
        elif channel == 'email_reply':
            return 'pending_customer_reply'  # Email requires customer response
        else:
            return 'resolved'  # Twitter DM usually resolves quickly
    
    def _generate_predictions(self, results):
        """
        Generate prediction metrics
        """
        status_counts = Counter([r['issue_status'] for r in results])
        channel_counts = Counter([r['channel'] for r in results])
        
        print("\nPredicted Issue Status Distribution:")
        for status, count in status_counts.items():
            print(f"  {status}: {count}")
        
        print("\nChannel Distribution:")
        for channel, count in channel_counts.items():
            print(f"  {channel}: {count}")
        
        # Estimate resolution rate
        resolved = status_counts.get('resolved', 0)
        total = sum(status_counts.values())
        resolution_rate = (resolved / total * 100) if total > 0 else 0
        
        print(f"\nPredicted Resolution Rate: {resolution_rate:.1f}%")
    
    def export_results(self, results, filename='results.csv'):
        """
        Export results to CSV
        """
        if not results:
            print("No results to export")
            return pd.DataFrame()
            
        df = pd.DataFrame(results)
        df.to_csv(filename, index=False)
        print(f"\nResults exported to {filename}")
        return df



In [4]:
def run_nba_demo_with_file(file_path):
    """
    Run complete NBA system demonstration with CSV file input

    Args:
        file_path: path of the CSV file handed to
            ``CustomerSupportNBA.load_and_clean_data``.

    Returns:
        ``(nba, results_df)``: the populated ``CustomerSupportNBA`` instance
        and the DataFrame returned by ``export_results`` (the export uses that
        method's default file name).
    """
    # The emoji in the status messages below had been stored as mojibake
    # (UTF-8 bytes decoded with a legacy code page); they are restored here.
    print("🚀 Starting Next-Best-Action System Demo with CSV")
    print("=" * 50)

    # Initialize system
    nba = CustomerSupportNBA()

    # Task 1: Data Pipeline with CSV
    nba.load_and_clean_data(file_path=file_path)
    nba.build_conversations()

    # Task 2: User Behavior Analysis
    nba.analyze_user_behavior()

    # Task 3: NBA Engine
    nba.build_nba_engine()

    # Task 4: Evaluation
    results = nba.run_evaluation(sample_size=1000)

    # Export results
    results_df = nba.export_results(results)

    print("\n" + "=" * 50)
    print("✅ NBA System Demo Completed Successfully!")
    print(f"📊 Generated {len(results)} recommendations")
    if len(results) > 0:
        print("📁 Results exported to results.csv")

    return nba, results_df

In [None]:
if __name__ == "__main__":
    # Run the whole demo on the sample file, then show the first few results.
    # The emoji in the messages below had been stored as mojibake (UTF-8 bytes
    # decoded with a legacy code page); they are restored here.
    file_path = 'sample.csv'
    nba_system, results_df = run_nba_demo_with_file(file_path)

    # Display sample results
    if len(results_df) > 0:
        print("\n🔍 SAMPLE NBA RECOMMENDATIONS:")
        print("=" * 50)
        results_list = results_df.to_dict('records')
        for i, result in enumerate(results_list[:3]):
            print(f"\nRecommendation {i+1}:")
            print(f"Customer ID: {result['customer_id']}")
            print(f"Channel: {result['channel']}")
            print(f"Message: {result['message']}")
            print(f"Reasoning: {result['reasoning']}")
            print(f"Predicted Status: {result['issue_status']}")
            print("-" * 30)
    else:
        print("\n⚠️ No active recommendations generated (all issues may be resolved)")