# WhatsApp Chat Analysis

This notebook provides comprehensive analysis of WhatsApp chat data including:

## Analysis Features:
1. **Basic Frequency Analysis**: Message counts, user activity patterns
2. **Temporal Analysis**: Time-based messaging patterns, peak days
3. **Emoji & Word Analysis**: Most used emojis, word frequency, longest messages
4. **Video Call Analysis**: Duration tracking and patterns
5. **Sentiment Analysis**: Positive/negative message classification and trends
6. **Advanced Pattern Analysis**: Specific word usage tracking over time

**Data Source**: Parsed WhatsApp chat data, loaded from `data/whatsapp_parsed_data.csv` (path relative to this notebook)

## 1. Data Loading and Preprocessing

Import necessary libraries and load the parsed WhatsApp chat data for analysis.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import re
from datetime import datetime, timedelta
from collections import Counter
import emoji
import warnings
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure below: stock matplotlib style,
# seaborn's "husl" colour cycle and a 12x8 inch default canvas.
# The order matters: style.use() resets rcParams, so it has to come first.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

print("Libraries imported successfully!")

In [None]:
# Read the chat export produced by the parsing step
df = pd.read_csv('data/whatsapp_parsed_data.csv')

# Glue the separate Day/Month/Year columns into a "D/M/Y" string, then append
# the 12-hour clock Time column and parse the result into a real timestamp
date_pieces = [df[part].astype(str) for part in ('Day', 'Month', 'Year')]
df['date_str'] = date_pieces[0] + '/' + date_pieces[1] + '/' + date_pieces[2]
df['datetime'] = pd.to_datetime(df['date_str'] + ' ' + df['Time'], format='%d/%m/%Y %I:%M:%S %p')

# Convenience columns reused by the temporal analysis cells
timestamps = df['datetime'].dt
df['date'] = timestamps.date
df['hour'] = timestamps.hour
df['weekday'] = timestamps.day_name()

# Quick overview of what was loaded
first_seen, last_seen = df['datetime'].min(), df['datetime'].max()
print("Dataset Information:")
print(f"Total messages: {len(df):,}")
print(f"Date range: {first_seen} to {last_seen}")
print(f"Users: {df['User'].unique()}")
print(f"Columns: {list(df.columns)}")
print(f"\nDataset shape: {df.shape}")

# The bare last expression renders the first rows as a rich table
print("\nFirst 5 rows:")
df.head()

## 2. Basic Text Frequency Analysis

Analyze message frequency patterns and user activity levels.

In [None]:
# Message totals per participant, most active first
user_message_counts = df['User'].value_counts()
total_messages = len(df)

print("Messages per user:")
for user, count in user_message_counts.items():
    print(f"{user}: {count:,} messages ({(count / total_messages) * 100:.1f}%)")

# Static view: bar chart on the left, pie chart on the right
user_palette = ['#FF6B6B', '#4ECDC4']
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
bar_ax, pie_ax = axes

user_message_counts.plot(kind='bar', ax=bar_ax, color=user_palette)
bar_ax.set_title('Messages Sent Per User')
bar_ax.set_xlabel('User')
bar_ax.set_ylabel('Number of Messages')
bar_ax.tick_params(axis='x', rotation=45)

pie_ax.pie(
    user_message_counts.values,
    labels=user_message_counts.index,
    autopct='%1.1f%%',
    colors=user_palette,
    startangle=90,
)
pie_ax.set_title('Message Distribution')

plt.tight_layout()
plt.show()

# Interactive version of the same bar chart
fig_plotly = px.bar(
    x=user_message_counts.index,
    y=user_message_counts.values,
    title="Messages Sent Per User (Interactive)",
    labels={'x': 'User', 'y': 'Number of Messages'},
    color=user_message_counts.index,
)
fig_plotly.show()

## 3. Temporal Analysis

Analyze messaging patterns over time including monthly, yearly, and daily trends.

In [None]:
# Bucket every message into its calendar month, then count per month and per year
df['year_month'] = df['datetime'].dt.to_period('M')
monthly_counts = df.groupby('year_month').size()
yearly_counts = df.groupby('Year').size()

# Two stacked panels: monthly line on top, yearly bars underneath
fig, axes = plt.subplots(2, 1, figsize=(15, 12))
month_ax, year_ax = axes

monthly_counts.plot(kind='line', ax=month_ax, marker='o', linewidth=2, markersize=4)
month_ax.set_title('Messages Per Month Over Time')
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Number of Messages')
month_ax.grid(True, alpha=0.3)
month_ax.tick_params(axis='x', rotation=45)

yearly_counts.plot(kind='bar', ax=year_ax, color='skyblue', alpha=0.8)
year_ax.set_title('Messages Per Year')
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Number of Messages')
year_ax.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

# Text summary of the busiest periods
print("Top 5 most active months:")
print(monthly_counts.nlargest(5))
print("\nMessages by year:")
print(yearly_counts)

# Interactive monthly series; periods are converted to strings so plotly can
# use them as axis labels
month_labels = monthly_counts.index.astype(str)
fig_interactive = px.line(
    x=month_labels,
    y=monthly_counts.values,
    title="Monthly Message Frequency (Interactive)",
    labels={'x': 'Month', 'y': 'Number of Messages'},
)
fig_interactive.update_traces(mode='lines+markers')
fig_interactive.show()

In [None]:
# Daily analysis - find peak messaging days
daily_counts = df.groupby('date').size().sort_values(ascending=False)
# One row per date, one column per user; days a user was silent are filled with 0
daily_counts_by_user = df.groupby(['date', 'User']).size().unstack(fill_value=0)

print("Top 10 most active days (total messages):")
for i, (date, count) in enumerate(daily_counts.head(10).items(), 1):
    print(f"{i}. {date}: {count} messages")

print("\nPeak messaging days by user:")
# Reuse the per-user table instead of re-filtering df for every user.
# dropna() skips rows with a missing user, which would otherwise yield an
# empty series and make idxmax() raise.
for user in df['User'].dropna().unique():
    user_daily = daily_counts_by_user[user]
    peak_day = user_daily.idxmax()
    peak_count = user_daily.max()
    print(f"{user}: {peak_day} ({peak_count} messages)")

# Weekday analysis: always show all seven days, with 0 (not NaN) for days
# that never had a message
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_counts = df.groupby('weekday').size().reindex(weekday_order, fill_value=0)

# Hour analysis: always show all 24 hours so quiet hours appear as zeros
# instead of being silently skipped on the x axis
all_hours = pd.RangeIndex(24, name='hour')
hourly_counts = df.groupby('hour').size().reindex(all_hours, fill_value=0)

# Create subplots for daily patterns
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Top 30 days bar chart
daily_counts.head(30).plot(kind='bar', ax=axes[0,0], color='coral')
axes[0,0].set_title('Top 30 Most Active Days')
axes[0,0].set_xlabel('Date')
axes[0,0].set_ylabel('Messages')
axes[0,0].tick_params(axis='x', rotation=45)

# Weekday distribution
weekday_counts.plot(kind='bar', ax=axes[0,1], color='lightgreen')
axes[0,1].set_title('Messages by Day of Week')
axes[0,1].set_xlabel('Day of Week')
axes[0,1].set_ylabel('Messages')
axes[0,1].tick_params(axis='x', rotation=45)

# Hourly distribution
hourly_counts.plot(kind='line', ax=axes[1,0], marker='o', color='purple')
axes[1,0].set_title('Messages by Hour of Day')
axes[1,0].set_xlabel('Hour')
axes[1,0].set_ylabel('Messages')
axes[1,0].grid(True, alpha=0.3)

# Heatmap of weekday vs hour on a complete 7 x 24 grid (missing cells -> 0)
pivot_data = (
    df.groupby(['weekday', 'hour']).size()
    .unstack(fill_value=0)
    .reindex(index=weekday_order, columns=all_hours, fill_value=0)
)
sns.heatmap(pivot_data, ax=axes[1,1], cmap='YlOrRd', cbar_kws={'label': 'Messages'})
axes[1,1].set_title('Activity Heatmap: Weekday vs Hour')
axes[1,1].set_xlabel('Hour')
axes[1,1].set_ylabel('Day of Week')

plt.tight_layout()
plt.show()

## 4. Emoji and Word Analysis

Analyze emoji usage patterns and most frequently used words.

In [None]:
# Emoji analysis
def extract_emojis(text):
    """Return the emojis contained in ``text``, in order of appearance.

    Uses ``emoji.emoji_list`` so that emojis made of several code points
    (flags, skin-tone variants, ZWJ sequences such as family emojis) are
    returned as one item each. A per-character membership test would split
    them into their component characters and distort the counts.

    Args:
        text: The message text. Anything that is not a non-empty string
            yields an empty list.

    Returns:
        list[str]: One entry per emoji occurrence.
    """
    if not isinstance(text, str) or not text:
        return []
    return [match['emoji'] for match in emoji.emoji_list(text)]

# Extract all emojis
users = list(df['User'].unique())
all_emojis = []
user_emojis = {user: [] for user in users}

# Walk the two needed columns directly; iterrows() would build a full Series
# per row only to read two fields from it
for user, message in zip(df['User'], df['text_message']):
    emojis_in_text = extract_emojis(str(message))
    all_emojis.extend(emojis_in_text)
    user_emojis[user].extend(emojis_in_text)

# Count emojis
emoji_counts = Counter(all_emojis)
user_emoji_counts = {user: Counter(emojis) for user, emojis in user_emojis.items()}

# Display top emojis
print("Top 20 Most Used Emojis (Overall):")
for i, (emoji_char, count) in enumerate(emoji_counts.most_common(20), 1):
    print(f"{i:2d}. {emoji_char} : {count:,} times")

print("\nTop 10 Emojis by User:")
for user in users:
    print(f"\n{user}:")
    for i, (emoji_char, count) in enumerate(user_emoji_counts[user].most_common(10), 1):
        print(f"  {i:2d}. {emoji_char} : {count:,} times")

# Visualize top emojis
if emoji_counts:
    top_emojis = dict(emoji_counts.most_common(15))

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Overall emoji usage
    axes[0].bar(range(len(top_emojis)), list(top_emojis.values()), color='gold')
    axes[0].set_title('Top 15 Most Used Emojis (Overall)')
    axes[0].set_xlabel('Emoji Rank')
    axes[0].set_ylabel('Usage Count')
    axes[0].set_xticks(range(len(top_emojis)))
    axes[0].set_xticklabels(list(top_emojis.keys()), fontsize=16)

    # Emoji usage by user
    top_5_emojis = list(emoji_counts.most_common(5))
    x = np.arange(len(top_5_emojis))

    # Size the grouped bars from the number of users so groups never overlap.
    # With two users this gives the same 0.35 width and the same tick
    # position (x + width/2) as a fixed two-user layout.
    n_users = len(users)
    width = min(0.35, 0.8 / n_users)

    for i, user in enumerate(users):
        user_counts = [user_emoji_counts[user].get(emoji_char, 0) for emoji_char, _ in top_5_emojis]
        axes[1].bar(x + i * width, user_counts, width, label=user, alpha=0.8)

    axes[1].set_title('Top 5 Emojis by User')
    axes[1].set_xlabel('Emoji')
    axes[1].set_ylabel('Usage Count')
    # Centre each tick under its group of bars
    axes[1].set_xticks(x + width * (n_users - 1) / 2)
    axes[1].set_xticklabels([emoji_char for emoji_char, _ in top_5_emojis], fontsize=16)
    axes[1].legend()

    plt.tight_layout()
    plt.show()
else:
    print("No emojis found in the dataset.")

In [None]:
# Word frequency analysis
import string
from collections import defaultdict

# Built once at definition time: the function is called once per message, so
# rebuilding these on every call is wasted work.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_WORD_STOPWORDS = frozenset({
    'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
    'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had',
    'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
    'must', 'can', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him',
    'her', 'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their',
    'this', 'that', 'these', 'those', 'am', 'if', 'then', 'than', 'so', 'very',
    'just', 'now', 'get', 'got', 'go', 'went', 'like', 'also', 'well', 'know',
    'think', 'want', 'see', 'come', 'came', 'back', 'time', 'good', 'make',
    'way', 'even', 'new', 'take', 'say', 'said', 'one', 'two', 'first', 'last',
    # Media-related terms to exclude from word analysis
    'image', 'omitted', 'video', 'audio', 'document', 'sticker', 'gif',
})


def clean_text_for_words(text):
    """Split a message into lowercase words suitable for frequency counting.

    The text is lowercased, ASCII punctuation (``string.punctuation``) is
    removed, and the result is split on whitespace. Words of one or two
    characters and words in the stopword list (common English words plus
    media-placeholder terms) are dropped.

    Args:
        text: The raw message; missing values (NaN/None) are allowed.

    Returns:
        list[str]: The remaining words in their original order; an empty list
        for missing input.
    """
    if pd.isna(text):
        return []

    cleaned = str(text).lower().translate(_PUNCTUATION_TABLE)
    return [word for word in cleaned.split() if len(word) > 2 and word not in _WORD_STOPWORDS]

# Extract words for all users and individually
all_words = []
user_words = defaultdict(list)

# Walk the two needed columns directly; iterrows() would build a full Series
# per row only to read two fields from it
for user, message in zip(df['User'], df['text_message']):
    words = clean_text_for_words(message)
    all_words.extend(words)
    user_words[user].extend(words)

# Count words
word_counts = Counter(all_words)
user_word_counts = {user: Counter(words) for user, words in user_words.items()}

# Display top words
print("Top 50 Most Common Words (Overall):")
for i, (word, count) in enumerate(word_counts.most_common(50), 1):
    print(f"{i:2d}. {word:15} : {count:,} times")

print("\nTop 20 Words by User:")
for user in df['User'].unique():
    print(f"\n{user}:")
    for i, (word, count) in enumerate(user_word_counts[user].most_common(20), 1):
        print(f"  {i:2d}. {word:15} : {count:,} times")

# Visualize top words
if word_counts:
    top_words = dict(word_counts.most_common(20))

    fig, axes = plt.subplots(1, 2, figsize=(20, 8))

    # Overall word usage (most frequent word at the top)
    axes[0].barh(range(len(top_words)), list(top_words.values()), color='lightcoral')
    axes[0].set_title('Top 20 Most Used Words (Overall)')
    axes[0].set_xlabel('Usage Count')
    axes[0].set_ylabel('Words')
    axes[0].set_yticks(range(len(top_words)))
    axes[0].set_yticklabels(list(top_words.keys()))
    axes[0].invert_yaxis()

    # Word usage comparison between users
    users = list(df['User'].unique())
    top_10_words = list(word_counts.most_common(10))
    x = np.arange(len(top_10_words))

    # Size the grouped bars from the number of users so groups never overlap.
    # With two users this gives the same 0.35 width and the same tick
    # position (x + width/2) as a fixed two-user layout.
    n_users = len(users)
    width = min(0.35, 0.8 / n_users)

    for i, user in enumerate(users):
        user_counts = [user_word_counts[user].get(word, 0) for word, _ in top_10_words]
        axes[1].bar(x + i * width, user_counts, width, label=user, alpha=0.8)

    axes[1].set_title('Top 10 Words by User Comparison')
    axes[1].set_xlabel('Words')
    axes[1].set_ylabel('Usage Count')
    # Centre each tick under its group of bars
    axes[1].set_xticks(x + width * (n_users - 1) / 2)
    axes[1].set_xticklabels([word for word, _ in top_10_words], rotation=45)
    axes[1].legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Word Cloud Visualization
from wordcloud import WordCloud
import matplotlib.pyplot as plt

print("=== WORD CLOUD VISUALIZATION ===")

# 2x2 grid: top-left is the overall cloud, the other three panels are per-user
fig, axes = plt.subplots(2, 2, figsize=(20, 16))

# Overall word cloud
if word_counts:
    # word_counts (a Counter) is used directly as the word -> frequency mapping
    wordcloud_overall = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='viridis',
        max_words=100,
        relative_scaling=0.5,
        random_state=42  # fixed seed so the layout is reproducible
    ).generate_from_frequencies(word_counts)

    axes[0,0].imshow(wordcloud_overall, interpolation='bilinear')
    axes[0,0].set_title('Overall Word Cloud (All Messages)', fontsize=16, fontweight='bold')
    axes[0,0].axis('off')
else:
    axes[0,0].text(0.5, 0.5, 'No words found', ha='center', va='center', transform=axes[0,0].transAxes)
    axes[0,0].set_title('Overall Word Cloud')

# Individual user word clouds
users = list(df['User'].unique())
user_positions = [(0,1), (1,0), (1,1)]  # positions for up to 3 users

# zip() stops at the shorter sequence, so at most three users are drawn
for i, (user, (row, col)) in enumerate(zip(users, user_positions)):
    if user_word_counts[user]:
        # Create user-specific word cloud
        user_wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap='plasma' if i == 0 else 'cool',
            max_words=100,
            relative_scaling=0.5,
            random_state=42
        ).generate_from_frequencies(user_word_counts[user])

        axes[row,col].imshow(user_wordcloud, interpolation='bilinear')
        axes[row,col].set_title(f'{user} Word Cloud', fontsize=16, fontweight='bold')
        axes[row,col].axis('off')
    else:
        axes[row,col].text(0.5, 0.5, f'No words found for {user}',
                          ha='center', va='center', transform=axes[row,col].transAxes)
        axes[row,col].set_title(f'{user} Word Cloud')

# Hide every panel that did not receive a user. This covers any user count
# below three, not just the two-user case.
for row, col in user_positions[len(users):]:
    axes[row,col].set_visible(False)

# Say so explicitly rather than dropping extra users without notice
if len(users) > len(user_positions):
    print(f"Note: only the first {len(user_positions)} of {len(users)} users are shown.")

plt.tight_layout()
plt.show()

In [None]:

# Side-by-side clouds of the 50 most frequent words of the first two users
print("\n=== COMBINED USER COMPARISON WORD CLOUD ===")

if len(users) >= 2:
    fig, axes = plt.subplots(1, 2, figsize=(20, 8))

    # Pair each of the first two users with a panel and its own colour map
    for panel, user, palette in zip(axes, users[:2], ('Set1', 'Set2')):
        if not user_word_counts[user]:
            # Nothing to draw for this user; the panel is left as created
            continue

        top_user_words = dict(user_word_counts[user].most_common(50))
        cloud_builder = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap=palette,
            max_words=50,
            relative_scaling=0.5,
            random_state=42,
        )
        user_wordcloud = cloud_builder.generate_from_frequencies(top_user_words)

        panel.imshow(user_wordcloud, interpolation='bilinear')
        panel.set_title(f'{user} - Top 50 Words', fontsize=16, fontweight='bold')
        panel.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
# Show word cloud statistics
print(f"\n=== WORD CLOUD STATISTICS ===")
print(f"Total unique words: {len(word_counts):,}")
# most_common(1) is an empty list when no words survived cleaning, so guard
# before indexing into it
if word_counts:
    top_word, top_word_count = word_counts.most_common(1)[0]
    print(f"Most frequent word: '{top_word}' ({top_word_count:,} times)")
else:
    print("Most frequent word: n/a (no words found)")

for user in users:
    user_unique_words = len(user_word_counts[user])
    if user_unique_words > 0:
        most_frequent_user_word = user_word_counts[user].most_common(1)[0]
        print(f"{user} unique words: {user_unique_words:,}, most frequent: '{most_frequent_user_word[0]}' ({most_frequent_user_word[1]:,} times)")

# Word diversity analysis: distinct words divided by all counted words
print(f"\n=== WORD DIVERSITY ANALYSIS ===")
total_words = sum(word_counts.values())
unique_words = len(word_counts)
word_diversity = unique_words / total_words if total_words > 0 else 0

print(f"Total words used: {total_words:,}")
print(f"Unique words: {unique_words:,}")
print(f"Word diversity ratio: {word_diversity:.4f} (higher = more diverse vocabulary)")

for user in users:
    user_total_words = sum(user_word_counts[user].values())
    user_unique_words = len(user_word_counts[user])
    user_diversity = user_unique_words / user_total_words if user_total_words > 0 else 0

    print(f"{user} - Total: {user_total_words:,}, Unique: {user_unique_words:,}, Diversity: {user_diversity:.4f}")

In [None]:
# Longest messages analysis
def _preview(message, limit):
    """Return ``message`` as text, cut to ``limit`` characters plus '...' when longer."""
    text = str(message)
    return text[:limit] + "..." if len(text) > limit else text


# Missing messages are counted as length 0. Without fillna(''), astype(str)
# would turn NaN into the literal string 'nan' and report 3 characters.
df['message_length'] = df['text_message'].fillna('').astype(str).str.len()

print("=== LONGEST MESSAGES ANALYSIS ===")
print(f"Average message length: {df['message_length'].mean():.1f} characters")
print(f"Median message length: {df['message_length'].median():.1f} characters")
print(f"Maximum message length: {df['message_length'].max():,} characters")

# Longest messages overall
print("\nTop 10 Longest Messages (Overall):")
longest_messages = df.nlargest(10, 'message_length')
for i, (idx, row) in enumerate(longest_messages.iterrows(), 1):
    preview = _preview(row['text_message'], 100)
    print(f"{i:2d}. {row['User']} ({row['message_length']:,} chars): {preview}")
    print(f"    Date: {row['date']}")
    print()

# Longest messages by user
print("Longest Messages by User:")
for user in df['User'].unique():
    user_data = df[df['User'] == user]
    if user_data.empty:
        # e.g. a missing (NaN) user never equals itself; idxmax() would raise
        continue
    longest_msg = user_data.loc[user_data['message_length'].idxmax()]
    preview = _preview(longest_msg['text_message'], 150)
    print(f"\n{user}: {longest_msg['message_length']:,} characters")
    print(f"Date: {longest_msg['date']}")
    print(f"Message: {preview}")

In [None]:
# Message length distribution: histogram, per-user box plot, per-user average
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

axes[0].hist(df['message_length'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
axes[0].set_title('Message Length Distribution')
axes[0].set_xlabel('Message Length (characters)')
axes[0].set_ylabel('Frequency')

df.boxplot(column='message_length', by='User', ax=axes[1])
# pandas adds a figure-level "Boxplot grouped by User" title when `by` is
# used; clear it so it does not sit on top of the panel titles
fig.suptitle('')
axes[1].set_title('Message Length by User')
axes[1].set_xlabel('User')
axes[1].set_ylabel('Message Length (characters)')

user_avg_length = df.groupby('User')['message_length'].mean()
user_avg_length.plot(kind='bar', ax=axes[2], color=['coral', 'lightgreen'])
axes[2].set_title('Average Message Length by User')
axes[2].set_xlabel('User')
axes[2].set_ylabel('Average Length (characters)')
axes[2].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Communication time calculation
# .days floors to whole days; clamp to 1 so a chat that spans less than a
# day does not divide by zero below
total_days = max((df['datetime'].max() - df['datetime'].min()).days, 1)
total_hours = total_days * 24
messages_per_day = len(df) / total_days
avg_time_per_message = 20  # seconds per message; a rough assumption, tune as needed

estimated_texting_time_hours = (len(df) * avg_time_per_message) / 3600
estimated_texting_time_days = estimated_texting_time_hours / 24

print(f"\n=== COMMUNICATION TIME ANALYSIS ===")
print(f"Total conversation period: {total_days:,} days ({total_hours:,} hours)")
print(f"Total messages: {len(df):,}")
print(f"Average messages per day: {messages_per_day:.1f}")
print(f"Estimated time spent texting: {estimated_texting_time_hours:.1f} hours ({estimated_texting_time_days:.1f} days)")
print(f"Percentage of time spent texting: {(estimated_texting_time_hours/total_hours)*100:.3f}%")

## 5. Video Call Duration Analysis

Extract and analyze video call durations from messages.

In [None]:
# Video call analysis
def extract_video_call_duration(text):
    """Return the length in minutes of a call log such as 'Video call, 45 min'.

    The match is case-insensitive (the text is lowercased first) and expects
    the words "video call", then commas/whitespace, then a whole number
    followed by "min".

    Returns:
        int | None: The number of minutes, or None for missing text or text
        that does not contain such a call log.
    """
    if pd.isna(text):
        return None

    found = re.search(r'video call[,\s]+(\d+)\s*min', str(text).lower())
    return int(found.group(1)) if found else None

# Tag every message with its call length in minutes (missing when the message
# is not a video-call log), then keep an independent copy of just the call rows
df['video_call_duration'] = df['text_message'].apply(extract_video_call_duration)
is_video_call = df['video_call_duration'].notna()
video_calls = df.loc[is_video_call].copy()


In [None]:

if len(video_calls) == 0:
    print("No video call messages found in the format 'Video call, X min'")
else:
    call_durations = video_calls['video_call_duration']
    total_call_minutes = call_durations.sum()

    # Headline numbers; the total is shown in minutes, hours and days
    print("=== VIDEO CALL ANALYSIS ===")
    print(f"Total video calls found: {len(video_calls)}")
    print(f"Total video call time: {total_call_minutes:,} minutes")
    print(f"Total video call time: {total_call_minutes/60:.1f} hours")
    print(f"Total video call time: {total_call_minutes/(60*24):.1f} days")
    print(f"Average call duration: {call_durations.mean():.1f} minutes")
    print(f"Longest call: {call_durations.max()} minutes")
    print(f"Shortest call: {call_durations.min()} minutes")

    # Per-user table: number of calls, total and mean duration, total in hours
    print("\nVideo Calls by User:")
    calls_by_user = video_calls.groupby('User')['video_call_duration']
    call_stats_by_user = calls_by_user.agg(['count', 'sum', 'mean'])
    call_stats_by_user.columns = ['Number of Calls', 'Total Minutes', 'Average Duration']
    call_stats_by_user['Total Hours'] = call_stats_by_user['Total Minutes'] / 60
    print(call_stats_by_user)

    # Per-month call count and total minutes
    video_calls['call_month'] = video_calls['datetime'].dt.to_period('M')
    monthly_calls = video_calls.groupby('call_month')['video_call_duration'].agg(['count', 'sum'])

    # 2x2 dashboard
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (hist_ax, count_ax), (minutes_ax, trend_ax) = axes

    # Call duration distribution
    hist_ax.hist(call_durations, bins=20, alpha=0.7, color='lightblue', edgecolor='black')
    hist_ax.set_title('Video Call Duration Distribution')
    hist_ax.set_xlabel('Duration (minutes)')
    hist_ax.set_ylabel('Frequency')

    # Calls by user
    call_counts = video_calls['User'].value_counts()
    call_counts.plot(kind='bar', ax=count_ax, color=['orange', 'green'])
    count_ax.set_title('Number of Video Calls by User')
    count_ax.set_xlabel('User')
    count_ax.set_ylabel('Number of Calls')
    count_ax.tick_params(axis='x', rotation=45)

    # Total minutes by user
    total_minutes = video_calls.groupby('User')['video_call_duration'].sum()
    total_minutes.plot(kind='bar', ax=minutes_ax, color=['coral', 'lightgreen'])
    minutes_ax.set_title('Total Video Call Minutes by User')
    minutes_ax.set_xlabel('User')
    minutes_ax.set_ylabel('Total Minutes')
    minutes_ax.tick_params(axis='x', rotation=45)

    # Calls over time
    monthly_calls['count'].plot(kind='line', ax=trend_ax, marker='o', color='purple')
    trend_ax.set_title('Video Calls per Month')
    trend_ax.set_xlabel('Month')
    trend_ax.set_ylabel('Number of Calls')
    trend_ax.tick_params(axis='x', rotation=45)
    trend_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # A few raw call-log messages as a sanity check on the extraction
    print("\nExample Video Call Messages:")
    for i, (idx, row) in enumerate(video_calls.head(10).iterrows(), 1):
        print(f"{i}. {row['User']} - {row['date']}: {row['text_message']} ({row['video_call_duration']} min)")

## 6. Sentiment Analysis

Analyze the emotional content of messages and identify sentiment trends.

In [None]:
from textblob import TextBlob


def analyze_sentiment(text):
    """Score a message with TextBlob and bucket it into a sentiment label.

    Args:
        text: The message text; missing values and empty strings are allowed.

    Returns:
        tuple: ``(polarity, label)`` where ``polarity`` is TextBlob's polarity
        score and ``label`` is 'positive' (polarity > 0.1), 'negative'
        (polarity < -0.1) or 'neutral' (anything in between). Missing or
        empty text, and text TextBlob fails on, yield ``(0, 'neutral')``.
    """
    if pd.isna(text) or text == '':
        return 0, 'neutral'

    try:
        polarity = TextBlob(str(text)).sentiment.polarity
    except Exception:
        # Best effort: a message TextBlob cannot process counts as neutral.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt through, so a long .apply() run can be stopped.
        return 0, 'neutral'

    if polarity > 0.1:
        sentiment = 'positive'
    elif polarity < -0.1:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'

    return polarity, sentiment

# Score every message once, then split the (polarity, label) pairs into two columns
print("Analyzing sentiment for all messages...")
sentiment_results = df['text_message'].apply(analyze_sentiment)
df['sentiment_polarity'] = [polarity for polarity, _ in sentiment_results]
df['sentiment_label'] = [label for _, label in sentiment_results]


In [None]:

print("=== SENTIMENT ANALYSIS RESULTS ===")

# Share of each sentiment label across the whole chat
sentiment_counts = df['sentiment_label'].value_counts()
message_total = len(df)
print("Overall Sentiment Distribution:")
for sentiment, count in sentiment_counts.items():
    print(f"{sentiment}: {count:,} messages ({(count / message_total) * 100:.1f}%)")

# User x label table of counts, plus the same table as row percentages
print("\nSentiment Distribution by User:")
sentiment_by_user = df.groupby(['User', 'sentiment_label']).size().unstack(fill_value=0)
messages_per_user = sentiment_by_user.sum(axis=1)
sentiment_by_user_pct = sentiment_by_user.div(messages_per_user, axis=0) * 100

# Fixed display order, limited to the labels that actually occur
labels_present = [label for label in ('positive', 'neutral', 'negative')
                  if label in sentiment_by_user.columns]

for user in df['User'].unique():
    print(f"\n{user}:")
    for sentiment in labels_present:
        count = sentiment_by_user.loc[user, sentiment]
        pct = sentiment_by_user_pct.loc[user, sentiment]
        print(f"  {sentiment.capitalize()}: {count:,} ({pct:.1f}%)")


In [None]:

# Visualizations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Colours are looked up by label, never by position. sentiment_counts is
# ordered by frequency and sentiment_by_user's columns alphabetically, so a
# positional colour list would paint e.g. 'negative' green in the bar chart.
sentiment_colors = {'positive': 'lightgreen', 'neutral': 'lightgray', 'negative': 'lightcoral'}

# Overall sentiment pie chart
colors = [sentiment_colors.get(label, 'lightgray') for label in sentiment_counts.index]
sentiment_counts.plot(kind='pie', ax=axes[0,0], autopct='%1.1f%%', colors=colors)
axes[0,0].set_title('Overall Sentiment Distribution')
axes[0,0].set_ylabel('')

# Sentiment by user
bar_colors = [sentiment_colors.get(label, 'lightgray') for label in sentiment_by_user.columns]
sentiment_by_user.plot(kind='bar', ax=axes[0,1], color=bar_colors)
axes[0,1].set_title('Sentiment Count by User')
axes[0,1].set_xlabel('User')
axes[0,1].set_ylabel('Number of Messages')
axes[0,1].tick_params(axis='x', rotation=45)
axes[0,1].legend()

# Sentiment polarity distribution (dashed line marks the neutral point)
axes[1,0].hist(df['sentiment_polarity'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
axes[1,0].set_title('Sentiment Polarity Distribution')
axes[1,0].set_xlabel('Polarity Score (-1 to 1)')
axes[1,0].set_ylabel('Frequency')
axes[1,0].axvline(0, color='red', linestyle='--', alpha=0.7)

# Average sentiment by user
avg_sentiment = df.groupby('User')['sentiment_polarity'].mean()
avg_sentiment.plot(kind='bar', ax=axes[1,1], color=['orange', 'green'])
axes[1,1].set_title('Average Sentiment Polarity by User')
axes[1,1].set_xlabel('User')
axes[1,1].set_ylabel('Average Polarity')
axes[1,1].tick_params(axis='x', rotation=45)
axes[1,1].axhline(0, color='red', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
# Month bucket used by all monthly sentiment aggregations
df['sentiment_month'] = df['datetime'].dt.to_period('M')

# Per month: message count per label, and mean polarity
monthly_sentiment = (
    df.groupby(['sentiment_month', 'sentiment_label'])
    .size()
    .unstack(fill_value=0)
)
monthly_avg_sentiment = df.groupby('sentiment_month')['sentiment_polarity'].mean()

# Mean polarity per calendar day.
# NOTE: this series is ordered by polarity (lowest first), not by date.
daily_sentiment = df.groupby('date')['sentiment_polarity'].mean().sort_values()

# The two extreme days and their scores
most_negative_day, most_positive_day = daily_sentiment.idxmin(), daily_sentiment.idxmax()
lowest_polarity, highest_polarity = daily_sentiment.min(), daily_sentiment.max()

print("=== TEMPORAL SENTIMENT ANALYSIS ===")
print(f"Most negative day: {most_negative_day} (avg polarity: {lowest_polarity:.3f})")
print(f"Most positive day: {most_positive_day} (avg polarity: {highest_polarity:.3f})")

In [None]:

def _report_extreme_day(day, tone, ascending):
    """Print the five most ``tone`` messages sent on ``day`` and return that day's messages.

    The returned frame holds every message of the day, ordered by polarity
    (ascending or descending as requested).
    """
    day_messages = df[df['date'] == day].sort_values('sentiment_polarity', ascending=ascending)
    print(f"\nMessages from most {tone} day ({day}):")
    print(f"Total messages: {len(day_messages)}")
    print(f"Most {tone} messages:")
    for rank, (_, message) in enumerate(day_messages.head(5).iterrows(), 1):
        text = str(message['text_message'])
        # Long messages are cut to 100 characters for display
        preview = text[:100] + "..." if len(text) > 100 else text
        print(f"{rank}. {message['User']} (polarity: {message['sentiment_polarity']:.3f}): {preview}")
    return day_messages


# Lowest-polarity messages of the most negative day, then the mirror image
# for the most positive day
negative_day_messages = _report_extreme_day(most_negative_day, 'negative', ascending=True)
positive_day_messages = _report_extreme_day(most_positive_day, 'positive', ascending=False)

In [None]:

# Plot temporal sentiment trends
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# Monthly sentiment trends. Colours are looked up by column label so they
# stay correct even when one of the three labels never occurs.
sentiment_colors = {'negative': 'lightcoral', 'neutral': 'lightgray', 'positive': 'lightgreen'}
area_colors = [sentiment_colors.get(label, 'lightgray') for label in monthly_sentiment.columns]
monthly_sentiment.plot(kind='area', ax=axes[0], alpha=0.7, color=area_colors)
axes[0].set_title('Monthly Sentiment Trends (Message Count)')
axes[0].set_xlabel('Month')
axes[0].set_ylabel('Number of Messages')
axes[0].legend()

# Daily average sentiment (moving average for smoother visualization).
# daily_sentiment is ordered by polarity, so it has to be put back into date
# order first; otherwise each window averages days with similar scores
# instead of neighbouring days. The window spans 7 rows, i.e. 7 days that
# have messages, which can be more than 7 calendar days.
daily_sentiment_smooth = daily_sentiment.sort_index().rolling(window=7).mean()
daily_sentiment_smooth.plot(kind='line', ax=axes[1], color='purple', alpha=0.8)
axes[1].set_title('Daily Average Sentiment (7-day moving average)')
axes[1].set_xlabel('Date')
axes[1].set_ylabel('Average Sentiment Polarity')
axes[1].axhline(0, color='red', linestyle='--', alpha=0.7)
axes[1].grid(True, alpha=0.3)

# Highlight extreme days (plotted at their raw, unsmoothed daily averages)
axes[1].scatter([most_negative_day], [daily_sentiment.min()], color='red', s=100, zorder=5, label='Most Negative Day')
axes[1].scatter([most_positive_day], [daily_sentiment.max()], color='green', s=100, zorder=5, label='Most Positive Day')
axes[1].legend()

plt.tight_layout()
plt.show()

In [None]:


# Per user: how positive each month was relative to its negative messages
print("\nMonthly Sentiment by User:")
for user in df['User'].unique():
    user_rows = df[df['User'] == user]
    user_monthly_sentiment = (
        user_rows.groupby(['sentiment_month', 'sentiment_label'])
        .size()
        .unstack(fill_value=0)
    )
    print(f"\n{user} - Monthly positive/negative message ratio:")

    # The ratio is only defined when the user has both kinds of messages
    if {'positive', 'negative'}.issubset(user_monthly_sentiment.columns):
        # +1 in the denominator keeps months without negative messages finite
        ratio = user_monthly_sentiment['positive'].div(user_monthly_sentiment['negative'].add(1))
        print(f"Average positive/negative ratio: {ratio.mean():.2f}")
        print("Top 3 most positive months:")
        print(ratio.nlargest(3))

## 7. Specific Word Pattern Analysis

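This section tracks the specific words/patterns called for in our requirements (currently `pizza`, `love`, and `sorry`): how often each one appears, how its usage changes over time, and who uses it.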

In [None]:
# Track specific words/patterns over time
import re

# Words to track, as display name -> regex. \b anchors restrict each pattern
# to whole words (so "love" does not match inside "glove").
patterns = dict(
    pizza=r'\bpizza\b',
    love=r'\blove\b',
    sorry=r'\bsorry\b',
)

def find_pattern_matches(text, pattern):
    """Return every case-insensitive match of ``pattern`` found in ``text``.

    Args:
        text: Message content. Any value ``pd.isna`` flags as missing yields an
            empty list; everything else is coerced with ``str``.
        pattern: Regular expression handed to ``re.findall``.

    Returns:
        The list produced by ``re.findall``. The text is lower-cased before
        searching, so matched text comes back in lower case.
    """
    if not pd.isna(text):
        lowered = str(text).lower()
        return re.findall(pattern, lowered, flags=re.IGNORECASE)
    return []

# For every tracked pattern, add a per-message match list and a per-message
# match count to df, and record the pattern's overall total in pattern_data
pattern_data = {}
for pattern_name, pattern_regex in patterns.items():
    matches_col = f'{pattern_name}_matches'
    count_col = f'{pattern_name}_count'

    # args= forwards the regex as the second positional argument of the helper
    df[matches_col] = df['text_message'].apply(find_pattern_matches, args=(pattern_regex,))
    df[count_col] = df[matches_col].apply(len)
    pattern_data[pattern_name] = df[count_col].sum()


In [None]:
# Headline totals per tracked pattern
print("=== SPECIFIC WORD PATTERN ANALYSIS ===")
print("Total usage count for each pattern:")
for pattern, count in pattern_data.items():
    print(f"{pattern}: {count} occurrences")

# Month-by-month totals: one column per pattern, indexed by calendar month
month_of_message = df['datetime'].dt.to_period('M')
monthly_patterns = pd.DataFrame({
    pattern_name: df.groupby(month_of_message)[f'{pattern_name}_count'].sum()
    for pattern_name in patterns
})

# Busiest month for each pattern that occurs at least once
print("\nPeak usage months for each pattern:")
for pattern in patterns:
    usage_by_month = monthly_patterns[pattern]
    if usage_by_month.sum() > 0:
        peak_month = usage_by_month.idxmax()
        peak_count = usage_by_month.max()
        print(f"{pattern}: {peak_month} ({peak_count} occurrences)")

# Share of each pattern's usage attributable to each user, heaviest user first
print("\nPattern usage by user:")
for pattern_name in patterns:
    user_pattern_usage = (
        df.groupby('User')[f'{pattern_name}_count']
        .sum()
        .sort_values(ascending=False)
    )
    pattern_total = user_pattern_usage.sum()
    if pattern_total > 0:
        print(f"\n{pattern_name}:")
        # Users who never used the pattern are left out of the listing
        for user, count in user_pattern_usage[user_pattern_usage > 0].items():
            percentage = (count / pattern_total) * 100
            print(f"  {user}: {count} times ({percentage:.1f}%)")

In [None]:

# Plot pattern usage over time: one panel per pattern on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(20, 12))
axes = axes.flatten()

# Select top 4 most used patterns for visualization (most frequent first)
top_patterns = sorted(pattern_data.items(), key=lambda x: x[1], reverse=True)[:4]

# Only patterns that actually occur get a panel. Because top_patterns is sorted
# in descending order, these always occupy the leading axes.
plotted_patterns = [(name, total) for name, total in top_patterns if total > 0]

for ax, (pattern_name, total_count) in zip(axes, plotted_patterns):
    monthly_usage = monthly_patterns[pattern_name]
    monthly_usage.plot(kind='line', ax=ax, marker='o', linewidth=2)
    ax.set_title(f"'{pattern_name}' usage over time (Total: {total_count})")
    ax.set_xlabel('Month')
    ax.set_ylabel('Usage Count')
    ax.grid(True, alpha=0.3)

# Hide every panel that received no data: the slots beyond the number of
# tracked patterns AND the slots of patterns that never occur in the chat
# (the latter used to be left as blank, visible axes).
for ax in axes[len(plotted_patterns):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:

# Heatmap of total pattern usage per user (rows: patterns, columns: users)
if any(total > 0 for total in pattern_data.values()):
    count_columns = {pattern_name: f'{pattern_name}_count' for pattern_name in patterns}

    # One column per user, holding that user's total for every tracked pattern
    user_pattern_matrix = pd.DataFrame({
        user: pd.Series({
            pattern_name: df.loc[df['User'] == user, count_col].sum()
            for pattern_name, count_col in count_columns.items()
        })
        for user in df['User'].unique()
    })

    # Drop patterns (rows) and users (columns) that have no usage at all
    used_patterns = (user_pattern_matrix > 0).any(axis=1)
    active_users = (user_pattern_matrix > 0).any(axis=0)
    user_pattern_matrix = user_pattern_matrix.loc[used_patterns, active_users]

    if not user_pattern_matrix.empty:
        plt.figure(figsize=(12, 8))
        sns.heatmap(
            user_pattern_matrix,
            annot=True,
            fmt='d',
            cmap='YlOrRd',
            cbar_kws={'label': 'Usage Count'},
        )
        plt.title('Pattern Usage by User (Heatmap)')
        plt.xlabel('Users')
        plt.ylabel('Patterns')
        plt.tight_layout()
        plt.show()

In [None]:
# Show up to three sample messages for each interesting pattern.
# The sample is drawn with a fixed seed so that Restart & Run All prints the
# same messages every time (it used to change on every execution).
interesting_patterns = ['pizza', 'love', 'sorry']
for pattern in interesting_patterns:
    pattern_messages = df[df[f'{pattern}_count'] > 0]
    if len(pattern_messages) > 0:
        print(f"\nSample messages containing '{pattern}':")
        sampled_messages = pattern_messages.sample(min(3, len(pattern_messages)), random_state=42)
        for i, (idx, row) in enumerate(sampled_messages.iterrows(), 1):
            message = str(row['text_message'])
            # Long messages are cut to 150 characters and marked with an ellipsis
            preview = message[:150] + "..." if len(message) > 150 else message
            print(f"{i}. {row['User']} ({row['date']}): {preview}")

print("\n" + "="*50)
print("ANALYSIS COMPLETE!")
print("="*50)

In [None]:
# First instances and chronological context analysis:
# for every tracked pattern, print the earliest message containing it, the
# five messages that follow it in time, and a short usage summary.
print("\n" + "="*70)
print("FIRST INSTANCES AND CHRONOLOGICAL CONTEXT")
print("="*70)

# Target patterns for detailed analysis
target_patterns = list(patterns.keys())

# Sort the chat once instead of once per pattern. A stable sort keeps messages
# that share a timestamp in their original row order, so the "first instance"
# and the following messages are deterministic.
messages_by_time = df.sort_values('datetime', kind='stable')

for pattern in target_patterns:
    pattern_messages_sorted = messages_by_time[messages_by_time[f'{pattern}_count'] > 0]

    if len(pattern_messages_sorted) > 0:
        first_instance = pattern_messages_sorted.iloc[0]
        last_instance = pattern_messages_sorted.iloc[-1]

        print(f"\n🔍 FIRST INSTANCE OF '{pattern.upper()}':")
        print(f"Date: {first_instance['date']}")
        print(f"Time: {first_instance['Time']}")
        print(f"User: {first_instance['User']}")
        print(f"Message: {first_instance['text_message']}")

        # Next 5 messages chronologically (by datetime, not by index).
        # The comparison is strict, so messages carrying exactly the same
        # timestamp as the first instance are not listed.
        first_msg_datetime = first_instance['datetime']
        subsequent_messages = messages_by_time[messages_by_time['datetime'] > first_msg_datetime].head(5)

        if len(subsequent_messages) > 0:
            print(f"\n📝 NEXT 5 MESSAGES AFTER FIRST '{pattern.upper()}' INSTANCE:")
            for i, (idx, row) in enumerate(subsequent_messages.iterrows(), 1):
                # Elapsed time since the first instance, shown as whole hours and minutes
                time_diff = (row['datetime'] - first_msg_datetime).total_seconds()
                hours = int(time_diff // 3600)
                minutes = int((time_diff % 3600) // 60)

                if hours > 0:
                    time_str = f"({hours}h {minutes}m later)"
                else:
                    time_str = f"({minutes}m later)"

                print(f"{i}. {row['User']} - {row['date']} {row['Time']} {time_str}")
                print(f"   {row['text_message']}")
                print()
        else:
            print(f"No subsequent messages found after the first '{pattern}' instance.")

        # Additional context: total occurrences and the date range of usage
        total_occurrences = pattern_messages_sorted[f'{pattern}_count'].sum()
        first_date = first_instance['date']
        last_date = last_instance['date']

        print("📊 Pattern Summary:")
        print(f"   Total occurrences: {total_occurrences}")
        print(f"   First used: {first_date}")
        print(f"   Last used: {last_date}")
        print(f"   Usage span: {(pd.to_datetime(last_date) - pd.to_datetime(first_date)).days} days")

    else:
        print(f"\n❌ No instances of '{pattern.upper()}' found in the conversation.")

print("\n" + "="*50)
print("ANALYSIS COMPLETE!")
print("="*50)

## 8. Good Morning/Good Night Analysis

This section analyzes the frequency of "good morning" and "good night" messages by user and over time.

In [None]:
# Good Morning and Good Night Analysis
import re

# Compiled once at definition time instead of on every call.
# All patterns are applied to lower-cased text.
_GOOD_MORNING_PATTERNS = tuple(re.compile(expr) for expr in (
    r'\bgood\s*morning\b',
    r'\bgd\s*morning\b',
    r'\bg\s*morning\b',
    r'\bmorning\b(?!\s*(shift|workout|run|jog))',  # morning but not morning shift/workout
    r'\bgm\b(?!\s*(shift|car|music))',  # gm but not GM shift/car/music
    r'\bgood\s*mrng\b',
    r'\bgut+en\s*morgen\b',   # German "guten morgen" (the "gutten" misspelling still matches)
    r'\bbuenos\s*d[ií]as\b',  # Spanish, with or without the accent
    r'\bbonjour\b',           # French
))


def detect_good_morning(text):
    """Detect good morning messages (various spellings and languages).

    Args:
        text: Message content. Values that ``pd.isna`` flags as missing are
            treated as "not a greeting"; everything else is coerced with
            ``str`` and lower-cased before matching.

    Returns:
        True if any good-morning pattern is found in the text, else False.
    """
    if pd.isna(text):
        return False

    lowered = str(text).lower()
    return any(pattern.search(lowered) for pattern in _GOOD_MORNING_PATTERNS)

# Compiled once at definition time instead of on every call.
# All patterns are applied to lower-cased text.
_GOOD_NIGHT_PATTERNS = tuple(re.compile(expr) for expr in (
    r'\bgood\s*night\b',
    r'\bgd\s*night\b',
    r'\bg\s*night\b',
    r'\bnight\b(?!\s*(shift|out|club|life))',  # night but not night shift/out/club
    r'\bgn\b(?!\s*(shift|out))',  # gn but not GN shift/out
    r'\bgood\s*nite\b',
    r'\bgood\s*noite\b',
    r'\bboa\s*noite\b',       # Portuguese
    r'\bnighty\b',
    r'\bgut+en?\s*nacht\b',   # German "gute nacht" (also matches "guten"/"gutten nacht")
    r'\bbuenas\s*noches\b',   # Spanish
    r'\bbonne\s*nuit\b',      # French
    r'\bsweet\s*dreams\b',
    r'\bsleep\s*well\b',
    r'\bsleep\s*tight\b',
))


def detect_good_night(text):
    """Detect good night messages (various spellings and languages).

    Args:
        text: Message content. Values that ``pd.isna`` flags as missing are
            treated as "not a greeting"; everything else is coerced with
            ``str`` and lower-cased before matching.

    Returns:
        True if any good-night pattern is found in the text, else False.
    """
    if pd.isna(text):
        return False

    lowered = str(text).lower()
    return any(pattern.search(lowered) for pattern in _GOOD_NIGHT_PATTERNS)

# Flag every message with each greeting detector, storing the result as a boolean column
for flag_column, detector in (
    ('is_good_morning', detect_good_morning),
    ('is_good_night', detect_good_night),
):
    df[flag_column] = df['text_message'].apply(detector)

# Independent copies of the flagged rows, so later cells can add columns to them
good_morning_msgs = df.loc[df['is_good_morning']].copy()
good_night_msgs = df.loc[df['is_good_night']].copy()

In [None]:
print("=== GOOD MORNING & GOOD NIGHT ANALYSIS ===")


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0 when ``whole`` is 0."""
    return (part / whole * 100) if whole > 0 else 0


# Overall statistics
total_gm = len(good_morning_msgs)
total_gn = len(good_night_msgs)
total_messages = len(df)

# _percent guards the empty-dataset case, which used to raise ZeroDivisionError
print(f"Total Good Morning messages: {total_gm:,} ({_percent(total_gm, total_messages):.2f}% of all messages)")
print(f"Total Good Night messages: {total_gn:,} ({_percent(total_gn, total_messages):.2f}% of all messages)")

# Frequency by user
print("\n=== FREQUENCY BY USER ===")
gm_by_user = good_morning_msgs['User'].value_counts()
gn_by_user = good_night_msgs['User'].value_counts()

greeting_users = df['User'].unique()
messages_per_user = df['User'].value_counts()

# The same report is printed for both greeting types
for heading, short_name, by_user, greeting_total in (
    ("Good Morning messages by user:", "GM", gm_by_user, total_gm),
    ("\nGood Night messages by user:", "GN", gn_by_user, total_gn),
):
    print(heading)
    for user in greeting_users:
        count = by_user.get(user, 0)
        percentage = _percent(count, greeting_total)
        user_percentage = _percent(count, messages_per_user.get(user, 0))
        print(f"  {user}: {count:,} messages ({percentage:.1f}% of all {short_name}, {user_percentage:.2f}% of their messages)")

# Combined user statistics
user_greeting_stats = pd.DataFrame({
    'Good Morning': [gm_by_user.get(user, 0) for user in greeting_users],
    'Good Night': [gn_by_user.get(user, 0) for user in greeting_users],
}, index=greeting_users)

user_greeting_stats['Total Greetings'] = user_greeting_stats['Good Morning'] + user_greeting_stats['Good Night']
# +1 in the denominator avoids division by zero for users with no Good Night messages
user_greeting_stats['GM/GN Ratio'] = user_greeting_stats['Good Morning'] / (user_greeting_stats['Good Night'] + 1)

print("\nCombined Greeting Statistics:")
print(user_greeting_stats)

# Time-based analysis: messages per calendar month and the busiest month
if total_gm > 0:
    good_morning_msgs['month_year'] = good_morning_msgs['datetime'].dt.to_period('M')
    gm_monthly = good_morning_msgs.groupby('month_year').size()

    peak_gm_month = gm_monthly.idxmax()
    peak_gm_count = gm_monthly.max()
    print(f"\nPeak Good Morning month: {peak_gm_month} ({peak_gm_count} messages)")

if total_gn > 0:
    good_night_msgs['month_year'] = good_night_msgs['datetime'].dt.to_period('M')
    gn_monthly = good_night_msgs.groupby('month_year').size()

    peak_gn_month = gn_monthly.idxmax()
    peak_gn_count = gn_monthly.max()
    print(f"Peak Good Night month: {peak_gn_month} ({peak_gn_count} messages)")

# Show some sample messages. A fixed seed keeps the printed samples identical
# across runs (they used to change on every execution).
print("\n=== SAMPLE MESSAGES ===")
if total_gm > 0:
    print("Sample Good Morning messages:")
    gm_samples = good_morning_msgs.sample(min(5, total_gm), random_state=42)
    for i, (idx, row) in enumerate(gm_samples.iterrows(), 1):
        print(f"{i}. {row['User']} ({row['date']}): {row['text_message']}")

if total_gn > 0:
    print("\nSample Good Night messages:")
    gn_samples = good_night_msgs.sample(min(5, total_gn), random_state=42)
    for i, (idx, row) in enumerate(gn_samples.iterrows(), 1):
        print(f"{i}. {row['User']} ({row['date']}): {row['text_message']}")

In [None]:
# Greeting dashboard: per-user comparisons on the top row, monthly trends on the bottom row
def _label_monthly_axis(ax, title):
    """Apply the shared title/labels/grid/tick styling of the monthly trend panels."""
    ax.set_title(title)
    ax.set_xlabel('Month')
    ax.set_ylabel('Number of Messages')
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)


fig, axes = plt.subplots(2, 3, figsize=(20, 12))
(users_ax, share_ax, ratio_ax), trend_axes = axes

# 1. Good Morning vs Good Night counts, side by side for each user
user_greeting_stats[['Good Morning', 'Good Night']].plot(
    kind='bar', ax=users_ax, color=['gold', 'navy'], alpha=0.8
)
users_ax.set_title('Good Morning vs Good Night Messages by User')
users_ax.set_xlabel('User')
users_ax.set_ylabel('Number of Messages')
users_ax.tick_params(axis='x', rotation=45)
users_ax.legend()

# 2. Overall split between the two greeting types (skipped when there are none)
greeting_totals = [total_gm, total_gn]
greeting_labels = ['Good Morning', 'Good Night']
if sum(greeting_totals) > 0:
    share_ax.pie(
        greeting_totals,
        labels=greeting_labels,
        autopct='%1.1f%%',
        colors=['gold', 'navy'],
        startangle=90,
    )
    share_ax.set_title('Overall GM vs GN Distribution')

# 3. GM/GN ratio per user, with a reference line where both are equal
user_greeting_stats['GM/GN Ratio'].plot(kind='bar', ax=ratio_ax, color='purple', alpha=0.7)
ratio_ax.set_title('Good Morning to Good Night Ratio by User')
ratio_ax.set_xlabel('User')
ratio_ax.set_ylabel('GM/GN Ratio')
ratio_ax.tick_params(axis='x', rotation=45)
ratio_ax.axhline(1, color='red', linestyle='--', alpha=0.7, label='Equal ratio')
ratio_ax.legend()

# 4.-6. Monthly trends; the layout depends on which greeting types exist
if total_gm > 0 and total_gn > 0:
    # Put both series on every month the chat spans, with 0 for silent months
    all_months = pd.period_range(start=df['datetime'].min(), end=df['datetime'].max(), freq='M')
    gm_monthly_full = gm_monthly.reindex(all_months, fill_value=0)
    gn_monthly_full = gn_monthly.reindex(all_months, fill_value=0)
    combined_monthly = pd.DataFrame({
        'Good Morning': gm_monthly_full,
        'Good Night': gn_monthly_full
    })

    # Both series together
    combined_monthly.plot(
        kind='line', ax=trend_axes[0], marker='o', linewidth=2,
        color=['gold', 'navy'], alpha=0.8
    )
    _label_monthly_axis(trend_axes[0], 'Good Morning & Good Night Messages Over Time')
    trend_axes[0].legend()

    # Good Morning on its own
    gm_monthly_full.plot(
        kind='line', ax=trend_axes[1], marker='o', linewidth=2, color='gold', alpha=0.8
    )
    _label_monthly_axis(trend_axes[1], 'Good Morning Messages Over Time')

    # Good Night on its own
    gn_monthly_full.plot(
        kind='line', ax=trend_axes[2], marker='o', linewidth=2, color='navy', alpha=0.8
    )
    _label_monthly_axis(trend_axes[2], 'Good Night Messages Over Time')

elif total_gm > 0:
    # Only Good Morning data: a single trend panel, the other two are hidden
    gm_monthly.plot(kind='line', ax=trend_axes[0], marker='o', linewidth=2, color='gold')
    _label_monthly_axis(trend_axes[0], 'Good Morning Messages Over Time')
    for unused_ax in trend_axes[1:]:
        unused_ax.set_visible(False)

elif total_gn > 0:
    # Only Good Night data: a single trend panel, the other two are hidden
    gn_monthly.plot(kind='line', ax=trend_axes[0], marker='o', linewidth=2, color='navy')
    _label_monthly_axis(trend_axes[0], 'Good Night Messages Over Time')
    for unused_ax in trend_axes[1:]:
        unused_ax.set_visible(False)

else:
    # No greetings at all: show a placeholder in every trend panel
    for empty_ax in trend_axes:
        empty_ax.text(0.5, 0.5, 'No greeting messages found',
                      ha='center', va='center', transform=empty_ax.transAxes)
        empty_ax.set_xlim(0, 1)
        empty_ax.set_ylim(0, 1)

plt.tight_layout()
plt.show()


In [None]:

# Interactive Plotly visualization for time series
if total_gm > 0 or total_gn > 0:
    print("\n=== INTERACTIVE TIME SERIES VISUALIZATION ===")

    # Every calendar month spanned by the chat. Both traces are put on this
    # common axis with 0 for months without greetings, matching the zero-filled
    # static chart; previously such months were simply absent and the line
    # bridged the gap as if the count had been interpolated.
    interactive_months = pd.period_range(start=df['datetime'].min(), end=df['datetime'].max(), freq='M')
    interactive_month_labels = [str(month) for month in interactive_months]

    def _add_monthly_trace(figure, monthly_counts, name, color):
        """Add one zero-filled monthly line for ``monthly_counts`` to ``figure``."""
        filled_counts = monthly_counts.reindex(interactive_months, fill_value=0)
        figure.add_trace(go.Scatter(
            x=interactive_month_labels,
            y=filled_counts.values,
            mode='lines+markers',
            name=name,
            line=dict(color=color, width=3),
            marker=dict(size=8)
        ))

    # Create interactive plot; each trace is added only when its monthly series exists
    fig_interactive = go.Figure()

    if total_gm > 0:
        _add_monthly_trace(fig_interactive, gm_monthly, 'Good Morning', 'gold')

    if total_gn > 0:
        _add_monthly_trace(fig_interactive, gn_monthly, 'Good Night', 'darkblue')

    fig_interactive.update_layout(
        title='Good Morning & Good Night Messages Over Time (Interactive)',
        xaxis_title='Month',
        yaxis_title='Number of Messages',
        hovermode='x unified',
        width=1000,
        height=500
    )

    fig_interactive.show()

# Additional analysis: Time of day patterns
def _print_top_hours(hour_counts, greeting_total, top_n=10):
    """Print the ``top_n`` hours with the most messages, busiest hour first."""
    # nlargest ranks by message count; the previous sort_index().head(10)
    # listed the ten EARLIEST hours of the day instead of the most common ones.
    for hour, count in hour_counts.nlargest(top_n).items():
        time_str = f"{hour:02d}:00"
        percentage = (count / greeting_total) * 100
        print(f"  {time_str}: {count:,} messages ({percentage:.1f}%)")


if total_gm > 0:
    # Kept ordered by hour of day, as before; ranking happens only for the printout
    gm_hours = good_morning_msgs['hour'].value_counts().sort_index()
    print("\n=== GOOD MORNING TIME PATTERNS ===")
    print("Most common hours for Good Morning messages:")
    _print_top_hours(gm_hours, total_gm)

if total_gn > 0:
    gn_hours = good_night_msgs['hour'].value_counts().sort_index()
    print("\n=== GOOD NIGHT TIME PATTERNS ===")
    print("Most common hours for Good Night messages:")
    _print_top_hours(gn_hours, total_gn)

In [None]:
# Hourly and weekly patterns visualization
if total_gm > 0 or total_gn > 0:
    # Defined up front: it used to be created inside the Good Morning branch
    # only, so a chat with Good Night but no Good Morning messages hit a
    # NameError in the Good Night weekday panel.
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    def _draw_count_panel(ax, counts, color, title, xlabel, rotation, empty_message):
        """Bar-plot ``counts`` on ``ax``; show ``empty_message`` when ``counts`` is None."""
        if counts is None:
            ax.text(0.5, 0.5, empty_message,
                    ha='center', va='center', transform=ax.transAxes)
            return
        counts.plot(kind='bar', ax=ax, color=color, alpha=0.8)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Number of Messages')
        ax.tick_params(axis='x', rotation=rotation)

    # Counts per hour of day and per weekday; None marks "no data for this greeting"
    if total_gm > 0:
        gm_hours = good_morning_msgs['hour'].value_counts().sort_index()
        gm_weekdays = good_morning_msgs['weekday'].value_counts().reindex(weekday_order, fill_value=0)
        gm_panels = (gm_hours, gm_weekdays)
    else:
        gm_panels = (None, None)

    if total_gn > 0:
        gn_hours = good_night_msgs['hour'].value_counts().sort_index()
        gn_weekdays = good_night_msgs['weekday'].value_counts().reindex(weekday_order, fill_value=0)
        gn_panels = (gn_hours, gn_weekdays)
    else:
        gn_panels = (None, None)

    # Top row: hour of day
    _draw_count_panel(axes[0,0], gm_panels[0], 'gold',
                      'Good Morning Messages by Hour of Day', 'Hour', 0,
                      'No Good Morning messages found')
    _draw_count_panel(axes[0,1], gn_panels[0], 'navy',
                      'Good Night Messages by Hour of Day', 'Hour', 0,
                      'No Good Night messages found')

    # Bottom row: day of week. These panels now show the same placeholder as
    # the top row when a greeting type is absent, instead of empty axes.
    _draw_count_panel(axes[1,0], gm_panels[1], 'gold',
                      'Good Morning Messages by Day of Week', 'Day of Week', 45,
                      'No Good Morning messages found')
    _draw_count_panel(axes[1,1], gn_panels[1], 'navy',
                      'Good Night Messages by Day of Week', 'Day of Week', 45,
                      'No Good Night messages found')

    plt.tight_layout()
    plt.show()

In [None]:

# Weekly patterns analysis
# Defined before both branches: it used to live inside the Good Morning branch,
# so the Good Night branch raised NameError when there were no Good Morning messages.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


def _report_weekday_pattern(messages, greeting_total, short_name):
    """Print per-weekday counts and shares for one greeting type.

    Returns:
        (weekday_stats, most_day, least_day): the Monday-to-Sunday counts and
        the weekdays with the highest and lowest count.
    """
    weekday_stats = messages['weekday'].value_counts().reindex(weekday_order, fill_value=0)

    for day, count in weekday_stats.items():
        percentage = (count / greeting_total) * 100
        print(f"  {day}: {count:,} messages ({percentage:.1f}%)")

    most_day = weekday_stats.idxmax()
    least_day = weekday_stats.idxmin()
    print(f"  Most {short_name} messages: {most_day} ({weekday_stats.max()} messages)")
    print(f"  Least {short_name} messages: {least_day} ({weekday_stats.min()} messages)")
    return weekday_stats, most_day, least_day


if total_gm > 0:
    print("\n=== WEEKLY PATTERNS FOR GOOD MORNING ===")
    gm_weekday_stats, most_gm_day, least_gm_day = _report_weekday_pattern(good_morning_msgs, total_gm, "GM")

if total_gn > 0:
    print("\n=== WEEKLY PATTERNS FOR GOOD NIGHT ===")
    gn_weekday_stats, most_gn_day, least_gn_day = _report_weekday_pattern(good_night_msgs, total_gn, "GN")


In [None]:

# Advanced insights
def _share_of(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0 when ``whole`` is 0."""
    return (part / whole * 100) if whole > 0 else 0


def _mean_clock_time(timestamps):
    """Return the circular mean time of day of ``timestamps`` as ``(hour, minute)``.

    Times are averaged as angles on a 24-hour circle, so 23:00 and 01:00
    average to 00:00. A plain arithmetic mean would report 12:00, which is
    badly wrong for greetings sent around midnight.
    """
    hours = timestamps.dt.hour + timestamps.dt.minute / 60
    angles = hours.to_numpy(dtype=float) * (2 * np.pi / 24)
    mean_angle = np.arctan2(np.sin(angles).mean(), np.cos(angles).mean())
    # arctan2 yields (-pi, pi]; wrap to [0, 2*pi) before converting back to minutes
    minutes_per_day = 24 * 60
    total_minutes = int(round((mean_angle % (2 * np.pi)) * minutes_per_day / (2 * np.pi))) % minutes_per_day
    return divmod(total_minutes, 60)


print("\n=== ADVANCED GREETING INSIGHTS ===")

# Calculate greeting consistency (share of greeting days that have both GM and GN)
if total_gm > 0 and total_gn > 0:
    gm_days = set(good_morning_msgs['date'])
    gn_days = set(good_night_msgs['date'])
    both_days = gm_days.intersection(gn_days)
    any_greeting_days = gm_days.union(gn_days)

    print(f"Days with Good Morning messages: {len(gm_days):,}")
    print(f"Days with Good Night messages: {len(gn_days):,}")
    print(f"Days with both GM and GN: {len(both_days):,}")

    if len(any_greeting_days) > 0:
        consistency_rate = len(both_days) / len(any_greeting_days)
        print(f"Greeting consistency rate: {consistency_rate:.1%}")

# User-specific patterns
for user in df['User'].unique():
    user_gm_msgs = good_morning_msgs[good_morning_msgs['User'] == user]
    user_gn_msgs = good_night_msgs[good_night_msgs['User'] == user]
    user_gm = len(user_gm_msgs)
    user_gn = len(user_gn_msgs)
    user_total = len(df[df['User'] == user])

    print(f"\n{user} greeting patterns:")
    # _share_of guards user_total == 0 (a missing user name matches no rows),
    # consistent with the guarded per-user percentages printed earlier
    print(f"  Good Morning: {user_gm:,} ({_share_of(user_gm, user_total):.2f}% of their messages)")
    print(f"  Good Night: {user_gn:,} ({_share_of(user_gn, user_total):.2f}% of their messages)")

    if user_gm > 0 and user_gn > 0:
        # Average time of day for each greeting, printed as HH:MM
        gm_hour, gm_minute = _mean_clock_time(user_gm_msgs['datetime'])
        gn_hour, gn_minute = _mean_clock_time(user_gn_msgs['datetime'])

        print(f"  Average GM time: {gm_hour:02d}:{gm_minute:02d}")
        print(f"  Average GN time: {gn_hour:02d}:{gn_minute:02d}")

print("\n" + "="*60)
print("GOOD MORNING & GOOD NIGHT ANALYSIS COMPLETE!")
print("="*60)

## 9. Named Entity Recognition Analysis

This section identifies and analyzes the most frequently mentioned named entities (people, places, organizations, etc.) in the conversation.

In [None]:

import spacy
from collections import defaultdict, Counter

# Load the spaCy pipeline used by the entity-extraction cells below.
# NOTE(review): this cell only loads the model; nothing here installs it, despite
# the wording of the message printed below. Presumably `en_core_web_sm` must
# already be available in the environment. TODO confirm the setup instructions.
nlp = spacy.load("en_core_web_sm")
print("✅ spaCy English model installed and loaded!")

In [None]:
def extract_entities(text):
    """Extract named entities from text using spaCy.

    Args:
        text: Message content. Missing values (per ``pd.isna``) and values
            whose string form is blank yield an empty list. Non-string values
            are coerced with ``str`` instead of raising on ``.strip()``.

    Returns:
        A list of dicts with keys ``'text'`` (the stripped entity text),
        ``'label'`` (the spaCy entity label) and ``'description'`` (the result
        of ``spacy.explain`` for that label). Entities of two characters or
        fewer and a few pronouns are dropped. Any error raised while
        processing results in an empty list.
    """
    if pd.isna(text):
        return []

    # Coerce before using str methods: a non-string cell (e.g. a number) has
    # no .strip() and used to raise AttributeError here, outside the try block.
    raw_text = str(text)
    if raw_text.strip() == '':
        return []

    # Common false positives to discard
    ignored_entities = {'you', 'me', 'i', 'we', 'us', 'them'}

    try:
        # Process text with spaCy
        doc = nlp(raw_text)
        entities = []

        for ent in doc.ents:
            entity_text = ent.text.strip()
            # Filter out very short entities and common false positives
            if len(entity_text) > 2 and entity_text.lower() not in ignored_entities:
                entities.append({
                    'text': entity_text,
                    'label': ent.label_,
                    'description': spacy.explain(ent.label_)
                })

        return entities
    except Exception:
        # Deliberate best-effort: one message that fails to process must not
        # abort the whole extraction run.
        return []

print("=== NAMED ENTITY RECOGNITION ANALYSIS ===")
print("Processing messages for named entities... This may take a few minutes for large datasets.")

# Accumulators: every entity found, plus the same entities grouped by sender and by label
all_entities = []
user_entities = defaultdict(list)
entity_by_type = defaultdict(list)

# A progress line is printed after every batch_size messages
batch_size = 1000
total_messages = len(df)
processed = 0

for processed, (sender, message) in enumerate(zip(df['User'], df['text_message']), start=1):
    for entity in extract_entities(message):
        all_entities.append(entity)
        user_entities[sender].append(entity)
        entity_by_type[entity['label']].append(entity)

    if processed % batch_size == 0:
        print(f"Processed {processed:,}/{total_messages:,} messages ({(processed/total_messages)*100:.1f}%)")

print(f"✅ Completed processing {total_messages:,} messages!")

# Frequency of each entity (case-insensitive) and of each entity label
entity_counts = Counter(ent['text'].lower() for ent in all_entities)
entity_type_counts = Counter(ent['label'] for ent in all_entities)


In [None]:

print("\n=== ENTITY STATISTICS ===")
print(f"Total entities found: {len(all_entities):,}")
print(f"Unique entities: {len(entity_counts):,}")
print(f"Entity types found: {len(entity_type_counts)}")

# One pass over all entities builds both lookups used below, instead of
# re-scanning the full list for every top entity and every category.
labels_by_entity = defaultdict(set)      # lower-cased entity text -> labels it was tagged with
counts_by_type = defaultdict(Counter)    # label -> Counter of lower-cased entity texts
for ent in all_entities:
    entity_key = ent['text'].lower()
    labels_by_entity[entity_key].add(ent['label'])
    counts_by_type[ent['label']][entity_key] += 1

# Show entity type distribution
print("\n=== ENTITY TYPES DISTRIBUTION ===")
for entity_type, count in entity_type_counts.most_common():
    description = spacy.explain(entity_type) or entity_type
    percentage = (count / len(all_entities)) * 100
    print(f"{entity_type} ({description}): {count:,} ({percentage:.1f}%)")

# Most mentioned entities overall
print("\n=== TOP 30 MOST MENTIONED ENTITIES (OVERALL) ===")
for i, (entity, count) in enumerate(entity_counts.most_common(30), 1):
    # Sorted so the label list prints in the same order on every run
    # (iteration order of a set of strings varies between sessions).
    type_str = ', '.join(sorted(labels_by_entity[entity]))
    print(f"{i:2d}. {entity.title():<20} : {count:,} mentions ({type_str})")

# Entities by user
print("\n=== TOP ENTITIES BY USER ===")
for user in df['User'].unique():
    user_entity_total = len(user_entities[user])
    user_entity_counts = Counter(ent['text'].lower() for ent in user_entities[user])
    print(f"\n{user} - Top 15 entities:")

    for i, (entity, count) in enumerate(user_entity_counts.most_common(15), 1):
        percentage = (count / user_entity_total) * 100 if user_entity_total > 0 else 0
        print(f"  {i:2d}. {entity.title():<20} : {count:,} mentions ({percentage:.1f}%)")

# Show entities by category
print("\n=== ENTITIES BY CATEGORY ===")
important_types = ['PERSON', 'GPE', 'ORG', 'PRODUCT', 'EVENT', 'FAC', 'LOC']

for entity_type in important_types:
    if entity_type in entity_type_counts:
        type_entity_counts = counts_by_type[entity_type]

        description = spacy.explain(entity_type) or entity_type
        print(f"\n{entity_type} ({description}) - Top 10:")

        for i, (entity, count) in enumerate(type_entity_counts.most_common(10), 1):
            print(f"  {i:2d}. {entity.title():<25} : {count:,} mentions")

In [None]:
# Named Entity Word Cloud Visualizations
from wordcloud import WordCloud
import matplotlib.pyplot as plt

section_rule = "=" * 70
print("\n" + section_rule)
print("NAMED ENTITY WORD CLOUD VISUALIZATIONS")
print(section_rule)

# Nothing is drawn when no entity was counted.
if entity_counts:
    # Single-panel figure holding the combined cloud
    fig, axes = plt.subplots(1, 1, figsize=(12, 8))

    # Word sizes come straight from the entity frequency table;
    # random_state pins the layout so re-runs look the same.
    overall_cloud_options = dict(
        width=800,
        height=400,
        background_color='white',
        colormap='viridis',
        max_words=100,
        relative_scaling=0.5,
        random_state=42,
    )
    overall_wordcloud = WordCloud(**overall_cloud_options).generate_from_frequencies(entity_counts)

    axes.imshow(overall_wordcloud, interpolation='bilinear')
    axes.set_title('All Named Entities (Combined)', fontsize=16, fontweight='bold')
    axes.axis('off')

    plt.tight_layout()
    plt.show()


In [None]:
# One named-entity word cloud per chat participant, side by side
if entity_counts:
    users = list(df['User'].unique())

    # One column per user; the figure grows wider with the user count
    fig, axes = plt.subplots(1, len(users), figsize=(12 * len(users), 8))

    # With a single panel plt.subplots hands back a bare Axes, so wrap it
    # to keep the indexing below uniform
    if len(users) == 1:
        axes = [axes]

    for i, user in enumerate(users):
        panel = axes[i]
        user_entity_counts = Counter(ent['text'].lower() for ent in user_entities[user])

        if not user_entity_counts:
            # Placeholder panel for a user without any extracted entity
            panel.text(0.5, 0.5, f'No entities found for {user}',
                       ha='center', va='center', transform=panel.transAxes)
            panel.set_title(f'{user} - Named Entities')
            continue

        # First user gets 'plasma', every other user 'cool'
        user_colormap = 'plasma' if i == 0 else 'cool'
        user_wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap=user_colormap,
            max_words=50,
            relative_scaling=0.5,
            random_state=42,
        ).generate_from_frequencies(user_entity_counts)

        panel.imshow(user_wordcloud, interpolation='bilinear')
        panel.set_title(f'{user} - Named Entities', fontsize=16, fontweight='bold')
        panel.axis('off')

    plt.tight_layout()
    plt.show()


In [None]:
# Word clouds restricted to one entity label each (people, places, organisations)
if entity_counts:
    # NOTE: this rebinds the notebook-level `important_types`; the advanced
    # entity analysis cell further down iterates over the same name.
    important_types = ['PERSON', 'GPE', 'ORG']
    available_types = [label for label in important_types if label in entity_type_counts]

    if not available_types:
        print("No major entity types found for visualization.")
    else:
        panel_count = len(available_types)
        fig, axes = plt.subplots(1, panel_count, figsize=(8 * panel_count, 8))

        # A single panel comes back as a bare Axes; wrap it so axes[i] works
        if panel_count == 1:
            axes = [axes]

        # Colormaps are assigned round-robin to the panels
        colors = ['Set1', 'Set2', 'Set3']

        for i, entity_type in enumerate(available_types):
            panel = axes[i]
            type_entity_counts = Counter(
                ent['text'].lower() for ent in all_entities if ent['label'] == entity_type
            )

            if not type_entity_counts:
                panel.text(0.5, 0.5, f'No {entity_type} entities found',
                           ha='center', va='center', transform=panel.transAxes)
                panel.set_title(f'{entity_type} Entities')
                continue

            type_wordcloud = WordCloud(
                width=800,
                height=400,
                background_color='white',
                colormap=colors[i % len(colors)],
                max_words=30,
                relative_scaling=0.5,
                random_state=42,
            ).generate_from_frequencies(type_entity_counts)

            # Fall back to the raw label when spaCy has no explanation for it
            description = spacy.explain(entity_type) or entity_type
            panel.imshow(type_wordcloud, interpolation='bilinear')
            panel.set_title(f'{entity_type} - {description}', fontsize=14, fontweight='bold')
            panel.axis('off')

        plt.tight_layout()
        plt.show()

In [None]:
# Entity frequency charts: a 2x2 grid with overall top entities, label
# distribution, a two-user comparison and a monthly mention timeline.
if len(all_entities) > 0:
    # Resolve the participants here instead of relying on a `users` variable
    # left behind by an earlier cell.
    users = list(df['User'].unique())

    fig, axes = plt.subplots(2, 2, figsize=(20, 12))

    # Top entities overall (most_common keeps them in descending order)
    top_entities = dict(entity_counts.most_common(20))
    if top_entities:
        axes[0,0].barh(range(len(top_entities)), list(top_entities.values()), color='skyblue')
        axes[0,0].set_title('Top 20 Most Mentioned Entities (Overall)')
        axes[0,0].set_xlabel('Mention Count')
        axes[0,0].set_ylabel('Entities')
        axes[0,0].set_yticks(range(len(top_entities)))
        axes[0,0].set_yticklabels([entity.title() for entity in top_entities.keys()])
        # Largest bar on top
        axes[0,0].invert_yaxis()

    # Entity types distribution
    if entity_type_counts:
        axes[0,1].bar(range(len(entity_type_counts)), list(entity_type_counts.values()),
                     color='lightgreen', alpha=0.8)
        axes[0,1].set_title('Entity Types Distribution')
        axes[0,1].set_xlabel('Entity Type')
        axes[0,1].set_ylabel('Count')
        axes[0,1].set_xticks(range(len(entity_type_counts)))
        axes[0,1].set_xticklabels(list(entity_type_counts.keys()), rotation=45)

    # User comparison (only the first two users are compared)
    if len(users) >= 2:
        user1_entities = Counter(ent['text'].lower() for ent in user_entities[users[0]])
        user2_entities = Counter(ent['text'].lower() for ent in user_entities[users[1]])

        # Union of each user's 10 most frequent entities. The previous version
        # sliced the Counter keys, which are in insertion order rather than
        # frequency order, and then truncated an unordered set.
        all_top_entities = (
            {entity for entity, _ in user1_entities.most_common(10)}
            | {entity for entity, _ in user2_entities.most_common(10)}
        )
        # Rank by combined mentions (name as tie-breaker) so the chart is
        # deterministic, then keep at most 15 bars.
        common_entities = sorted(
            all_top_entities,
            key=lambda entity: (-(user1_entities[entity] + user2_entities[entity]), entity),
        )[:15]

        user1_counts = [user1_entities.get(entity, 0) for entity in common_entities]
        user2_counts = [user2_entities.get(entity, 0) for entity in common_entities]

        x = np.arange(len(common_entities))
        width = 0.35

        # Grouped bars: one pair per entity, offset by half a bar width
        axes[1,0].bar(x - width/2, user1_counts, width, label=users[0], alpha=0.8)
        axes[1,0].bar(x + width/2, user2_counts, width, label=users[1], alpha=0.8)
        axes[1,0].set_title('Top Entities Comparison by User')
        axes[1,0].set_xlabel('Entities')
        axes[1,0].set_ylabel('Mention Count')
        axes[1,0].set_xticks(x)
        # Labels are cut to 10 characters to limit overlap
        axes[1,0].set_xticklabels([entity.title()[:10] for entity in common_entities], rotation=45)
        axes[1,0].legend()

    # Timeline of entity mentions (monthly)
    # NOTE(review): this re-runs extract_entities over every message, which
    # is presumably as expensive as the original extraction pass — consider
    # recording the month during that pass if it becomes a bottleneck.
    entity_timeline = defaultdict(int)

    for idx, row in df.iterrows():
        # Skip empty and missing message text
        if row['text_message'] and not pd.isna(row['text_message']):
            entities_in_msg = extract_entities(row['text_message'])
            if entities_in_msg:
                month_year = row['datetime'].strftime('%Y-%m')
                entity_timeline[month_year] += len(entities_in_msg)

    if entity_timeline:
        # 'YYYY-MM' keys sort chronologically as plain strings
        timeline_sorted = dict(sorted(entity_timeline.items()))
        axes[1,1].plot(list(timeline_sorted.keys()), list(timeline_sorted.values()),
                      marker='o', linewidth=2, markersize=4)
        axes[1,1].set_title('Named Entity Mentions Over Time')
        axes[1,1].set_xlabel('Month')
        axes[1,1].set_ylabel('Total Entity Mentions')
        axes[1,1].tick_params(axis='x', rotation=45)
        axes[1,1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

else:
    print("❌ No named entities found in the dataset.")


In [None]:
# Advanced entity analysis: per-user diversity and the leading entity per label
print(f"\n=== ADVANCED ENTITY INSIGHTS ===")

if all_entities:
    # Entity diversity by user: unique entity texts / total entity mentions
    for user in users:
        user_mentions = [ent['text'].lower() for ent in user_entities[user]]
        user_entity_count = len(user_mentions)
        user_unique_entities = len(set(user_mentions))

        if user_entity_count > 0:
            diversity = user_unique_entities / user_entity_count
        else:
            diversity = 0

        print(f"{user} entity usage:")
        print(f"  Total entity mentions: {user_entity_count:,}")
        print(f"  Unique entities mentioned: {user_unique_entities:,}")
        print(f"  Entity diversity ratio: {diversity:.3f}")

    # Most mentioned entity per category
    # (uses whatever `important_types` currently holds in the notebook)
    print(f"\n=== TOP ENTITY PER CATEGORY ===")
    for entity_type in important_types:
        if entity_type not in entity_type_counts:
            continue

        type_entities = [ent['text'].lower() for ent in all_entities if ent['label'] == entity_type]
        if not type_entities:
            continue

        # (text, mentions) pair of the single most frequent entity
        top_entity = Counter(type_entities).most_common(1)[0]
        description = spacy.explain(entity_type) or entity_type
        print(f"{entity_type} ({description}): '{top_entity[0].title()}' ({top_entity[1]} mentions)")

closing_rule = "=" * 70
print("\n" + closing_rule)
print("NAMED ENTITY ANALYSIS COMPLETE!")
print(closing_rule)

## 10. Laughter Analysis 😂

This section analyzes laughter patterns, counting "lol", "lmao", "lmfao", and "rofl" variants as well as "ha" chains (e.g. "hahaha"), to discover who laughed the most and when.

In [None]:
# Laughter Analysis - Detecting various forms of laughter
import re
from collections import Counter, defaultdict

def detect_laughter_patterns(text):
    """Count laughter expressions in a single message.

    Matching is case-insensitive (the text is lower-cased first) and every
    pattern is anchored on word boundaries, so laughter embedded in a longer
    word is not counted.

    Args:
        text: Message text. Missing values (``None`` / ``NaN``) are accepted.

    Returns:
        dict with the following keys, always all present:
            lol, lmao, lmfao, rofl: number of matches of the listed variants.
            ha_chains: number of standalone "ha" chains with 2+ repetitions.
            ha_chain_lengths: list with the repetition count of each chain.
            max_ha_chain: longest chain in the message (0 when there is none).
            total_laughter: lol + lmao + lmfao + ha_chains + rofl.

        Missing text yields the same structure with zero counts, so callers
        can index the result without checking for an empty dict first.
    """
    # Key order is kept stable because callers spread this dict into
    # records that become DataFrame columns.
    laughter_patterns = {
        'lol': 0,
        'lmao': 0,
        'lmfao': 0,
        'ha_chains': 0,
        'ha_chain_lengths': [],
        'max_ha_chain': 0,
        'rofl': 0,
        'total_laughter': 0,
    }

    if pd.isna(text):
        return laughter_patterns

    text = str(text).lower()

    # Fixed-spelling variants: each pattern is a whole-word alternation.
    word_patterns = {
        'lol': r'\b(?:lol|lols|lolol|lololo)\b',
        'lmao': r'\b(?:lmao|lmaoo|lmaooo)\b',
        'lmfao': r'\b(?:lmfao|lmfaoo|lmfaooo)\b',
        'rofl': r'\b(?:rofl|roflmao|rotfl)\b',
    }
    for key, pattern in word_patterns.items():
        laughter_patterns[key] = len(re.findall(pattern, text))

    # HA chains (haha, hahaha, ...): scan once and derive the count from the
    # lengths. A lone "ha" is matched by the regex but filtered out below.
    ha_chain_lengths = []
    for match in re.finditer(r'\b(?:ha)+\b', text):
        chain_length = len(match.group()) // 2  # Each "ha" is 2 characters
        if chain_length >= 2:
            ha_chain_lengths.append(chain_length)

    laughter_patterns['ha_chains'] = len(ha_chain_lengths)
    laughter_patterns['ha_chain_lengths'] = ha_chain_lengths
    laughter_patterns['max_ha_chain'] = max(ha_chain_lengths) if ha_chain_lengths else 0

    # Total laughter instances across all categories
    laughter_patterns['total_laughter'] = (
        laughter_patterns['lol']
        + laughter_patterns['lmao']
        + laughter_patterns['lmfao']
        + laughter_patterns['ha_chains']
        + laughter_patterns['rofl']
    )

    return laughter_patterns


In [None]:

print("=== LAUGHTER ANALYSIS ===")
print("Analyzing laughter patterns in messages...")

# Run laughter detection over every message and aggregate the counts.
#   laughter_data    - one record per message that contains laughter
#   user_laughter    - user  -> pattern -> summed count (plus 'max_ha_chain')
#   monthly_laughter - 'YYYY-MM' -> pattern -> summed count
laughter_data = []
user_laughter = defaultdict(lambda: defaultdict(int))
monthly_laughter = defaultdict(lambda: defaultdict(int))

# Counters that are summed per user and per month
summed_patterns = ['lol', 'lmao', 'lmfao', 'ha_chains', 'rofl', 'total_laughter']

for idx, row in df.iterrows():
    laughter = detect_laughter_patterns(row['text_message'])

    # Read the total defensively: a result without 'total_laughter' (e.g. an
    # empty dict for missing message text) counts as "no laughter" instead
    # of raising KeyError.
    if laughter.get('total_laughter', 0) <= 0:
        continue

    # Store message info with laughter data
    laughter_data.append({
        'message_id': idx,
        'user': row['User'],
        'date': row['date'],
        'datetime': row['datetime'],
        'message': row['text_message'],
        **laughter
    })

    # Aggregate by user
    user_totals = user_laughter[row['User']]
    for pattern in summed_patterns:
        user_totals[pattern] += laughter[pattern]

    # Track the longest ha chain seen for this user
    if laughter['max_ha_chain'] > user_totals['max_ha_chain']:
        user_totals['max_ha_chain'] = laughter['max_ha_chain']

    # Aggregate by month
    month_key = row['datetime'].strftime('%Y-%m')
    month_totals = monthly_laughter[month_key]
    for pattern in summed_patterns:
        month_totals[pattern] += laughter[pattern]


In [None]:

# Convert to DataFrame for easier analysis.
# With no laughter at all this is an empty frame WITHOUT columns, so every
# column access below is guarded.
laughter_df = pd.DataFrame(laughter_data)

print(f"Messages with laughter: {len(laughter_df):,}")
print(f"Total laughter instances: {sum(entry['total_laughter'] for entry in laughter_data):,}")

# Overall laughter statistics
print(f"\n=== OVERALL LAUGHTER STATISTICS ===")
laughter_types = ['lol', 'lmao', 'lmfao', 'ha_chains', 'rofl']
total_stats = {
    pattern: sum(entry[pattern] for entry in laughter_data)
    for pattern in laughter_types
}
grand_total = sum(total_stats.values())

for pattern, count in sorted(total_stats.items(), key=lambda x: x[1], reverse=True):
    # Avoid ZeroDivisionError when the chat contains no laughter at all
    percentage = (count / grand_total) * 100 if grand_total > 0 else 0
    print(f"{pattern.upper():<10}: {count:>5,} ({percentage:>5.1f}%)")

# Laughter by user
print(f"\n=== LAUGHTER BY USER ===")
for user in df['User'].unique():
    user_total = user_laughter[user]['total_laughter']
    user_messages = len(df[df['User'] == user])
    # NOTE: this is laughter INSTANCES per 100 messages; a message holding
    # several laughs counts several times, so the value can exceed 100.
    laughter_rate = (user_total / user_messages) * 100 if user_messages > 0 else 0

    longest_chain = user_laughter[user]['max_ha_chain']
    longest_chain_text = 'ha' * longest_chain if longest_chain > 0 else 'None'

    print(f"\n{user}:")
    print(f"  Total laughter: {user_total:,}")
    print(f"  Laughter rate: {laughter_rate:.2f}% of messages")
    print(f"  Longest ha chain: {longest_chain_text} ({longest_chain} has)")

    for pattern in laughter_types:
        count = user_laughter[user][pattern]
        if count > 0:
            pattern_pct = (count / user_total) * 100 if user_total > 0 else 0
            print(f"    {pattern}: {count:,} ({pattern_pct:.1f}%)")

# Find the funniest messages (most laughter in one message)
if len(laughter_df) > 0:
    print(f"\n=== FUNNIEST MESSAGES (Most Laughter) ===")
    funniest = laughter_df.nlargest(10, 'total_laughter')

    for i, (idx, row) in enumerate(funniest.iterrows(), 1):
        message_text = str(row['message'])
        # Long messages are cut to a 100-character preview
        preview = message_text[:100] + "..." if len(message_text) > 100 else message_text
        print(f"{i:2d}. {row['user']} - {row['date']} ({row['total_laughter']} laughs)")
        print(f"    {preview}")
        print()

# Peak laughter periods
if monthly_laughter:
    print(f"\n=== PEAK LAUGHTER PERIODS ===")
    monthly_totals = {month: data['total_laughter'] for month, data in monthly_laughter.items()}
    sorted_months = sorted(monthly_totals.items(), key=lambda x: x[1], reverse=True)

    print("Top 10 funniest months:")
    for i, (month, total) in enumerate(sorted_months[:10], 1):
        print(f"{i:2d}. {month}: {total:,} laughs")

# Sample messages for each laughter type
print(f"\n=== SAMPLE MESSAGES BY LAUGHTER TYPE ===")
# Guarded: an empty laughter_df has no pattern columns to filter on.
if len(laughter_df) > 0:
    for pattern in ['lol', 'lmao', 'lmfao', 'ha_chains']:
        pattern_messages = laughter_df[laughter_df[pattern] > 0]
        if len(pattern_messages) > 0:
            print(f"\nSample {pattern.upper()} messages:")
            # Fixed seed so the same samples are shown on every run
            samples = pattern_messages.sample(min(3, len(pattern_messages)), random_state=42)
            for i, (idx, row) in enumerate(samples.iterrows(), 1):
                message_text = str(row['message'])
                preview = message_text[:120] + "..." if len(message_text) > 120 else message_text
                print(f"{i}. {row['user']} ({row['date']}): {preview}")

In [None]:
# Interactive Laughter Visualizations
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

section_rule = "=" * 70
print("\n" + section_rule)
print("INTERACTIVE LAUGHTER VISUALIZATIONS")
print(section_rule)

if len(laughter_df) > 0:

    users = list(df['User'].unique())

    # Display label -> key used in the aggregated laughter counters.
    # 'Total' is kept last so the derived tables list it after the types.
    type_keys = {
        'LOL': 'lol',
        'LMAO': 'lmao',
        'LMFAO': 'lmfao',
        'HA Chains': 'ha_chains',
        'ROFL': 'rofl',
    }
    column_keys = {**type_keys, 'Total': 'total_laughter'}

    # 1. Laughter patterns comparison by user
    user_laughter_data = [
        {'User': user, **{label: user_laughter[user][key] for label, key in column_keys.items()}}
        for user in users
    ]
    user_df = pd.DataFrame(user_laughter_data)

    # Long format (one row per user and laughter type) for the stacked bars
    user_long_df = user_df.melt(
        id_vars=['User'],
        value_vars=['LOL', 'LMAO', 'LMFAO', 'HA Chains', 'ROFL'],
        var_name='Laughter Type',
        value_name='Count',
    )
    fig1 = px.bar(
        user_long_df,
        x='User', y='Count', color='Laughter Type',
        title='Laughter Patterns by User (Interactive)',
        labels={'Count': 'Number of Instances'},
        color_discrete_sequence=px.colors.qualitative.Set2
    )
    fig1.update_layout(height=500, showlegend=True)
    fig1.show()

    # 2. Laughter over time - Monthly trends
    if monthly_laughter:
        monthly_df_data = [
            {'Month': month, **{label: data[key] for label, key in column_keys.items()}}
            for month, data in monthly_laughter.items()
        ]
        # 'YYYY-MM' strings sort chronologically
        monthly_df = pd.DataFrame(monthly_df_data).sort_values('Month')

        # One line per laughter type
        fig2 = go.Figure()

        colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7']
        patterns = ['LOL', 'LMAO', 'LMFAO', 'HA Chains', 'ROFL']

        for line_color, pattern in zip(colors, patterns):
            trace = go.Scatter(
                x=monthly_df['Month'],
                y=monthly_df[pattern],
                mode='lines+markers',
                name=pattern,
                line=dict(color=line_color, width=3),
                marker=dict(size=8)
            )
            fig2.add_trace(trace)

        fig2.update_layout(
            title='Laughter Trends Over Time (Interactive)',
            xaxis_title='Month',
            yaxis_title='Number of Instances',
            hovermode='x unified',
            height=500,
            showlegend=True
        )
        fig2.show()

        # Total laughter trend
        fig3 = px.line(
            monthly_df, x='Month', y='Total',
            title='Total Laughter Over Time',
            labels={'Total': 'Total Laughter Instances'},
            markers=True
        )
        fig3.update_traces(line_color='#FF6B6B', line_width=4, marker_size=10)
        fig3.update_layout(height=400)
        fig3.show()

    # 3. Laughter intensity analysis (ha chain lengths)
    # One record per individual ha chain; messages without chains add nothing.
    ha_chain_data = [
        {
            'User': row['user'],
            'Chain Length': chain_length,
            'Date': row['date'],
            'Month': row['datetime'].strftime('%Y-%m')
        }
        for _, row in laughter_df.iterrows()
        for chain_length in (row['ha_chain_lengths'] or [])
    ]

    if ha_chain_data:
        ha_df = pd.DataFrame(ha_chain_data)
        chain_axis_labels = {'Chain Length': 'Number of "HA"s in Chain'}

        # Histogram of ha chain lengths by user
        fig4 = px.histogram(
            ha_df, x='Chain Length', color='User',
            title='HA Chain Length Distribution by User',
            labels=chain_axis_labels,
            nbins=20,
            opacity=0.7
        )
        fig4.update_layout(height=400)
        fig4.show()

        # Box plot of ha chain lengths
        fig5 = px.box(
            ha_df, x='User', y='Chain Length',
            title='HA Chain Length Distribution (Box Plot)',
            labels=chain_axis_labels
        )
        fig5.update_layout(height=400)
        fig5.show()

    # 4. Laughter rate vs total messages
    user_stats = []
    for user in users:
        message_count = len(df[df['User'] == user])
        laugh_count = user_laughter[user]['total_laughter']
        if message_count > 0:
            rate_pct = (laugh_count / message_count) * 100
        else:
            rate_pct = 0

        user_stats.append({
            'User': user,
            'Total Messages': message_count,
            'Total Laughter': laugh_count,
            'Laughter Rate (%)': rate_pct
        })

    stats_df = pd.DataFrame(user_stats)

    # Scatter plot: Total Messages vs Laughter Rate, bubble size = laughter volume
    fig6 = px.scatter(
        stats_df, x='Total Messages', y='Laughter Rate (%)',
        size='Total Laughter', color='User', hover_name='User',
        title='Message Volume vs Laughter Rate',
        labels={'Laughter Rate (%)': 'Laughter Rate (% of messages)'}
    )
    fig6.update_layout(height=400)
    fig6.show()

    # 5. Daily laughter patterns
    # Sum the laughter per calendar day, then keep the 20 busiest days
    daily_laughter = laughter_df.groupby('date')['total_laughter'].sum().reset_index()
    daily_laughter['date'] = pd.to_datetime(daily_laughter['date'])
    top_funny_days = daily_laughter.nlargest(20, 'total_laughter')

    fig7 = px.bar(
        top_funny_days, x='date', y='total_laughter',
        title='Top 20 Funniest Days',
        labels={'total_laughter': 'Total Laughter Instances', 'date': 'Date'}
    )
    fig7.update_layout(height=500, xaxis_tickangle=-45)
    fig7.show()

    # 6. Pie chart of laughter distribution
    total_by_type = {
        label: sum(user_laughter[user][key] for user in users)
        for label, key in type_keys.items()
    }

    # Drop laughter types that never occurred
    total_by_type = {label: total for label, total in total_by_type.items() if total > 0}

    if total_by_type:
        fig8 = px.pie(
            values=list(total_by_type.values()),
            names=list(total_by_type.keys()),
            title='Distribution of Laughter Types'
        )
        fig8.update_layout(height=400)
        fig8.show()

else:
    print("No laughter patterns found in the dataset.")

print("\n" + section_rule)
print("LAUGHTER ANALYSIS COMPLETE!")
print(section_rule)