# KAGR Case Competition Analysis - Midwest State University Athletic Revenue

**Data Source:** 2025 KODING with KAGR Case Competition Dataset

**Analysis Goal:** Analyze athletic event revenue streams and provide strategic recommendations for revenue optimization.

---

## 1. Setup and Installation

Install required libraries and import dependencies

In [None]:
# Install required packages (uncomment if running in Colab)
# statsmodels is needed for the OLS trendline drawn in Visualization 14
# !pip install pandas numpy matplotlib seaborn openpyxl plotly statsmodels -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from datetime import datetime
import os

# Notebook-wide presentation defaults: silence warnings, let pandas render
# every column (and up to 100 rows), and pick the matplotlib/seaborn styling.
warnings.filterwarnings('ignore')
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Report the core library versions so environment differences are easy to spot
print("‚úÖ All libraries loaded successfully!")
for library_label, library in (("Pandas", pd), ("NumPy", np)):
    print(f"{library_label} version: {library.__version__}")

## 2. Data Loading

Load data from the Excel file (works for both local and Colab)

In [None]:
# Load the two worksheets of the competition workbook into DataFrames:
#   sports_df - sheet 'midwest_state_sports'
#   survey_df - sheet 'Customer Experience Survey'
#
# File upload for Google Colab
# Uncomment these lines if running in Colab:

# from google.colab import files
# uploaded = files.upload()
# file_path = list(uploaded.keys())[0]

# For local execution or if file is already in Colab environment:
file_path = "../data/2025 KODING with KAGR Case Competition_Dataset.xlsx"

# Alternative: Use absolute path if needed
# file_path = "/content/2025 KODING with KAGR Case Competition_Dataset.xlsx"

try:
    # Keep the try body limited to the reads that can actually fail
    sports_df = pd.read_excel(file_path, sheet_name='midwest_state_sports')
    survey_df = pd.read_excel(file_path, sheet_name='Customer Experience Survey')
except FileNotFoundError:
    print("‚ùå Error: File not found. Please upload the Excel file or check the path.")
    print("   For Colab: Uncomment the file upload section above.")
    # Every later cell needs sports_df/survey_df. Stop here instead of letting
    # the notebook continue into an unrelated NameError (or silently reuse
    # stale frames left over from an earlier run of this cell).
    raise
else:
    print("‚úÖ Data loaded successfully!")
    print(f"\nSports Data: {sports_df.shape[0]} rows, {sports_df.shape[1]} columns")
    print(f"Survey Data: {survey_df.shape[0]} rows, {survey_df.shape[1]} columns")

## 3. Data Exploration

Initial exploration of the datasets

In [None]:
# Preview the leading records of the event-level sports dataset
banner = "=" * 80
print(banner)
print("SPORTS EVENT DATA - First 5 Rows")
print(banner)
sports_preview = sports_df.head()
display(sports_preview)

In [None]:
# Display column information (names, non-null counts, dtypes) for sports data
print("\n" + "=" * 80)
print("SPORTS DATA - Column Information")
print("=" * 80)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
sports_df.info()

In [None]:
# Preview the leading records of the customer experience survey
banner = "=" * 80
print("\n" + banner)
print("CUSTOMER SURVEY DATA - First 5 Rows")
print(banner)
survey_preview = survey_df.head()
display(survey_preview)

In [None]:
# Display column information (names, non-null counts, dtypes) for survey data
print("\n" + "=" * 80)
print("SURVEY DATA - Column Information")
print("=" * 80)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
survey_df.info()

In [None]:
# Report, per dataset, which columns contain nulls and how many
divider = "=" * 80
print("\n" + divider)
print("MISSING VALUES ANALYSIS")
print(divider)

missing_sports = sports_df.isnull().sum()
missing_survey = survey_df.isnull().sum()

for heading, null_counts in (
    ("\nSports Data Missing Values:", missing_sports),
    ("\nSurvey Data Missing Values:", missing_survey),
):
    print(heading)
    if null_counts.sum() > 0:
        # Only list the columns that actually have gaps
        print(null_counts[null_counts > 0])
    else:
        print("‚úÖ No missing values")

In [None]:
# Descriptive statistics for the sports dataset
rule = "=" * 80
print("\n" + rule)
print("STATISTICAL SUMMARY - Sports Data")
print(rule)
sports_summary = sports_df.describe()
display(sports_summary)

## 4. Data Preparation and Feature Engineering

Calculate derived metrics and prepare data for analysis

In [None]:
# Derive the analysis columns used by the rest of the notebook.
#   sports_df: Total_Revenue, Revenue_per_Attendee, Concession_per_Attendee,
#              Merchandise_per_Attendee, Venue_Utilization
#   survey_df: Age, Age_Group

# Total revenue per event is the sum of the four revenue streams
sports_df['Total_Revenue'] = (sports_df['Ticket_Revenue'] +
                               sports_df['Concession_Revenue'] +
                               sports_df['Merchandise_Revenue'] +
                               sports_df['Parking_Revenue'])

# Per-attendee metrics: a zero attendance becomes NaN so the division yields
# NaN (ignored by later means) instead of inf.
attendance_nonzero = sports_df['Attendance'].replace(0, np.nan)
sports_df['Revenue_per_Attendee'] = sports_df['Total_Revenue'] / attendance_nonzero
sports_df['Concession_per_Attendee'] = sports_df['Concession_Revenue'] / attendance_nonzero
sports_df['Merchandise_per_Attendee'] = sports_df['Merchandise_Revenue'] / attendance_nonzero

# Venue utilization in percent. The capacity gets the same zero guard as
# attendance above; otherwise one zero-capacity row would produce inf and
# turn every downstream utilization average into inf.
capacity_nonzero = sports_df['Venue_Capacity'].replace(0, np.nan)
sports_df['Venue_Utilization'] = (sports_df['Attendance'] / capacity_nonzero) * 100

# Age from birth year in survey data.
# NOTE(review): ages are relative to the year the notebook is executed, not
# the year the survey was taken - confirm this is intended.
current_year = datetime.now().year
survey_df['Age'] = current_year - survey_df['Birth_Year']

# Age groups. pd.cut intervals are right-inclusive, so 25 falls in '18-25' and
# 55 in '46-55'. The last bin is open-ended (np.inf) so respondents older than
# 100 land in '55+' instead of silently becoming NaN and dropping out of the
# age-group charts.
survey_df['Age_Group'] = pd.cut(survey_df['Age'],
                                 bins=[0, 25, 35, 45, 55, np.inf],
                                 labels=['18-25', '26-35', '36-45', '46-55', '55+'])

print("‚úÖ Feature engineering completed!")
print(f"\nNew columns added to sports data: {['Total_Revenue', 'Revenue_per_Attendee', 'Concession_per_Attendee', 'Merchandise_per_Attendee', 'Venue_Utilization']}")
print(f"New columns added to survey data: {['Age', 'Age_Group']}")

---
# VISUALIZATIONS
---

## Visualization 1: Overall Revenue Composition

Breakdown of total athletic revenue by source

In [None]:
# Sum each revenue stream over all events, keyed by its display label
stream_columns = {
    'Ticket Sales': 'Ticket_Revenue',
    'Concessions': 'Concession_Revenue',
    'Merchandise': 'Merchandise_Revenue',
    'Parking': 'Parking_Revenue',
}
revenue_sources = {
    label: sports_df[column].sum() for label, column in stream_columns.items()
}
total_revenue = sum(revenue_sources.values())

# Interactive donut chart of the revenue mix
revenue_donut = go.Pie(
    labels=list(revenue_sources.keys()),
    values=list(revenue_sources.values()),
    hole=0.4,
    marker=dict(colors=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']),
    textinfo='label+percent',
    textfont_size=14,
    hovertemplate='<b>%{label}</b><br>Revenue: $%{value:,.0f}<br>Percentage: %{percent}<extra></extra>',
)
fig = go.Figure(data=[revenue_donut])

chart_title = dict(
    text=f'Total Athletic Revenue Composition<br><sub>Total: ${total_revenue:,.0f}</sub>',
    x=0.5,
    xanchor='center',
)
fig.update_layout(
    title=chart_title,
    font=dict(size=12),
    height=500,
    showlegend=True,
)

fig.show()

# Text version of the same breakdown
print("\nRevenue Breakdown:")
for source, amount in revenue_sources.items():
    share = (amount / total_revenue) * 100
    print(f"{source:15} ${amount:>12,.0f} ({share:5.1f}%)")

## Visualization 2: Revenue by Sport

Comparative analysis of revenue generation across different sports

In [None]:
# Per-sport totals for revenue (overall and by stream) and attendance
summed_columns = [
    'Total_Revenue',
    'Attendance',
    'Ticket_Revenue',
    'Concession_Revenue',
    'Merchandise_Revenue',
    'Parking_Revenue',
]
sport_revenue = sports_df.groupby('Sport').agg(dict.fromkeys(summed_columns, 'sum')).reset_index()

# Ascending order, so the largest bar is drawn at the top of the horizontal chart
sport_revenue = sport_revenue.sort_values('Total_Revenue', ascending=True)

revenue_totals = sport_revenue['Total_Revenue']
revenue_bars = go.Bar(
    y=sport_revenue['Sport'],
    x=revenue_totals,
    orientation='h',
    marker=dict(
        color=revenue_totals,
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title="Revenue ($)"),
    ),
    text=[f'${x:,.0f}' for x in revenue_totals],
    textposition='outside',
    hovertemplate='<b>%{y}</b><br>Total Revenue: $%{x:,.0f}<extra></extra>',
)

# Horizontal bar chart of total revenue per sport
fig = go.Figure()
fig.add_trace(revenue_bars)
fig.update_layout(
    title='Total Revenue by Sport',
    xaxis_title='Total Revenue ($)',
    yaxis_title='Sport',
    height=500,
    showlegend=False,
)

fig.show()

## Visualization 3: Sport Performance Dashboard

Multi-metric comparison across sports (Revenue, Attendance, Utilization, Events)

In [None]:
# Per-sport metrics: total revenue, total attendance, mean utilization and
# the number of events (rows) per sport
sport_metrics = sports_df.groupby('Sport').agg({
    'Total_Revenue': 'sum',
    'Attendance': 'sum',
    'Venue_Utilization': 'mean',
    'Sport': 'count'
}).rename(columns={'Sport': 'Event_Count'}).reset_index()

sport_metrics['Avg_Revenue_per_Event'] = sport_metrics['Total_Revenue'] / sport_metrics['Event_Count']

# Same sport order (highest revenue first) in every panel
sport_metrics_sorted = sport_metrics.sort_values('Total_Revenue', ascending=False)

# 2x2 grid of bar charts
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Total Revenue by Sport', 'Total Attendance by Sport',
                    'Average Venue Utilization', 'Number of Events'),
    specs=[[{'type': 'bar'} for _ in range(2)] for _ in range(2)]
)

# (metric column, bar colour, trace name, hover template, grid row, grid column)
dashboard_panels = [
    ('Total_Revenue', '#FF6B6B', 'Revenue', '%{x}<br>$%{y:,.0f}<extra></extra>', 1, 1),
    ('Attendance', '#4ECDC4', 'Attendance', '%{x}<br>%{y:,} attendees<extra></extra>', 1, 2),
    ('Venue_Utilization', '#45B7D1', 'Utilization', '%{x}<br>%{y:.1f}%<extra></extra>', 2, 1),
    ('Event_Count', '#FFA07A', 'Events', '%{x}<br>%{y} events<extra></extra>', 2, 2),
]
for metric, bar_color, trace_name, hover_text, grid_row, grid_col in dashboard_panels:
    fig.add_trace(
        go.Bar(x=sport_metrics_sorted['Sport'],
               y=sport_metrics_sorted[metric],
               marker_color=bar_color,
               name=trace_name,
               hovertemplate=hover_text),
        row=grid_row, col=grid_col
    )

# Shared layout tweaks
fig.update_xaxes(tickangle=45)
fig.update_layout(
    title_text="Sport Performance Dashboard",
    height=800,
    showlegend=False
)

fig.show()

## Visualization 4: Revenue Composition by Sport

Stacked bar chart showing revenue source breakdown for each sport

In [None]:
# Per-sport totals of each revenue stream
sport_revenue_detail = sports_df.groupby('Sport').agg({
    'Ticket_Revenue': 'sum',
    'Concession_Revenue': 'sum',
    'Merchandise_Revenue': 'sum',
    'Parking_Revenue': 'sum'
}).reset_index()

# Order sports by combined revenue, largest first
sport_revenue_detail['Total'] = (
    sport_revenue_detail['Ticket_Revenue']
    + sport_revenue_detail['Concession_Revenue']
    + sport_revenue_detail['Merchandise_Revenue']
    + sport_revenue_detail['Parking_Revenue']
)
sport_revenue_detail = sport_revenue_detail.sort_values('Total', ascending=False)

# One stacked segment per stream: (column, legend name, colour, hover template)
stack_layers = [
    ('Ticket_Revenue', 'Ticket Sales', '#FF6B6B', 'Tickets: $%{y:,.0f}<extra></extra>'),
    ('Concession_Revenue', 'Concessions', '#4ECDC4', 'Concessions: $%{y:,.0f}<extra></extra>'),
    ('Merchandise_Revenue', 'Merchandise', '#45B7D1', 'Merchandise: $%{y:,.0f}<extra></extra>'),
    ('Parking_Revenue', 'Parking', '#FFA07A', 'Parking: $%{y:,.0f}<extra></extra>'),
]

fig = go.Figure()
for stream_column, legend_name, layer_color, hover_text in stack_layers:
    fig.add_trace(go.Bar(
        x=sport_revenue_detail['Sport'],
        y=sport_revenue_detail[stream_column],
        name=legend_name,
        marker_color=layer_color,
        hovertemplate=hover_text
    ))

fig.update_layout(
    title='Revenue Composition by Sport',
    xaxis_title='Sport',
    yaxis_title='Revenue ($)',
    barmode='stack',
    height=600,
    hovermode='x unified'
)

fig.show()

## Visualization 5: Customer Segment Analysis

Distribution of fans by customer type

In [None]:
# Respondent counts and shares per customer type
customer_segments = survey_df['Customer_Type'].value_counts().reset_index()
customer_segments.columns = ['Customer_Type', 'Count']
respondent_total = customer_segments['Count'].sum()
customer_segments['Percentage'] = (customer_segments['Count'] / respondent_total) * 100

# Donut chart of the segment mix
segment_donut = go.Pie(
    labels=customer_segments['Customer_Type'],
    values=customer_segments['Count'],
    hole=0.5,
    marker=dict(colors=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#95E1D3']),
    textinfo='label+percent',
    textfont_size=13,
    hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent}<extra></extra>',
)
fig = go.Figure(data=[segment_donut])

fig.update_layout(
    title=dict(
        text=f'Customer Segment Distribution<br><sub>Total Respondents: {respondent_total:,}</sub>',
        x=0.5,
        xanchor='center',
    ),
    height=500,
)

fig.show()

# Tabular version of the same breakdown
print("\nCustomer Segment Breakdown:")
print(customer_segments.to_string(index=False))

## Visualization 6: Customer Satisfaction Analysis

Overall satisfaction scores by customer type

In [None]:
# Mean satisfaction (and respondent count) per customer type, best first
satisfaction_by_type = (
    survey_df.groupby('Customer_Type')['Overall_Satisfaction']
    .agg(['mean', 'count'])
    .reset_index()
)
satisfaction_by_type = satisfaction_by_type.sort_values('mean', ascending=False)

# Bars are coloured by their own mean score on a red-to-green scale
segment_means = satisfaction_by_type['mean']
satisfaction_bars = go.Bar(
    x=satisfaction_by_type['Customer_Type'],
    y=segment_means,
    marker=dict(
        color=segment_means,
        colorscale='RdYlGn',
        showscale=True,
        colorbar=dict(title="Satisfaction"),
    ),
    text=[f'{x:.2f}' for x in segment_means],
    textposition='outside',
    hovertemplate='<b>%{x}</b><br>Avg Satisfaction: %{y:.2f}<extra></extra>',
)

fig = go.Figure()
fig.add_trace(satisfaction_bars)

fig.update_layout(
    title='Average Satisfaction Score by Customer Type',
    xaxis_title='Customer Type',
    yaxis_title='Average Satisfaction (1-5 scale)',
    height=500,
    yaxis=dict(range=[0, 5])
)

# Dashed reference line at the mean over all respondents
overall_avg = survey_df['Overall_Satisfaction'].mean()
fig.add_hline(
    y=overall_avg,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Overall Avg: {overall_avg:.2f}",
    annotation_position="right",
)

fig.show()

## Visualization 7: Sport Interest Levels

Fan interest in different sports from survey data

In [None]:
# Mean of every survey column whose name contains 'Interest', keyed by a
# display name derived from the column name
sport_interest_cols = [col for col in survey_df.columns if 'Interest' in col]
sport_interest = {
    col.replace('_Interest', '').replace('_', ' ').title(): survey_df[col].mean()
    for col in sport_interest_cols
}

# Highest interest first
ranked_interest = sorted(sport_interest.items(), key=lambda pair: pair[1], reverse=True)
sport_interest_sorted = dict(ranked_interest)

sport_labels = list(sport_interest_sorted.keys())
interest_means = list(sport_interest_sorted.values())

# Horizontal bar chart, bars coloured by their own value
fig = go.Figure(go.Bar(
    y=sport_labels,
    x=interest_means,
    orientation='h',
    marker=dict(
        color=interest_means,
        colorscale='Plasma',
        showscale=True,
        colorbar=dict(title="Interest Level")
    ),
    text=[f'{x:.2f}' for x in interest_means],
    textposition='outside',
    hovertemplate='<b>%{y}</b><br>Avg Interest: %{x:.2f}<extra></extra>'
))

fig.update_layout(
    title='Average Fan Interest by Sport (Survey Data)',
    xaxis_title='Average Interest Level (1-5 scale)',
    yaxis_title='Sport',
    height=600,
    xaxis=dict(range=[0, 5])
)

fig.show()

## Visualization 8: Age Demographics

Age distribution of survey respondents

In [None]:
# Histogram of respondent ages with mean/median markers
ages = survey_df['Age']
mean_age = ages.mean()
median_age = ages.median()

age_histogram = go.Histogram(
    x=ages,
    nbinsx=30,
    marker=dict(color='#4ECDC4', line=dict(color='white', width=1)),
    hovertemplate='Age Range: %{x}<br>Count: %{y}<extra></extra>',
)

fig = go.Figure()
fig.add_trace(age_histogram)

fig.update_layout(
    title='Fan Age Distribution',
    xaxis_title='Age',
    yaxis_title='Number of Fans',
    height=500,
    showlegend=False
)

# Vertical markers: mean in red (label on top), median in blue (label at bottom)
fig.add_vline(
    x=mean_age,
    line_dash="dash",
    line_color="red",
    annotation_text=f"Mean: {mean_age:.1f}",
    annotation_position="top",
)
fig.add_vline(
    x=median_age,
    line_dash="dash",
    line_color="blue",
    annotation_text=f"Median: {median_age:.1f}",
    annotation_position="bottom",
)

fig.show()

print(f"\nAge Statistics:")
print(f"Mean: {mean_age:.1f}")
print(f"Median: {median_age:.1f}")
print(f"Min: {ages.min()}")
print(f"Max: {ages.max()}")

## Visualization 9: Age Group Distribution by Customer Type

Cross-analysis of age groups and customer segments

In [None]:
# Respondent counts for every (age group, customer type) combination
age_customer_crosstab = pd.crosstab(survey_df['Age_Group'], survey_df['Customer_Type'])

# One bar series per customer type; the palette wraps around if there are
# more customer types than colours
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#95E1D3']
segment_traces = [
    go.Bar(
        name=customer_type,
        x=age_customer_crosstab.index,
        y=age_customer_crosstab[customer_type],
        marker_color=colors[i % len(colors)],
        hovertemplate=f'<b>{customer_type}</b><br>Age Group: %{{x}}<br>Count: %{{y}}<extra></extra>'
    )
    for i, customer_type in enumerate(age_customer_crosstab.columns)
]

fig = go.Figure()
for segment_trace in segment_traces:
    fig.add_trace(segment_trace)

fig.update_layout(
    title='Fan Distribution by Age Group and Customer Type',
    xaxis_title='Age Group',
    yaxis_title='Number of Fans',
    barmode='group',
    height=600,
    hovermode='x unified'
)

fig.show()

## Visualization 10: Revenue by Day of Week

Performance analysis by game day scheduling

In [None]:
# Revenue, attendance and event count per weekday
day_revenue = sports_df.groupby('Day_of_Week').agg({
    'Total_Revenue': ['sum', 'mean'],
    'Attendance': ['sum', 'mean'],
    'Sport': 'count'
}).reset_index()

# Flatten the two-level column index produced by the multi-function agg
day_revenue.columns = ['Day_of_Week', 'Total_Revenue', 'Avg_Revenue', 'Total_Attendance', 'Avg_Attendance', 'Event_Count']

# Sort Monday..Sunday rather than alphabetically
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_revenue['Day_of_Week'] = pd.Categorical(day_revenue['Day_of_Week'], categories=day_order, ordered=True)
day_revenue = day_revenue.sort_values('Day_of_Week')

# Two side-by-side bar panels
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Total Revenue by Day', 'Average Attendance by Day'),
    specs=[[{'type': 'bar'}, {'type': 'bar'}]]
)

# (metric column, bar colour, trace name, hover template, subplot column)
weekday_panels = [
    ('Total_Revenue', '#FF6B6B', 'Total Revenue', '%{x}<br>$%{y:,.0f}<extra></extra>', 1),
    ('Avg_Attendance', '#4ECDC4', 'Avg Attendance', '%{x}<br>%{y:,.0f} avg attendees<extra></extra>', 2),
]
for metric, bar_color, trace_name, hover_text, panel_col in weekday_panels:
    fig.add_trace(
        go.Bar(
            x=day_revenue['Day_of_Week'],
            y=day_revenue[metric],
            marker_color=bar_color,
            name=trace_name,
            hovertemplate=hover_text
        ),
        row=1, col=panel_col
    )

fig.update_xaxes(tickangle=45)
fig.update_layout(
    title_text="Performance by Day of Week",
    height=500,
    showlegend=False
)

fig.show()

## Visualization 11: Opponent Type Impact on Revenue

Revenue comparison based on opponent quality/rivalry

In [None]:
# Revenue, attendance and event count per opponent type
opponent_revenue = sports_df.groupby('Opponent_Type').agg({
    'Total_Revenue': ['sum', 'mean'],
    'Attendance': ['sum', 'mean'],
    'Revenue_per_Attendee': 'mean',
    'Sport': 'count'
}).reset_index()

# Flatten the two-level column index produced by the multi-function agg
opponent_revenue.columns = ['Opponent_Type', 'Total_Revenue', 'Avg_Revenue',
                            'Total_Attendance', 'Avg_Attendance', 'Avg_Revenue_per_Attendee', 'Event_Count']

opponent_revenue = opponent_revenue.sort_values('Total_Revenue', ascending=False)

# Grouped bar chart with two y-axes: revenue on the left, attendance on the right.
# barmode='group' only separates bars that share an axis; these two traces use
# different y-axes, so each needs its own offsetgroup or the attendance bars
# are drawn directly on top of the revenue bars.
fig = go.Figure()

fig.add_trace(go.Bar(
    x=opponent_revenue['Opponent_Type'],
    y=opponent_revenue['Avg_Revenue'],
    name='Avg Revenue per Event',
    marker_color='#FF6B6B',
    yaxis='y',
    offsetgroup=1,
    hovertemplate='%{x}<br>Avg Revenue: $%{y:,.0f}<extra></extra>'
))

fig.add_trace(go.Bar(
    x=opponent_revenue['Opponent_Type'],
    y=opponent_revenue['Avg_Attendance'],
    name='Avg Attendance',
    marker_color='#4ECDC4',
    yaxis='y2',
    offsetgroup=2,
    hovertemplate='%{x}<br>Avg Attendance: %{y:,.0f}<extra></extra>'
))

fig.update_layout(
    title='Impact of Opponent Type on Performance',
    xaxis=dict(title='Opponent Type'),
    yaxis=dict(title='Average Revenue ($)', side='left', showgrid=False),
    yaxis2=dict(title='Average Attendance', side='right', overlaying='y', showgrid=False),
    barmode='group',
    height=600,
    hovermode='x unified'
)

fig.show()

# Print summary table
print("\nOpponent Type Performance Summary:")
print(opponent_revenue[['Opponent_Type', 'Event_Count', 'Avg_Revenue', 'Avg_Attendance']].to_string(index=False))

## Visualization 12: Venue Utilization Analysis

Stadium/arena capacity utilization by sport

In [None]:
# Utilization statistics per sport; Capacity is the Venue_Capacity of the
# first row encountered for each sport
venue_util = sports_df.groupby('Sport').agg({
    'Venue_Utilization': ['mean', 'min', 'max'],
    'Attendance': 'mean',
    'Venue_Capacity': 'first',
    'Sport': 'count'
}).reset_index()

# Flatten the two-level column index produced by the multi-function agg
venue_util.columns = ['Sport', 'Avg_Utilization', 'Min_Utilization', 'Max_Utilization',
                      'Avg_Attendance', 'Capacity', 'Event_Count']
venue_util = venue_util.sort_values('Avg_Utilization', ascending=False)

# Bar chart of mean utilization, coloured red-to-green by value; the venue
# capacity is passed as customdata so it can be shown in the hover text
mean_utilization = venue_util['Avg_Utilization']
utilization_bars = go.Bar(
    x=venue_util['Sport'],
    y=mean_utilization,
    marker=dict(
        color=mean_utilization,
        colorscale='RdYlGn',
        showscale=True,
        colorbar=dict(title="Utilization %"),
    ),
    text=[f'{x:.1f}%' for x in mean_utilization],
    textposition='outside',
    hovertemplate='<b>%{x}</b><br>Avg Utilization: %{y:.1f}%<br>Capacity: %{customdata:,}<extra></extra>',
    customdata=venue_util['Capacity'],
)

fig = go.Figure()
fig.add_trace(utilization_bars)

fig.update_layout(
    title='Average Venue Utilization by Sport',
    xaxis_title='Sport',
    yaxis_title='Utilization (%)',
    height=600,
    yaxis=dict(range=[0, 100])
)

# Dashed reference line at the 80% utilization target
fig.add_hline(
    y=80,
    line_dash="dash",
    line_color="green",
    annotation_text="Target: 80%",
    annotation_position="right",
)

fig.show()

## Visualization 13: Ancillary Revenue per Attendee

Concession and merchandise spending patterns by sport

In [None]:
# Mean per-attendee concession and merchandise spend per sport, using only
# events that had at least one attendee
attended_events = sports_df[sports_df['Attendance'] > 0]
ancillary_revenue = attended_events.groupby('Sport').agg({
    'Concession_per_Attendee': 'mean',
    'Merchandise_per_Attendee': 'mean',
    'Attendance': 'sum'
}).reset_index()

ancillary_revenue['Total_Ancillary_per_Attendee'] = (
    ancillary_revenue['Concession_per_Attendee']
    + ancillary_revenue['Merchandise_per_Attendee']
)

ancillary_revenue = ancillary_revenue.sort_values('Total_Ancillary_per_Attendee', ascending=False)

# Grouped bars: (column, legend name, colour, hover template)
ancillary_series = [
    ('Concession_per_Attendee', 'Concessions', '#4ECDC4',
     '%{x}<br>Concessions: $%{y:.2f} per attendee<extra></extra>'),
    ('Merchandise_per_Attendee', 'Merchandise', '#FF6B6B',
     '%{x}<br>Merchandise: $%{y:.2f} per attendee<extra></extra>'),
]

fig = go.Figure()
for spend_column, legend_name, bar_color, hover_text in ancillary_series:
    fig.add_trace(go.Bar(
        x=ancillary_revenue['Sport'],
        y=ancillary_revenue[spend_column],
        name=legend_name,
        marker_color=bar_color,
        hovertemplate=hover_text
    ))

fig.update_layout(
    title='Average Ancillary Revenue per Attendee by Sport',
    xaxis_title='Sport',
    yaxis_title='Revenue per Attendee ($)',
    barmode='group',
    height=600,
    hovermode='x unified'
)

fig.show()

summary_columns = ['Sport', 'Concession_per_Attendee', 'Merchandise_per_Attendee', 'Total_Ancillary_per_Attendee']
print("\nAncillary Revenue Summary:")
print(ancillary_revenue[summary_columns].to_string(index=False))

## Visualization 14: Communication Effectiveness vs Satisfaction

Correlation between communication and overall satisfaction

In [None]:
# Scatter of communication effectiveness against overall satisfaction,
# coloured by customer type and sized by recommendation likelihood.
# Plotly Express fits the 'ols' trendline with statsmodels, so that package
# must be installed for this cell to run.
axis_labels = {
    'Communication_Effectiveness': 'Communication Effectiveness (1-5)',
    'Overall_Satisfaction': 'Overall Satisfaction (1-5)',
    'Customer_Type': 'Customer Type',
    'Recommendation_Likelihood': 'Likelihood to Recommend'
}
segment_palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#95E1D3']

fig = px.scatter(
    survey_df,
    x='Communication_Effectiveness',
    y='Overall_Satisfaction',
    color='Customer_Type',
    size='Recommendation_Likelihood',
    hover_data=['Age'],
    trendline='ols',
    title='Communication Effectiveness vs Overall Satisfaction',
    labels=axis_labels,
    color_discrete_sequence=segment_palette
)

# Fixed 0-6 axes leave a margin around the 1-5 rating scale
fig.update_layout(
    height=600,
    xaxis=dict(range=[0, 6]),
    yaxis=dict(range=[0, 6])
)

fig.show()

# Correlation between the two ratings (Series.corr default method)
communication_scores = survey_df['Communication_Effectiveness']
satisfaction_scores = survey_df['Overall_Satisfaction']
correlation = communication_scores.corr(satisfaction_scores)
print(f"\nCorrelation between Communication Effectiveness and Overall Satisfaction: {correlation:.3f}")

## Visualization 15: Heatmap - Sport Interest by Customer Type

Cross-analysis of customer segments and sport preferences

In [None]:
# Mean of every 'Interest' column per customer type, one row per type in the
# order the types first appear in the survey
interest_cols = [col for col in survey_df.columns if 'Interest' in col]
interest_data = []

for customer_type in survey_df['Customer_Type'].unique():
    segment_rows = survey_df[survey_df['Customer_Type'] == customer_type]
    segment_record = {'Customer_Type': customer_type}
    segment_record.update({
        col.replace('_Interest', '').replace('_', ' ').title(): segment_rows[col].mean()
        for col in interest_cols
    })
    interest_data.append(segment_record)

interest_matrix = pd.DataFrame(interest_data).set_index('Customer_Type')

# Heatmap with the rounded mean printed in each cell
interest_values = interest_matrix.values
interest_heatmap = go.Heatmap(
    z=interest_values,
    x=interest_matrix.columns,
    y=interest_matrix.index,
    colorscale='RdYlGn',
    text=np.round(interest_values, 2),
    texttemplate='%{text}',
    textfont={"size": 10},
    colorbar=dict(title="Interest Level"),
    hovertemplate='Customer: %{y}<br>Sport: %{x}<br>Interest: %{z:.2f}<extra></extra>',
)
fig = go.Figure(data=interest_heatmap)

fig.update_layout(
    title='Sport Interest Heatmap by Customer Type',
    xaxis_title='Sport',
    yaxis_title='Customer Type',
    height=500
)

fig.show()

## Visualization 16: Revenue Trend Analysis (if date data available)

Time series analysis of revenue patterns

In [None]:
# Plot revenue over time when a date column exists ('Date' is preferred over
# 'Event_Date'); otherwise fall back to plotting by per-sport event order
date_candidates = [name for name in ('Date', 'Event_Date') if name in sports_df.columns]

if not date_candidates:
    print("‚ÑπÔ∏è Date column not found in dataset. Skipping time series analysis.")

    # Fallback: number each sport's events in row order, starting at 1
    sports_df['Event_Number'] = sports_df.groupby('Sport').cumcount() + 1

    fig = px.line(
        sports_df,
        x='Event_Number',
        y='Total_Revenue',
        color='Sport',
        title='Revenue by Event Sequence',
        labels={'Total_Revenue': 'Total Revenue ($)', 'Event_Number': 'Event Number'},
        markers=True
    )
else:
    date_col = date_candidates[0]

    # Parse the column in place so grouping and plotting use real datetimes
    sports_df[date_col] = pd.to_datetime(sports_df[date_col])

    # Total revenue per (date, sport) pair
    date_revenue = sports_df.groupby([date_col, 'Sport'])['Total_Revenue'].sum().reset_index()

    fig = px.line(
        date_revenue,
        x=date_col,
        y='Total_Revenue',
        color='Sport',
        title='Revenue Trends Over Time by Sport',
        labels={'Total_Revenue': 'Total Revenue ($)', date_col: 'Date'},
        markers=True
    )

# Both variants share the same layout and are rendered the same way
fig.update_layout(height=600, hovermode='x unified')
fig.show()

---
# KEY INSIGHTS & SUMMARY
---

## Summary Statistics

In [None]:
# Print headline metrics for revenue, attendance, satisfaction and the
# highest-revenue sport. Needs only sports_df (with the engineered columns)
# and survey_df; it does not depend on any visualization cell having run.
print("="*80)
print("KEY PERFORMANCE METRICS")
print("="*80)

total_revenue = sports_df['Total_Revenue'].sum()
total_attendance = sports_df['Attendance'].sum()
avg_revenue_per_event = sports_df['Total_Revenue'].mean()
avg_attendance_per_event = sports_df['Attendance'].mean()
avg_utilization = sports_df['Venue_Utilization'].mean()
avg_satisfaction = survey_df['Overall_Satisfaction'].mean()
avg_recommendation = survey_df['Recommendation_Likelihood'].mean()

print(f"\nüìä REVENUE METRICS:")
print(f"   Total Revenue:              ${total_revenue:,.2f}")
print(f"   Average Revenue per Event:  ${avg_revenue_per_event:,.2f}")
print(f"   Revenue per Attendee:       ${total_revenue/total_attendance:.2f}")

print(f"\nüë• ATTENDANCE METRICS:")
print(f"   Total Attendance:           {total_attendance:,}")
print(f"   Average per Event:          {avg_attendance_per_event:,.0f}")
print(f"   Average Venue Utilization:  {avg_utilization:.1f}%")

print(f"\n‚≠ê CUSTOMER SATISFACTION:")
print(f"   Average Satisfaction:       {avg_satisfaction:.2f}/5.0")
print(f"   Average Recommendation:     {avg_recommendation:.2f}/5.0")
print(f"   Total Survey Respondents:   {len(survey_df):,}")

print(f"\nüèÜ TOP PERFORMING SPORT:")
# Derive the top sport here rather than reading the last row of a frame built
# and sorted in another cell: that lookup raises NameError if the other cell
# was skipped and silently reports the wrong sport if its sort order changes.
top_sport = (
    sports_df.groupby('Sport')[['Total_Revenue', 'Attendance']]
    .sum()
    .reset_index()
    .sort_values('Total_Revenue', ascending=True)
    .iloc[-1]
)
print(f"   Sport: {top_sport['Sport']}")
print(f"   Total Revenue: ${top_sport['Total_Revenue']:,.2f}")
print(f"   Total Attendance: {top_sport['Attendance']:,}")

print("\n" + "="*80)

## Export Data Summary (Optional)

Export processed data for further analysis

In [None]:
# Optional CSV exports of the summary tables built earlier in the notebook.
# Uncomment the pairs below to write them to the working directory.

# sport_revenue.to_csv('sport_revenue_summary.csv', index=False)
# print("‚úÖ Sport revenue summary exported to 'sport_revenue_summary.csv'")

# ancillary_revenue.to_csv('ancillary_revenue_summary.csv', index=False)
# print("‚úÖ Ancillary revenue summary exported to 'ancillary_revenue_summary.csv'")

# customer_segments.to_csv('customer_segments.csv', index=False)
# print("‚úÖ Customer segments exported to 'customer_segments.csv'")

export_hint = "‚ÑπÔ∏è Uncomment the export code above to save processed data."
print(export_hint)

---

## 📋 Analysis Complete!

This notebook provides comprehensive visualization and analysis of:
- Revenue composition and trends
- Sport-by-sport performance
- Customer segmentation and satisfaction
- Ancillary revenue opportunities
- Venue utilization
- Fan demographics and preferences

**Data Source:** Excel file with sports event data and customer survey responses

---