# 2025 End of Season Rower Survey Analysis

This notebook contains the complete analysis of the Rower Survey data. It covers data loading, cleaning, processing, and visualization.

## 1. Setup and Imports
Import all the necessary libraries for the analysis.

In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

## 2. Configuration
Define the file paths for the data and mapping files.

In [12]:
file_path = "2025 End of Season Rower Survey - Responses (AR).xlsx"
mapping_file_path = "column_mapping.xlsx"
brand_colors_file = "BrandColours.md"
chart_path = "charts"

## 3. Helper Functions
This section contains all the functions used for data processing and visualization, consolidated from the original `.py` scripts.

In [13]:
def create_bar_chart(df, column_name, chart_path="charts", highlight_bar=None, chart_name=None, title=None):
    """
    Creates and saves a bar chart for a given column.
    """
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found in the DataFrame.")
        return

    if not os.path.exists(chart_path):
        os.makedirs(chart_path)

    brand_colors = get_brand_colors()
    default_color = brand_colors[0] if brand_colors else "#003E7E"
    # Use accent color for highlighting, which is the second in the list now.
    highlight_color = brand_colors[1] if brand_colors and len(brand_colors) > 1 else "#FFB81C"

    # FIX 1: Updated is_categorical_dtype check
    if isinstance(df[column_name].dtype, pd.CategoricalDtype) and df[column_name].cat.ordered:
        order = df[column_name].cat.categories
    else:
        order = df[column_name].value_counts().index

    palette = [highlight_color if bar == highlight_bar else default_color for bar in order] if highlight_bar and highlight_bar in order else [default_color] * len(order)

    plt.figure(figsize=(10, 6))
    # FIX 2: Updated sns.countplot call to address FutureWarning
    y_data = df[column_name].dropna()
    sns.countplot(y=y_data, order=order, palette=palette, hue=y_data, legend=False)
    plt.title(title if title else f'Distribution of Responses for "{column_name}"')
    plt.xlabel("Count")
    plt.ylabel("Response")
    plt.tight_layout()

    file_name = f"{chart_name}.png" if chart_name else f"{column_name}_distribution.png"
    save_path = os.path.join(chart_path, file_name)
    plt.savefig(save_path)
    print(f"\nChart saved to '{save_path}'")
    plt.close()

## 4. Data Loading and Pre-processing
Load the survey data and the column mapping file, then rename and apply the correct data types to the columns.

In [14]:
survey_data_raw = load_data(file_path)
mapping_df = load_column_mapping(mapping_file_path)

if survey_data_raw is not None and mapping_df is not None:
    # Filter the mapping to only include columns where 'masters_related' is 'Y'
    if 'masters_related' in mapping_df.columns:
        print("Found 'masters_related' column. Filtering columns...")
        # Ensure we handle non-string values and case-insensitivity
        mapping_df_filtered = mapping_df[mapping_df['masters_related'].astype(str).str.upper() == 'Y'].copy()
    else:
        print("Warning: 'masters_related' column not found in mapping file. Processing all columns as before.")
        mapping_df_filtered = mapping_df.copy()

    # Get the list of original column names to keep
    columns_to_keep = mapping_df_filtered['old_name'].tolist()
    
    # Filter the main DataFrame to only keep the selected columns
    survey_data = survey_data_raw[columns_to_keep]
    print(f"\nFiltered survey data to {len(survey_data.columns)} columns based on 'masters_related' flag.")

    # Create the renaming dictionary from the filtered mapping
    column_mapping = dict(zip(mapping_df_filtered["old_name"], mapping_df_filtered["new_name"]))
    survey_data.rename(columns=column_mapping, inplace=True)
    print("\nColumns renamed.")
    
    # Apply data types using the filtered mapping
    survey_data = apply_data_types(survey_data, mapping_df_filtered)
    
    print("\nData processing complete. Displaying DataFrame info:")
    survey_data.info()
    display(survey_data.head())


Data loaded successfully.
Column mapping loaded successfully.
Found 'masters_related' column. Filtering columns...

Filtered survey data to 49 columns based on 'masters_related' flag.

Columns renamed.
Applied ordered categorical type to 'support_1st_place_medals_masters'.
Applied ordered categorical type to 'rating_promotion_governance'.
Applied ordered categorical type to 'rating_accessibility'.
Applied ordered categorical type to 'rating_positive_experience'.
Applied ordered categorical type to 'rating_high_performance_pathways'.

Data types applied successfully.

Data processing complete. Displaying DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 49 columns):
 #   Column                                    Non-Null Count  Dtype   
---  ------                                    --------------  -----   
 0   age_category                              155 non-null    category
 1   gender                                    155 n

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  survey_data.rename(columns=column_mapping, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df[col_name].astype(dtype)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df[col_name].astype(dtype)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

Unnamed: 0,age_category,gender,club_affiliation,participated_in_regattas_2025,rating_training,rating_regattas_racing,rating_physical_health,rating_mental_health,rating_social_community,rating_rowing_culture,...,reason_recreational_social_aspect,support_transition_to_competitive,factors_to_consider_competitive,changes_to_start_competing,factors_not_competing,rowing_value_vs_other_sports,rating_promotion_governance,rating_accessibility,rating_positive_experience,rating_high_performance_pathways
0,18-26,Female,West Australian Rowing Club,Yes,4,4,4,5,4,3,...,,,,,,,Agree,Agree,Agree,Disagree
1,Under 18 (by selecting this box I am confirmin...,Female,West Australian Rowing Club,Yes,5,5,4,5,4,5,...,0.0,0.0,,,,0.0,Agree,Disagree,Agree,Disagree
2,41-60,Female,University of Western Australia Boat Club,No,4,4,4,4,4,4,...,4.0,3.0,,Race-prep education sessions (explaining regat...,Lack of encouragement and support from my club...,3.0,Neutral,Agree,Agree,Neutral
3,27-40,Male,Swan River Rowing Club,No,5,4,4,3,5,4,...,4.0,4.0,More novices support and guidance through the ...,Crews must have a mix of new and experienced r...,"Time commitment required for regatta days,Worr...",2.0,Neutral,Neutral,Agree,
4,Under 18 (by selecting this box I am confirmin...,Male,No Club,Yes,5,5,5,5,5,5,...,0.0,0.0,,,,0.0,Agree,Strongly Agree,Agree,Agree


## 5. Filter for Masters Rowers
Create a separate DataFrame containing only the responses from Masters rowers (age 27+).

In [15]:
if 'age_category' in survey_data.columns:
    masters_age_categories = ["27-40", "41-60", "61+"]
    masters_df = survey_data[survey_data["age_category"].isin(masters_age_categories)].copy()
    print(f"Filtered for Masters rowers. Found {len(masters_df)} responses.")
    display(masters_df.head())
else:
    print("Column 'age_category' not found. Cannot filter for Masters rowers.")

Filtered for Masters rowers. Found 76 responses.


Unnamed: 0,age_category,gender,club_affiliation,participated_in_regattas_2025,rating_training,rating_regattas_racing,rating_physical_health,rating_mental_health,rating_social_community,rating_rowing_culture,...,reason_recreational_social_aspect,support_transition_to_competitive,factors_to_consider_competitive,changes_to_start_competing,factors_not_competing,rowing_value_vs_other_sports,rating_promotion_governance,rating_accessibility,rating_positive_experience,rating_high_performance_pathways
2,41-60,Female,University of Western Australia Boat Club,No,4,4,4,4,4,4,...,4.0,3.0,,Race-prep education sessions (explaining regat...,Lack of encouragement and support from my club...,3.0,Neutral,Agree,Agree,Neutral
3,27-40,Male,Swan River Rowing Club,No,5,4,4,3,5,4,...,4.0,4.0,More novices support and guidance through the ...,Crews must have a mix of new and experienced r...,"Time commitment required for regatta days,Worr...",2.0,Neutral,Neutral,Agree,
5,27-40,Male,Old Scotch Collegians,Yes,5,4,5,5,4,4,...,,,,,,,Agree,Agree,Agree,Agree
6,41-60,Male,Curtin University Boating Club,Yes,5,4,5,5,5,5,...,4.0,3.0,More time efficiency,,Time commitment required for regatta days,4.0,Agree,Agree,Strongly Agree,Agree
9,41-60,Male,Canning Bridge Rowing Club,Yes,5,5,5,5,5,5,...,,,,,,,Agree,Agree,Agree,


## 6. Generate Visualizations
Create and save charts for the analyzed data. Each chart is generated in its own cell.

### Masters: Season Extension

In [None]:
create_bar_chart(
    masters_df,
    "desired_masters_season_extension",
    chart_name="desired_masters_season_extension_distribution",
    title="Desired Masters Season Extension",
)

### Masters: Location Preference (Canning Bridge vs. Champion Lakes)

In [16]:
create_comparison_chart(
    masters_df,
    "prefer_canning_bridge_masters",
    "prefer_champion_lakes_masters",
    chart_name="location_preference_comparison_masters",
    title="Masters Rowers: Preference for Canning Bridge vs. Champion Lakes",
    legend_labels=("Canning Bridge", "Champion Lakes"),
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  melted_df["location"].replace({col1: legend_labels[0], col2: legend_labels[1]}, inplace=True)



Comparison chart saved to 'charts\location_preference_comparison_masters.png'


### Masters: Support for 1st Place Medals

In [None]:
create_bar_chart(
    masters_df,
    "support_1st_place_medals_masters",
    chart_name="support_1st_place_medals_masters_distribution",
    title="Support for 1st Place Medals for Masters",
)

### Masters: Did you participate in any regattas in 2025?

In [19]:
create_bar_chart(
    masters_df,
    "participated_in_regattas_2025",
    chart_name="participated_in_regattas_2025_distribution",
    title="Did you participate in any regattas in 2025?",
)

create_bar_chart(
    masters_df,
    "competed_pennant",
    chart_name="competed_pennant_distribution",
    title="Did you compete in any Pennant regattas this season?",
)

create_bar_chart(
    masters_df,
    "competed_masters_regattas",
    chart_name="competed_masters_regattas_distribution",
    title="Did you compete in any Masters regattas this season?",
)


Chart saved to 'charts\participated_in_regattas_2025_distribution.png'

Chart saved to 'charts\competed_pennant_distribution.png'

Chart saved to 'charts\competed_masters_regattas_distribution.png'


### General Ratings (All Rowers)

In [None]:
rating_columns = {
    "rating_promotion_governance": "Rating of Promotion and Governance",
    "rating_accessibility": "Rating of Accessibility",
    "rating_positive_experience": "Rating of Positive Experience",
    "rating_high_performance_pathways": "Rating of High-Performance Pathways",
}

for col, title in rating_columns.items():
    create_bar_chart(
        survey_data,
        col,
        chart_name=f"{col}_distribution",
        title=title,
    )

### Masters: Reasons for Not Competing

In [None]:
reasons_columns = {
    "reason_recreational_time_commitment": "Time Commitment",
    "reason_recreational_skill_level": "Skill Level",
    "reason_recreational_cost": "Cost",
    "reason_recreational_social_aspect": "Prefer Social Aspect",
}

create_reasons_summary_chart(
    masters_df,
    reason_columns=reasons_columns,
    chart_name="masters_reasons_not_competing",
    title="Primary Reasons Masters Rowers Do Not Compete",
)

### Masters: Support for Transition to Competitive Rowing

In [None]:
create_bar_chart(
    masters_df,
    "support_transition_to_competitive",
    chart_name="masters_support_transition_to_competitive",
    title="Support for Transitioning to Competitive Rowing (Masters)",
)

### Masters: Competition Overlap (Pennant vs. Masters Regattas)
This chart shows the number of Masters rowers who competed in both types of regattas, versus those who competed in only one type.

In [20]:
# Analyze the overlap between Pennant and Masters regatta participation
# We assume the values in these columns are 'Yes' and 'No'.

# Create boolean Series for participation
pennant_yes = masters_df['competed_pennant'] == 'Yes'
masters_yes = masters_df['competed_masters_regattas'] == 'Yes'

# Calculate the counts for each distinct group
competed_in_both = (pennant_yes & masters_yes).sum()
competed_in_pennant_only = (pennant_yes & ~masters_yes).sum()
competed_in_masters_only = (~pennant_yes & masters_yes).sum()

# Prepare data for plotting
overlap_data = {
    'Competition Category': ['Pennant Only', 'Masters Only', 'Both'],
    'Count': [competed_in_pennant_only, competed_in_masters_only, competed_in_both]
}
overlap_df = pd.DataFrame(overlap_data)

# Get brand colors for the chart
brand_colors = get_brand_colors()
# Use a palette of primary, accent, and a third color if available
palette = brand_colors[:3] if brand_colors and len(brand_colors) >= 3 else ["#003E7E", "#FFB81C", "#2D3436"]

# Create the bar chart
plt.figure(figsize=(10, 6))
sns.barplot(data=overlap_df, x='Competition Category', y='Count', palette=palette, hue='Competition Category', legend=False)
plt.title('Overlap of Competition for Masters Rowers')
plt.xlabel('Competition Category')
plt.ylabel('Number of Rowers')
plt.tight_layout()

# Save the figure
chart_name = "competition_overlap_masters"
save_path = os.path.join(chart_path, f"{chart_name}.png")
plt.savefig(save_path)
print(f"\nChart saved to '{save_path}'")
plt.close()


Chart saved to 'charts\competition_overlap_masters.png'
