In [18]:
# Stack Overflow Developer Survey Analysis
# Author: Ansam Nawar
# Udacity Data Science Blog Post Project


# %% Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
# %% Configuration
# Input location: the survey CSV is looked up under ./data, relative to the
# current working directory.
DATA_DIR = Path("data")
CSV_PATH = DATA_DIR.joinpath("survey_results_public.csv")

# Output location: created immediately so that writes to the default output
# directory do not fail on a missing folder.
FIG_DIR = Path("figures")
FIG_DIR.mkdir(exist_ok=True, parents=True)

In [20]:


# %% Helper Functions
def to_float_years(x):
    """
    Turns a 'YearsCode'-style answer into a float.

    Text containing 'Less' (e.g. 'Less than 1 year') becomes 0.5 and text
    containing 'More' (e.g. 'More than 50 years') becomes 51.0; every other
    value is handed to float().

    Args:
        x (str or float): One raw value from the 'YearsCode' column.

    Returns:
        float: Years of experience, or np.nan when the value is missing or
        cannot be converted.
    """
    try:
        if pd.isna(x):
            return np.nan
        is_text = isinstance(x, str)
        # The two open-ended survey buckets are mapped to fixed stand-in values.
        if is_text and 'Less' in x:
            return 0.5
        if is_text and 'More' in x:
            return 51.0
        return float(x)
    except Exception:
        # Best-effort conversion: anything unparseable is treated as missing.
        return np.nan

def minimal_barh(series, title, xlabel, out_path, figsize=(7,5)):
    """
    Draws a bare-bones horizontal bar chart and writes it to disk.

    Args:
        series (pd.Series): Data to plot; the index supplies the bar labels
            and the values the bar lengths.
        title (str): Text shown above the chart.
        xlabel (str): Label for the x-axis.
        out_path (Path): Destination file for the rendered figure.
        figsize (tuple): Figure width and height in inches.

    Returns:
        Path: The same out_path that was passed in.
    """
    figure, axes = plt.subplots(figsize=figsize)
    series.plot.barh(ax=axes)
    axes.set_title(title, fontsize=12)
    axes.set_xlabel(xlabel)
    axes.grid(False)
    figure.tight_layout()
    figure.savefig(out_path, dpi=200)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(figure)
    return out_path

def minimal_scatter(x, y, title, xlabel, ylabel, out_path, sample=5000, figsize=(7, 5)):
    """
    Renders a simple scatter plot of y against x (log-scaled y-axis) and saves it.

    Rows where either value is missing are discarded. If more than `sample`
    rows remain, a reproducible random subset of that size is plotted instead.

    Args:
        x (pd.Series): Values for the horizontal axis.
        y (pd.Series): Values for the vertical axis.
        title (str): Text shown above the chart.
        xlabel (str): Label for the x-axis.
        ylabel (str): Label for the y-axis.
        out_path (Path): Destination file for the rendered figure.
        sample (int): Upper bound on the number of points drawn.
        figsize (tuple): Figure width and height in inches.

    Returns:
        Path: out_path once the figure is written, or None when no row has
        both values present.
    """
    # Keep only the rows where both coordinates are available.
    both_present = x.notnull() & y.notnull()
    if not both_present.any():
        return None

    points = pd.concat([x, y], axis=1).loc[both_present]
    if len(points) > sample:
        # Fixed seed so the same subset is drawn on every run.
        points = points.sample(sample, random_state=42)
    x_values = points.iloc[:, 0]
    y_values = points.iloc[:, 1]

    figure, axes = plt.subplots(figsize=figsize)
    axes.scatter(x_values, y_values, alpha=0.18, s=8)
    axes.set_title(title, fontsize=12)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_yscale('log')
    axes.grid(False)
    figure.tight_layout()
    figure.savefig(out_path, dpi=200)
    plt.close(figure)
    return out_path


In [27]:
def clean_remote_work(value):
    """
    Collapses a raw RemoteWork answer into one of three work-setup labels.

    Matching is case-insensitive and based on substrings of the answer.

    Args:
        value (str): Raw text response from the RemoteWork column
            (missing values are accepted).

    Returns:
        str or None: 'Remote', 'Hybrid' or 'In-person'; None when the value
        is missing or matches none of the known patterns.
    """
    if pd.isna(value):
        return None

    text = value.lower()
    mentions_remote = "remote" in text
    mentions_in_person = "in-person" in text

    # Purely remote: mentions remote work without any in-person component.
    if mentions_remote and not mentions_in_person:
        return "Remote"
    # Explicit hybrid / flexible answers, or answers that mention both modes.
    if "hybrid" in text or "your choice" in text or (mentions_remote and mentions_in_person):
        return "Hybrid"
    if mentions_in_person:
        return "In-person"
    return None


In [21]:
# %% Load and Prepare Data
def load_survey(csv_path=CSV_PATH):
    """
    Loads the Stack Overflow survey dataset from the given path.

    Args:
        csv_path (Path or str): Location of the survey CSV file. Strings are
            accepted and converted to a Path.

    Returns:
        pd.DataFrame: The survey data exactly as read by pandas.

    Raises:
        FileNotFoundError: If nothing exists at csv_path.
    """
    # Coerce first so a plain string path works as well as a Path object
    # (a str has no .exists() method).
    csv_path = Path(csv_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"Survey CSV not found at {csv_path}. Download from https://survey.stackoverflow.co/2024 and place it there.")
    # low_memory=False reads the file in one pass instead of chunk-wise dtype inference.
    return pd.read_csv(csv_path, low_memory=False)

In [23]:
def _classify_remote_status(value):
    """Maps one raw RemoteWork answer to 'remote', 'hybrid', 'in-person' or 'unknown'."""
    if not isinstance(value, str) or not value.strip():
        # Missing / blank answers must not be counted as in-person.
        return 'unknown'
    text = value.lower()
    has_remote = 'remote' in text
    has_in_person = 'in-person' in text
    # Hybrid answers such as "Hybrid (some remote, some in-person)" also
    # contain the word "remote", so a bare substring check for "remote" would
    # misfile them; only answers without an in-person component count as remote.
    if has_remote and not has_in_person:
        return 'remote'
    if 'hybrid' in text or 'your choice' in text or (has_remote and has_in_person):
        return 'hybrid'
    if has_in_person:
        return 'in-person'
    return 'unknown'


def standardize(df):
    """
    Cleans and standardizes the main variables in the survey DataFrame.

    The input frame is not modified; all derived columns are added to a copy.

    Operations:
    - 'salary_usd': numeric salary taken from 'ConvertedCompYearly', falling
      back to 'CompTotal', else NaN. Unparseable values become NaN.
    - 'years_code': 'YearsCode' converted with to_float_years (NaN if absent).
    - 'remote_status': 'RemoteWork' reduced to 'remote', 'hybrid',
      'in-person' or 'unknown' ('unknown' for missing/unrecognised answers
      and when the column is absent).
    - 'languages': 'LanguageHaveWorkedWith' split on ';' into a list of
      stripped, non-empty names (empty list when absent or missing).

    Args:
        df (pd.DataFrame): Raw survey data.

    Returns:
        tuple: (cleaned DataFrame, exploded language DataFrame). The second
        element has one row per (respondent, language) pair, or is an empty
        DataFrame when 'LanguageHaveWorkedWith' is not present.
    """
    # Work on a copy so the caller's raw frame stays untouched and the
    # function can be re-run safely on the same input.
    df = df.copy()

    # Standardize salary
    if 'ConvertedCompYearly' in df.columns:
        df['salary_usd'] = pd.to_numeric(df['ConvertedCompYearly'], errors='coerce')
    elif 'CompTotal' in df.columns:
        # NOTE(review): CompTotal is presumably not normalised to USD --
        # confirm before trusting figures produced through this fallback.
        df['salary_usd'] = pd.to_numeric(df['CompTotal'], errors='coerce')
    else:
        df['salary_usd'] = np.nan

    # Convert experience
    df['years_code'] = df['YearsCode'].apply(to_float_years) if 'YearsCode' in df.columns else np.nan

    # Simplify RemoteWork column into categories
    if 'RemoteWork' in df.columns:
        df['remote_status'] = df['RemoteWork'].apply(_classify_remote_status)
    else:
        df['remote_status'] = 'unknown'

    # Process programming languages
    if 'LanguageHaveWorkedWith' in df.columns:
        df['languages'] = df['LanguageHaveWorkedWith'].fillna('').apply(
            lambda x: [s.strip() for s in x.split(';') if s.strip()]
        )
        langs_exp = df.explode('languages')
    else:
        # One independent empty list per row; a single [[]] would only fit a
        # one-row frame and raise a length-mismatch error otherwise.
        df['languages'] = pd.Series(
            [[] for _ in range(len(df))], index=df.index, dtype=object
        )
        langs_exp = pd.DataFrame()

    return df, langs_exp

In [28]:

# %% Analysis
def run_analysis(csv_path=CSV_PATH, fig_dir=FIG_DIR):
    """
    Executes the full Stack Overflow survey analysis workflow.

    Steps:
    - Loads the CSV with load_survey and cleans it with standardize.
    - Bar chart of the 12 most frequently listed languages.
    - Bar chart of median salary for the top 15 countries with at least
      100 salary-reporting respondents.
    - Scatter plot of years of coding experience against salary.
    - Bar chart of median salary for the top 12 languages with at least
      100 salary-reporting respondents.
    - CSV summary of median salary / experience per remote_status.
    - Box plot of salary by work setup (top 1% of salaries removed).

    A step is skipped when the columns it needs are absent or no data is
    left to plot; skipped steps simply do not appear in the result.

    Args:
        csv_path (Path or str): Path to the survey dataset.
        fig_dir (Path or str): Directory to store outputs; created if missing.

    Returns:
        dict: Mapping of generated output names to file paths.
    """
    # Make sure a caller-supplied output directory exists before any write.
    fig_dir = Path(fig_dir)
    fig_dir.mkdir(parents=True, exist_ok=True)

    df = load_survey(csv_path)
    df, langs_exp = standardize(df)
    generated_files = {}

    # Top languages by respondent count
    if not langs_exp.empty:
        lang_counts = langs_exp['languages'].value_counts().head(12)
        out_path_top_langs = fig_dir / "top_languages_counts.png"
        minimal_barh(
            lang_counts.sort_values(),
            "Top languages by respondent count",
            "Number of respondents",
            out_path_top_langs,
        )
        generated_files['top_languages_counts'] = out_path_top_langs

    # Median salary by country
    if 'Country' in df.columns:
        valid_df = df.dropna(subset=['Country', 'salary_usd'])
        country_counts = valid_df['Country'].value_counts()
        valid_countries = country_counts[country_counts >= 100].index
        country_median = (
            valid_df[valid_df['Country'].isin(valid_countries)]
            .groupby('Country')['salary_usd']
            .median()
            .sort_values(ascending=False)
        )
        if not country_median.empty:
            out_path_salary_country = fig_dir / "median_salary_countries.png"
            minimal_barh(
                # Re-sort ascending so the largest bar ends up at the top of the chart.
                country_median.head(15).sort_values(),
                "Median salary by country (top 15, >=100 respondents)",
                "Median salary (USD)",
                out_path_salary_country,
            )
            generated_files['median_salary_countries'] = out_path_salary_country

    # Experience vs Salary scatter
    out_path_experience_salary = fig_dir / "experience_vs_salary.png"
    scatter_plot = minimal_scatter(
        df['years_code'],
        df['salary_usd'],
        "Years of experience vs Salary (log scale)",
        "Years of experience",
        "Salary (USD)",
        out_path_experience_salary,
    )
    # minimal_scatter returns None when there was nothing to plot.
    if scatter_plot is not None:
        generated_files['experience_vs_salary'] = out_path_experience_salary

    # Median salary by language
    if not langs_exp.empty:
        valid_lang_df = langs_exp.dropna(subset=['languages', 'salary_usd'])
        # Median and sample size come from the same salary-reporting rows, so
        # the >=100 threshold reflects how many values back each median
        # (the same rule the country chart applies).
        lang_summary = (
            valid_lang_df.groupby('languages')['salary_usd']
            .agg(median_salary='median', count='count')
        )
        lang_summary = lang_summary[lang_summary['count'] >= 100]
        # Plot only the salary column: passing the whole summary frame would
        # also draw the respondent counts as bars on a salary axis.
        top_lang_salary = (
            lang_summary['median_salary'].sort_values(ascending=False).head(12)
        )

        if not top_lang_salary.empty:
            out_path_salary_lang = fig_dir / "median_salary_languages.png"
            minimal_barh(
                top_lang_salary.sort_values(),
                "Top languages by median salary (>=100 respondents)",
                "Median salary (USD)",
                out_path_salary_lang,
            )
            generated_files['median_salary_languages'] = out_path_salary_lang

    # Median salary by remote status
    if 'remote_status' in df.columns:
        remote_summary = (
            df.groupby('remote_status')
            .agg(
                median_salary=('salary_usd', 'median'),
                median_years=('years_code', 'median'),
                respondents=('salary_usd', 'count')
            )
        )
        out_csv = fig_dir / "remote_salary_summary.csv"
        remote_summary.to_csv(out_csv)
        generated_files['remote_salary_summary'] = out_csv

    # Salary distribution by work setup (box plot)
    if 'RemoteWork' in df.columns:
        # Derive the cleaned WorkSetup label on a separate frame and keep only
        # rows that have both a salary and a recognised work setup.
        work_df = (
            df.assign(WorkSetup=df['RemoteWork'].apply(clean_remote_work))
            .dropna(subset=['salary_usd', 'WorkSetup'])
        )

        if not work_df.empty:
            # Remove top 1% salary outliers for clarity
            upper = work_df['salary_usd'].quantile(0.99)
            work_df = work_df[work_df['salary_usd'] < upper]

        if not work_df.empty:
            work_order = ['In-person', 'Hybrid', 'Remote']
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.boxplot(
                data=work_df,
                x='WorkSetup',
                y='salary_usd',
                # seaborn deprecates `palette` without `hue`; mapping hue to
                # the x variable (same order, no legend) keeps the colours.
                hue='WorkSetup',
                order=work_order,
                hue_order=work_order,
                palette='pastel',
                legend=False,
                ax=ax,
            )
            ax.set_title('Compensation Distribution by Work Setup', fontsize=12)
            ax.set_xlabel('Work Setup')
            ax.set_ylabel('Annual Compensation (USD)')
            fig.tight_layout()

            out_path_remote_box = fig_dir / "salary_by_worksetup.png"
            fig.savefig(out_path_remote_box, dpi=200)
            plt.close(fig)
            generated_files['salary_by_worksetup'] = out_path_remote_box

    return generated_files


In [29]:
# %% Run the full analysis
# Execute the pipeline with the default paths, then list every artefact written.
generated_files = run_analysis()
for key in generated_files:
    value = generated_files[key]
    print(f"Generated {key}: {value}")


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(


Generated top_languages_counts: figures/top_languages_counts.png
Generated median_salary_countries: figures/median_salary_countries.png
Generated experience_vs_salary: figures/experience_vs_salary.png
Generated median_salary_languages: figures/median_salary_languages.png
Generated remote_salary_summary: figures/remote_salary_summary.csv
Generated salary_by_worksetup: figures/salary_by_worksetup.png
