In [1]:
# Stack Overflow Developer Survey Analysis
# Author: Ansam Nawar
# Udacity Data Science Blog Post Project


# %% Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
# %% Configuration
# Input: the public survey export is expected under ./data.
DATA_DIR = Path("data")
CSV_PATH = DATA_DIR.joinpath("survey_results_public.csv")

# Output: generated figures and summaries are written under ./figures.
# The directory is created here (parents included) and an already
# existing directory is accepted.
FIG_DIR = Path("figures")
FIG_DIR.mkdir(exist_ok=True, parents=True)

In [11]:


# %% Helper Functions
def to_float_years(x):
    """Convert a survey "years" answer into a float.

    Missing values map to NaN. String answers containing 'Less' map to 0.5
    and string answers containing 'More' map to 51.0; every other value is
    passed through ``float()``. Any exception raised along the way results
    in NaN rather than propagating.
    """
    try:
        if pd.isna(x):
            return np.nan
        if isinstance(x, str):
            # Open-ended text buckets get a fixed numeric stand-in;
            # 'Less' is checked before 'More'.
            for marker, stand_in in (('Less', 0.5), ('More', 51.0)):
                if marker in x:
                    return stand_in
        return float(x)
    except Exception:
        return np.nan

def minimal_barh(series, title, xlabel, out_path, figsize=(7,5)):
    """Draw ``series`` as a horizontal bar chart and save it to ``out_path``.

    The chart gets the given title (font size 12) and x-axis label, has its
    grid switched off, is written at 200 dpi, and the figure is closed
    afterwards. Returns ``out_path`` unchanged.
    """
    figure, axes = plt.subplots(figsize=figsize)
    series.plot.barh(ax=axes)
    axes.set_title(title, fontsize=12)
    axes.set_xlabel(xlabel)
    axes.grid(False)
    figure.tight_layout()
    figure.savefig(out_path, dpi=200)
    # The figure is only needed on disk, so release it once it is saved.
    plt.close(figure)
    return out_path

def minimal_scatter(x, y, title, xlabel, ylabel, out_path, sample=5000, figsize=(7,5)):
    """Save a scatter plot of ``y`` against ``x`` with a log-scaled y axis.

    Only rows where both ``x`` and ``y`` are non-null are used. When more
    than ``sample`` such rows exist, ``sample`` of them are drawn with a
    fixed random state (42). The figure is written to ``out_path`` at
    200 dpi and then closed.

    Returns ``out_path``, or ``None`` (without plotting) when no row has
    both values present.
    """
    both_present = x.notnull() & y.notnull()
    if not both_present.any():
        return None

    points = pd.concat([x, y], axis=1).loc[both_present]
    if len(points) > sample:
        points = points.sample(sample, random_state=42)
    # After the concat, column 0 holds x and column 1 holds y.
    xs, ys = points.iloc[:, 0], points.iloc[:, 1]

    figure, axes = plt.subplots(figsize=figsize)
    axes.scatter(xs, ys, alpha=0.18, s=8)
    axes.set_title(title, fontsize=12)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_yscale('log')
    axes.grid(False)
    figure.tight_layout()
    figure.savefig(out_path, dpi=200)
    plt.close(figure)
    return out_path


In [5]:
# %% Load and Prepare Data
def load_survey(csv_path=CSV_PATH):
    """Read the survey CSV into a DataFrame.

    Parameters
    ----------
    csv_path : str or os.PathLike, optional
        Location of the survey CSV. Defaults to ``CSV_PATH``.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv``.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``csv_path``.
    """
    # Coerce first so plain-string paths are accepted; calling .exists()
    # directly on a str raised AttributeError.
    csv_path = Path(csv_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"Survey CSV not found at {csv_path}. Download from https://survey.stackoverflow.co/2024 and place it there.")
    # low_memory=False makes pandas parse the file in one piece rather than
    # in chunks, which avoids mixed-type inference within a column.
    return pd.read_csv(csv_path, low_memory=False)

In [6]:
def standardize(df):
    """Add normalized analysis columns to the survey frame.

    ``df`` is modified in place and is also returned. Columns added:

    * ``salary_usd`` - numeric ``ConvertedCompYearly`` when that column
      exists, otherwise numeric ``CompTotal``, otherwise NaN. Values that
      cannot be parsed as numbers become NaN.
    * ``years_code`` - ``YearsCode`` passed through ``to_float_years``, or
      NaN when the column is absent.
    * ``is_remote`` - True where ``RemoteWork`` contains "remote"
      (case-insensitive); False for missing answers or a missing column.
    * ``languages`` - list of names split from the ';'-separated
      ``LanguageHaveWorkedWith`` value, with surrounding whitespace and
      empty entries removed (empty list when the answer is missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw survey responses.

    Returns
    -------
    tuple
        ``(df, langs_exp)``. ``langs_exp`` is ``df`` exploded to one row per
        entry of ``languages``, or an empty DataFrame when
        ``LanguageHaveWorkedWith`` is absent.
    """
    if 'ConvertedCompYearly' in df.columns:
        df['salary_usd'] = pd.to_numeric(df['ConvertedCompYearly'], errors='coerce')
    elif 'CompTotal' in df.columns:
        # NOTE(review): presumably CompTotal is not currency-converted, so
        # labelling it "usd" may be misleading - confirm against the schema.
        df['salary_usd'] = pd.to_numeric(df['CompTotal'], errors='coerce')
    else:
        df['salary_usd'] = np.nan

    if 'YearsCode' in df.columns:
        df['years_code'] = df['YearsCode'].apply(to_float_years)
    else:
        df['years_code'] = np.nan

    if 'RemoteWork' in df.columns:
        # NOTE(review): a substring match flags every answer that mentions
        # "remote" (which would include a hybrid-style option) and treats
        # missing answers as not remote - confirm this is the intent.
        df['is_remote'] = df['RemoteWork'].fillna('').str.contains('remote', case=False, na=False)
    else:
        df['is_remote'] = False

    if 'LanguageHaveWorkedWith' in df.columns:
        df['languages'] = df['LanguageHaveWorkedWith'].fillna('').apply(
            lambda answer: [name.strip() for name in answer.split(';') if name.strip()]
        )
        # Rows whose list is empty survive the explode with NaN in 'languages'.
        langs_exp = df.explode('languages')
    else:
        # One fresh empty list per row. Assigning `[[]]` (length 1) raised
        # ValueError for every frame that did not have exactly one row.
        df['languages'] = pd.Series([[] for _ in range(len(df))], index=df.index, dtype=object)
        langs_exp = pd.DataFrame()

    return df, langs_exp

In [9]:

# %% Analysis
def run_analysis(csv_path=CSV_PATH, fig_dir=FIG_DIR):
    """Run every analysis step and write its outputs under ``fig_dir``.

    Steps, each skipped when its input data is unavailable:

    1. Bar chart of the 12 most frequently listed languages.
    2. Bar chart of median salary for the top 15 countries, restricted to
       countries with at least 100 salary-reporting respondents.
    3. Scatter plot of years of coding experience against salary.
    4. Bar chart of the 12 languages with the highest median salary,
       restricted to languages with at least 100 salary-reporting rows.
    5. CSV summary of median salary, median years and salary count grouped
       by ``is_remote``.

    Parameters
    ----------
    csv_path : str or os.PathLike, optional
        Survey CSV to load. Defaults to ``CSV_PATH``.
    fig_dir : str or os.PathLike, optional
        Output directory; created if it does not exist. Defaults to
        ``FIG_DIR``.

    Returns
    -------
    dict
        Maps an output label to the path that was written. Always a dict,
        possibly with fewer entries when steps were skipped.

    Raises
    ------
    FileNotFoundError
        Propagated from ``load_survey`` when the CSV is missing.
    """
    # Accept plain strings and make sure a caller-supplied directory exists
    # before anything is saved into it.
    fig_dir = Path(fig_dir)
    fig_dir.mkdir(parents=True, exist_ok=True)

    df = load_survey(Path(csv_path))
    df, langs_exp = standardize(df)
    generated = {}

    # Top languages by respondent count
    if not langs_exp.empty:
        lang_counts = langs_exp['languages'].value_counts().head(12)
        # Guard: value_counts() is empty when no respondent listed a language.
        if not lang_counts.empty:
            out1 = fig_dir / "top_languages_counts.png"
            minimal_barh(lang_counts.sort_values(), "Top languages by respondent count", "Number of respondents", out1)
            generated['top_languages_counts'] = out1

    # Median salary by country
    if 'Country' in df.columns:
        salaried = df.dropna(subset=['Country', 'salary_usd'])
        counts = salaried['Country'].value_counts()
        valid = counts[counts >= 100].index
        country_median = (
            salaried[salaried['Country'].isin(valid)]
            .groupby('Country')['salary_usd']
            .median()
            .sort_values(ascending=False)
        )
        if not country_median.empty:
            out2 = fig_dir / "median_salary_countries.png"
            # Ascending order puts the largest bar at the top of a barh chart.
            minimal_barh(country_median.head(15).sort_values(), "Median salary by country (top 15, >=100 respondents)", "Median salary (USD)", out2)
            generated['median_salary_countries'] = out2

    # Experience vs Salary scatter
    out3 = minimal_scatter(df['years_code'], df['salary_usd'], "Years experience vs Salary (log scale)", "Years of experience", "Salary (USD)", fig_dir / "experience_vs_salary.png")
    if out3:
        generated['experience_vs_salary'] = out3

    # Median salary by language
    if not langs_exp.empty:
        # Median and sample size come from the same salary-bearing rows, so
        # the >=100 threshold reflects how many salaries back each median
        # (this mirrors the country section above).
        lang_df = (
            langs_exp.dropna(subset=['languages', 'salary_usd'])
            .groupby('languages')
            .agg(median_salary=('salary_usd', 'median'), count=('salary_usd', 'count'))
        )
        lang_df = lang_df[lang_df['count'] >= 100].sort_values('median_salary', ascending=False)
        if not lang_df.empty:
            out4 = fig_dir / "median_salary_languages.png"
            # Plot only the median column; passing the whole frame also drew
            # the 'count' column as a second set of bars.
            minimal_barh(lang_df['median_salary'].head(12).sort_values(), "Top languages by median salary (>=100 respondents)", "Median salary (USD)", out4)
            generated['median_salary_languages'] = out4

    # Remote vs Non-remote summary (independent of the language data)
    if 'is_remote' in df.columns:
        remote_stats = df.groupby('is_remote').agg(median_salary=('salary_usd','median'), median_years=('years_code','median'), respondents=('salary_usd','count'))
        out_csv = fig_dir / "remote_stats_summary.csv"
        remote_stats.to_csv(out_csv)
        generated['remote_stats_summary'] = out_csv

    # Returned at function level so callers always receive a dict, even when
    # no language data was available.
    return generated

In [12]:
# %% Run the full analysis
# Execute the pipeline with its default paths, then report every output
# label together with the path recorded for it.
generated_files = run_analysis()
for key in generated_files:
    value = generated_files[key]
    print(f"Generated {key}: {value}")

Generated top_languages_counts: figures/top_languages_counts.png
Generated median_salary_countries: figures/median_salary_countries.png
Generated experience_vs_salary: figures/experience_vs_salary.png
Generated median_salary_languages: figures/median_salary_languages.png
Generated remote_stats_summary: figures/remote_stats_summary.csv
