In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 

# Nobel Prize analysis cell.
#
# Loads the laureate table and derives, in order: the most common sex and
# birth country, the decade with the highest share of US-born winners, the
# decade/category pair with the highest share of female winners, the first
# female laureate, and every name that appears more than once. Each answer
# is printed, and two charts are drawn along the way.
#
# Columns read from the CSV: 'sex', 'birth_country', 'year', 'category',
# 'full_name'. Columns added to df: 'US-born winner', 'decade',
# 'female_winner'.
DATA_PATH = 'data/nobel.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Report the path that was actually attempted so the message is actionable.
    print(f"Error: The file '{DATA_PATH}' was not found. Please check the file path and try again.")
else:
    # value_counts() sorts by descending frequency, so the first index label
    # is the most common value.
    top_gender = df['sex'].value_counts().index[0]
    top_country = df['birth_country'].value_counts().index[0]

    # Boolean flag: True where the laureate was born in the USA.
    df['US-born winner'] = df['birth_country'] == 'United States of America'

    # Decade the prize was awarded in, e.g. 1903 -> 1900 (integer floor division).
    df['decade'] = (df['year'] // 10 * 10).astype(int)

    # The mean of a boolean column is the proportion of True values, i.e. the
    # share of US-born winners in each decade.
    usa_per_decade = df.groupby('decade')['US-born winner'].mean().reset_index()

    # Decade with the highest US-born share (idxmax picks the first on ties).
    max_decade_usa = usa_per_decade.loc[usa_per_decade['US-born winner'].idxmax(), 'decade']

    print(f"Top gender: {top_gender}")
    print(f"Top birth country: {top_country}")
    print(f"Decade with highest proportion of US-born winners: {max_decade_usa}")

    # Chart 1: share of US-born winners per decade.
    sns.lineplot(x='decade', y='US-born winner', data=usa_per_decade)
    plt.title("Proportion of US-born Nobel Prize Winners by Decade")
    plt.xlabel("Decade")
    plt.ylabel("Proportion US-born Winners")
    plt.show()

    # Boolean flag: True where the laureate's sex is recorded as 'Female'.
    df['female_winner'] = df['sex'] == 'Female'

    # Number of prizes per (decade, category).
    total_by_decade_category = df.groupby(['decade', 'category']).size().reset_index(name='total_count')

    # Number of female winners per (decade, category). Pairs without any
    # female winner produce no row here.
    female_by_decade_category = (
        df[df['female_winner']]
        .groupby(['decade', 'category'])
        .size()
        .reset_index(name='female_count')
    )

    # Left-merge onto the totals so every (decade, category) pair is kept;
    # pairs with no female winner come back as NaN and are set to 0.
    merged = pd.merge(total_by_decade_category, female_by_decade_category, on=['decade', 'category'], how='left')
    merged['female_count'] = merged['female_count'].fillna(0)

    # Share of female winners within each (decade, category) pair.
    merged['female_ratio'] = merged['female_count'] / merged['total_count']

    # Pair with the highest female share, stored as {decade: category}.
    top_row = merged.loc[merged['female_ratio'].idxmax()]
    max_female_dict = {int(top_row['decade']): top_row['category']}

    print(f"Decade and category with highest female winners: {max_female_dict}")

    # Chart 2: female winner counts per decade, one line per category.
    # NOTE(review): the plotted frame has no rows for decade/category pairs
    # with zero female winners, so a line joins the neighbouring decades
    # instead of dropping to zero - confirm this is the intended reading.
    # relplot returns a FacetGrid; keep a handle so the title, labels and
    # layout are applied through the grid itself.
    female_grid = sns.relplot(
        data=female_by_decade_category,
        x='decade',
        y='female_count',
        hue='category',
        kind='line',
        marker='o',
        aspect=2,
    )
    female_grid.set(title="Number of Female Nobel Prize Winners by Category and Decade")
    female_grid.set_axis_labels("Decade", "Number of Female Winners")
    # Use the grid's tight_layout rather than plt.tight_layout(): the grid
    # version keeps the strip reserved for the outside legend, so the axes
    # are not stretched underneath it.
    female_grid.tight_layout()
    plt.show()

    # First woman to win: earliest award year among female winners; if
    # several rows share that year, the first one in file order is used.
    female_winners = df[df['female_winner']]
    earliest_year = female_winners['year'].min()
    first_woman_row = female_winners[female_winners['year'] == earliest_year].iloc[0]

    first_woman_name = first_woman_row['full_name']
    first_woman_category = first_woman_row['category']

    print(f"First woman Nobel laureate: {first_woman_name} in category {first_woman_category}")

    # Repeat winners are identified purely by identical 'full_name' strings.
    win_counts = df['full_name'].value_counts()

    # Keep only names that appear two or more times.
    repeat_winners = win_counts[win_counts >= 2]

    # Their names as a plain list.
    repeat_list = repeat_winners.index.tolist()

    print("Repeat winners:")
    print(repeat_list)