# Pandas Documentation Guide

In [184]:
import pandas as pd
import numpy as np

## Data Loading & Exploration

- pd.DataFrame(dict) - creates df from dict.
- pd.read_csv(url, parse_dates=[['Year', 'Month', 'Day']]) - read csv and parse datetime columns
- df.info() - data types and missing values

- df.describe() - statistical summary
- df.shape - (rows, columns)
- df.columns.tolist() - column names

In [127]:
dict_data = {
    'name': ['Joe', 'Blake', 'Kyle'],
    'age': [20, 22, 19],
    'major': ['CPSC', 'MATH', 'ECON'],
    'gpa': ['2.8', '0', '4.0']
}

random_dict_df = pd.DataFrame(dict_data)
random_dict_df
print(random_dict_df.info())
print(random_dict_df.describe())
print(random_dict_df.shape)
print(random_dict_df.columns.tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      object
 1   age     3 non-null      int64 
 2   major   3 non-null      object
 3   gpa     3 non-null      object
dtypes: int64(1), object(3)
memory usage: 228.0+ bytes
None
             age
count   3.000000
mean   20.333333
std     1.527525
min    19.000000
25%    19.500000
50%    20.000000
75%    21.000000
max    22.000000
(3, 4)
['name', 'age', 'major', 'gpa']


In [128]:
url = 'https://raw.githubusercontent.com/kemiolamudzengi/dsci-320-datasets/main/world-data-gapminder.csv'
gapminder = pd.read_csv(url, parse_dates=['year'])
print(gapminder.shape)

(38982, 14)


## Data Selection & Filtering

- df['column'] - single column (Series)
- df[['col1', 'col2']] - multiple columns (DataFrame)
- df[df['col'] > value] - boolean filtering
  
- df[(condition1) & (condition2)] - multiple conditions
- df['col'].isnull() / .notna() - missing value checks

In [19]:
countries = gapminder['country']
subset = gapminder[['country', 'year', 'life_expectancy']]

combined_filter = gapminder[
    (gapminder['year'] == '1982') & 
    (gapminder['life_expectancy'] > 75)
]

#filter for new dataset with region == Europe with > 80 life exp.
europe_or_high_life = gapminder[
    (gapminder['region'] == 'Europe') | 
    (gapminder['life_expectancy'] > 80)
]

#check which columns have missing data, true = 1 false = 0
missing_data = gapminder.isnull().sum()
print("Missing values per column:", missing_data)

#filter dataset to include NON N/A rows of co2_per_cap
co2_data = gapminder[gapminder['co2_per_capita'].notna()]

#print all unique years
years = co2_data['year'].unique() 

Missing values per column: country                      0
year                         0
population                   0
region                       0
sub_region                   0
income_group                 0
life_expectancy              0
income                       0
children_per_woman           0
child_mortality              2
pop_density              26700
co2_per_capita           22697
years_in_school_men      30794
years_in_school_women    30794
dtype: int64


## Data Manipulation

- df.copy() - Creates an independent copy to avoid warnings
- df['new_column'] = calculation - Creates new column using existing columns (vectorized, happens to all rows at once)
- np.where(condition, value_if_true, value_if_false) - creates new column based on conditions such as if col < 60 then set 'x' else set 'y'
  
- df['col'].value_counts() - shows how many rows are in the col
- df.sort_values(['col1', 'col2'], ascending = [True, False] - sorts data first by col1, then by col2 in ascending then desending order.

In [23]:
#Creating new column
co2_data = co2_data.copy()
co2_data['total_co2'] = co2_data['population'] * co2_data['co2_per_capita']

#if life exp < 20 -> set category to 'very low' 
#ELSE IF life exp <= 50 -> set catory to 'medium' -> ELSE -> set category to 'very high'
co2_data['life_exp_category'] = np.where(
    co2_data['life_expectancy'] < 20, 'Very Low',
    np.where(co2_data['life_expectancy'] <= 50, 'Medium', 'Very High')
)

print(co2_data['life_exp_category'].value_counts())

#sort by region with lowest life expectancy
multi_sort_df = gapminder.sort_values(['region', 'life_expectancy'], ascending = [True, True])
print(multi_sort_df[['region', 'country', 'life_expectancy']])

life_exp_category
Very High    10070
Medium        6157
Very Low        58
Name: count, dtype: int64
        region    country  life_expectancy
35497   Africa    Tunisia             1.50
22405   Africa  Mauritius             4.00
11478   Africa   Ethiopia             4.01
11477   Africa   Ethiopia             5.02
24194   Africa    Namibia             5.19
...        ...        ...              ...
1749   Oceania  Australia            82.50
1747   Oceania  Australia            82.60
1748   Oceania  Australia            82.60
1750   Oceania  Australia            82.70
1751   Oceania  Australia            82.90

[38982 rows x 3 columns]


## Grouping & Aggregation

- df['year'].dt.year - extracts year number from datetime column

- df[df['col'].isin(['x','y','z'])] - filters col1 for values == the list

- df['col'].max() - finds max value of column

- df.nlargest(2, 'col') - returns df of the top 2 largest values of col
- df.nsmallest(1, 'col') - returns df of the lowest value of col
- df.groupby('col')['col2'].mean() - divides data into groups based on unique values of col and selects col2 for each group. Then calculates the mean for each group.
- df.groupby('col')['col2'].agg(['count', 'mean']) - divides data into groups based on values of col then finds the count of 'col1' and mean of 'col2'
- df.groupby('col').agg({'col1': 'mean', 'col2': 'sum'})) - applies different functions to different columns

- df[['col1', 'col2']].corr() - finds correlation between 2 columns
- df['col1'].value_counts() - unique value counts for col1
- df['col1'].rolling(int).rank(method = "average") - calculates rolling averages for column

- df1.merge(df2, on='col', suffixes=('_d1', '_d2') - merges 2 datasets on col and includes suffixes for each column such as "age_d1", "age_d2"

In [27]:
recent_year = co2_data['year'].dt.year.max()
print(recent_year)

#filter data from specific years
year_list = ['1952', '2004', '1977']
time_series_data = gapminder[gapminder['year'].isin(year_list)]

top2_co2 = co2_data.nlargest(2, 'co2_per_capita')
lowest_co2 = co2_data.nsmallest(1, 'co2_per_capita')

correlation = gapminder[['life_expectancy', 'income']].corr()
unique_regions = gapminder['region'].value_counts()

rolling_avg = gapminder['income'].rolling(2).rank(method = "average")

2014


  time_series_data = gapminder[gapminder['year'].isin(year_list)]


## Common Patterns

# Chaning operations together:
df = (gapminder.query('year == "2000"') #filter by year
      [['country', 'region', 'life_expectancy']].sort_values('life_expectancy', ascending=True)) #pull columns and sort by lowest life exp.
df

In [36]:
# Question: Which 2 countries had the fastest improvement in life expectancy between 1960 and 2004?

df_1960 = gapminder[gapminder['year'] == '1960']

df_2004 = gapminder[gapminder['year'] == '2004']

merged_df = df_1960.merge(
    df_2004,
    on='country',
    suffixes=('_1960', '_2004'))

merged_df['improvement_diff'] = (merged_df['life_expectancy_2004'] - merged_df['life_expectancy_1960'])

best_two = merged_df.nlargest(2, 'improvement_diff')

best_two

Unnamed: 0,country,year_1960,population_1960,region_1960,sub_region_1960,income_group_1960,life_expectancy_1960,income_1960,children_per_woman_1960,child_mortality_1960,...,income_group_2004,life_expectancy_2004,income_2004,children_per_woman_2004,child_mortality_2004,pop_density_2004,co2_per_capita_2004,years_in_school_men_2004,years_in_school_women_2004,improvement_diff
98,Maldives,1960-01-01,89900,Asia,Southern Asia,Upper middle,32.2,1090,7.02,340.0,...,Upper middle,73.8,8920,2.42,28.6,1030.0,2.15,6.54,5.97,41.6
32,China,1960-01-01,658000000,Asia,Eastern Asia,Upper middle,30.9,891,3.99,309.0,...,Upper middle,71.9,5170,1.55,26.3,140.0,3.98,9.55,8.03,41.0


## Self-Check Questions

1. **What's the difference between selecting columns with `df['col']` vs `df[['col']]`?**
    - df['col'] returns a pandas series, which is a 1d array.
    - df[['col']] returns a pandas dataframe.

2. **How do you combine multiple conditions in boolean indexing?**
    - df_new = df[(df['col1'] == "123") & (df['col2'] > 5)]
   - Remember: always use parentheses around each condition!


3. **What's the purpose of using `.copy()` when creating filtered datasets?**
    - We use .copy() to avoid warnings, it will change the main dataset if you filter on it. If you want to preserve the main dataset be sure to copy it first.

   
4. **How do you find the top N values in a column?**
   - df.nlargest(n, 'column')

5. **What's the basic pattern for groupby operations?**
    - df.groupby('col1')['col2'].agg(['count', 'mean'])
    - df.groupby('col1').agg({dict})

### **Applied Skills (Quiz-level questions)**

4. **Complex Filtering Challenge:**
   Write code to find countries that meet ALL these conditions:
   - Life expectancy greater than 70
   - Population greater than 5 million
   - From either Europe OR North America
   

In [43]:
region_filter = gapminder[
    (gapminder['region'] == 'Europe') |
    (gapminder['region'] == 'North America')]

life_and_pop_filter = region_filter[
    (region_filter['life_expectancy'] > 70) &
     (region_filter['population'] > 5000000)]

life_and_pop_filter['country'].unique()

array(['Austria', 'Belarus', 'Belgium', 'Bulgaria', 'Czech Republic',
       'Denmark', 'Finland', 'France', 'Germany', 'Greece', 'Hungary',
       'Italy', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania',
       'Russia', 'Serbia', 'Slovak Republic', 'Spain', 'Sweden',
       'Switzerland', 'Ukraine', 'United Kingdom'], dtype=object)

5. **Groupby Analysis Challenge:**
   For each region, calculate:
   - The count of countries
   - The average life expectancy
   - The country with the highest income (hint: use `.idxmax()`)

In [100]:
df = gapminder.groupby('region').agg({
    'country': 'count',
    'life_expectancy': 'mean',
    'income': 'idxmax'
})

highest_income = gapminder.groupby('country')['income'].idxmax()
highest_income

country
Afghanistan              218
Albania                  437
Algeria                  654
Angola                   871
Antigua and Barbuda     1083
                       ...  
Venezuela              38064
Vietnam                38324
Yemen                  38535
Zambia                 38712
Zimbabwe               38935
Name: income, Length: 178, dtype: int64

6. **Missing Data Strategy Challenge:**
   You have a dataset where:
   - 'essential_column' has 5% missing values
   - 'optional_column' has 40% missing values
   - 'analysis_column' has 15% missing values
   
   What strategy would you use for each column and why? Practice implementing your strategy in code.

- fill the missing values with averages of the essential column, or remove the rows with missing values entirely.
- might be worth excluding this column, as it is optional.
- either exclude the 15% from the analysis, or fill the missing values with averages.

In [101]:
# suppose life_expectancy is the essential column:

# life_exp_mean = gapminder['life_expectancy'].mean()
# gapminder['life_expectancy'].fillna(life_exp_mean, inplace = True)

# OR 

# gapminder['life_expectancy'].dropna(subset=['life_expectancy'])


# suppose region is the optional column

# gapminder.drop('region', axis = 1)

## Claude Practice Questions

In [52]:
#Find all countries where population is less than 20,000
population = gapminder[gapminder["population"] < 20000]
population["country"].unique()

array(['Kiribati', 'Qatar', 'Seychelles', 'Tonga'], dtype=object)

In [55]:
#Find European countries OR countries with life expectancy < 50
df = gapminder[(gapminder["country"] == "Europe") |
                (gapminder["life_expectancy"] < 50)]
df["country"].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus',
       'Czech Republic', 'Denmark', 'Djibouti', 'Dominican Republic',
       'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea',
       'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
       'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada',
       'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti',
       'Honduras', 'Hungary', 'Iceland', 'India', 'Indo

In [60]:
#Filter for countries in Asia with income > 10000

asia = gapminder[(gapminder["region"] == "Asia") &
                (gapminder["income"] > 10000)]
asia["country"].unique()

array(['Azerbaijan', 'Bahrain', 'China', 'Cyprus', 'Georgia', 'Indonesia',
       'Iran', 'Iraq', 'Israel', 'Japan', 'Kazakhstan', 'Kuwait',
       'Lebanon', 'Malaysia', 'Maldives', 'Mongolia', 'Oman', 'Qatar',
       'Saudi Arabia', 'Singapore', 'South Korea', 'Sri Lanka',
       'Thailand', 'Turkey', 'Turkmenistan', 'United Arab Emirates'],
      dtype=object)

Create a categorical column 'income_level' where income < 5000 = 'Low', 5000-15000 = 'Medium', >15000 = 'High'

In [63]:
gapminder = gapminder.copy()

gapminder["income_level"] = np.where(gapminder['income'] <= 5000, 'low',
                                     np.where(gapminder['income'] < 15000, 'medium', 'high'))
gapminder["income_level"].unique()

array(['low', 'medium', 'high'], dtype=object)

For each region, find the total population and average income

In [65]:
gapminder.groupby('region').agg({
    'population': 'sum',
    'income': 'mean'
})

Unnamed: 0_level_0,population,income
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,59192998600,1774.840095
Americas,63837885500,4581.39402
Asia,330133218800,4818.689012
Europe,98766930400,7975.743941
Oceania,2422277600,3775.721461


In [82]:
# Find the 5 countries with the biggest improvement in child mortality from 1960 to 2010

data_2010 = gapminder[gapminder["year"].dt.year == 2010]
data_1960 = gapminder[gapminder["year"].dt.year == 1960]

merged = data_1960.merge(data_2010, on = 'country', suffixes=('_1960', '_2010'))

merged["improvement"] = merged["child_mortality_1960"] - merged["child_mortality_2010"]

top5 = merged.nlargest(5, 'improvement')

top5["country"].unique()

array(['Oman', 'Yemen', 'Maldives', 'Mali', 'Iran'], dtype=object)

In [70]:
# Show countries where 'co2_per_capita' is NOT missing

co2 = gapminder[gapminder['co2_per_capita'].notna()]
co2["country"].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus',
       'Czech Republic', 'Denmark', 'Djibouti', 'Dominican Republic',
       'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea',
       'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
       'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada',
       'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti',
       'Honduras', 'Hungary', 'Iceland', 'India', 'Indo

# -------------------------------------------------------------------------------------------

# Altair Documentation Guide

In [129]:
import pandas as pd
import altair as alt
#from vega_datasets import data


#filepath = "data/world-data-gapminder.csv"
filepath = 'https://raw.githubusercontent.com/kemiolamudzengi/dsci-320-datasets/main/world-data-gapminder.csv'

# Read in the data using pandas, remember to set parse_dates!
gm = pd.read_csv(filepath, parse_dates=["year"])

# Display basic information about the dataset
print(f"Dataset shape: {gm.shape}")
print(f"Years covered: {gm.year.dt.year.min()} to {gm.year.dt.year.max()}")
print(f"Number of countries: {gm.country.nunique()}")
print(f"Regions included: {', '.join(sorted(gm.region.unique()))}")
print(f"Column names: {sorted(gm.columns)}")



# Create a subset for 2000 data for clearer examples
data2000 = gm[gm.year == '2000']   # notice how we have to treat it as a string to search, if you change this to an integer it breaks
print(f"Year 2000 subset: {data2000.shape}")

Dataset shape: (38982, 14)
Years covered: 1800 to 2018
Number of countries: 178
Regions included: Africa, Americas, Asia, Europe, Oceania
Column names: ['child_mortality', 'children_per_woman', 'co2_per_capita', 'country', 'income', 'income_group', 'life_expectancy', 'pop_density', 'population', 'region', 'sub_region', 'year', 'years_in_school_men', 'years_in_school_women']
Year 2000 subset: (178, 14)


## Key Concepts

- alt.Chart([DATA]).mark_[TYPE]().encode([MAPPINGS])

### Data Type Suffixes
- `:N` - Nominal (categories, no order)
- `:O` - Ordinal (categories with order)  
- `:Q` - Quantitative (numbers, measurements)
- `:T` - Temporal (dates and times)

### Common Mark Types
- `.mark_point()` - Scatter plot points
- `.mark_circle()` - Filled circles. Best for when you want to emphasize individual data points
- `.mark_square()` - Filled squares
- `.mark_bar()` - Bar charts. Useful when showing distribution of categorical data
- `.mark_tick()` - Distribution marks. Useful when showing distribution of a single variable.

### Essential Encoding Channels
- `x`, `y` - Position (most important)
- `color` - Categories or intensity
- `size` - Magnitude (quantitative only)  
- `shape` - Categories only
- `opacity` - Transparency
- `column` - Separate panels for specific variable (i.e. different plots for region)
- `alt.Tooltip('variable').title().format('.1f')` - Additional information (hover mouse to see more info)
- `alt.X/Y/Color('variable').sort('x,y,-x,-y`) - Additional x,y edit tools
- `alt.X/Y('variable').stack('normalize')` - Stacked Bar chart for percentages
- `alt.Color('variable', scale = alt.Scale(scheme='category10')` - For using a specific color scheme
- `.encode().properties(width = ..., height = ...)` - Change width / height of graph
- `mark_line(point = alt.OverlayMarkDef(size = 10, filled = True, strokeWidth = 1)` - Creates filled dots for each point, of size 10

---

**How to tell if two variables are correlated by looking at a scatter plot**

**Positive Correlation**
- **Upward trend**: As X increases, Y tends to increase.
- **Tight clustering**: Points follow a clear upward line or curve.
- **Few outliers**: Most points conform to the pattern.

**Negative Correlation**
- **Downward trend**: As X increases, Y tends to decrease.
- **Tight clustering**: Points follow a clear downward line or curve.
- **Few outliers**: Most points fit the downward trend.

**No Correlation**
- **No clear trend**: Points are scattered randomly.
- **Wide spread**: No discernible line or curve.
- **Many outliers**: No obvious relationship between X and Y.
![Correlation](https://articles.outlier.org/_next/image?url=https%3A%2F%2Fimages.ctfassets.net%2Fkj4bmrik9d6o%2F2oArz66jpUDD00bOYo58e9%2F90ee20b033c2695c6884c5c652f75b81%2FOutlier_Graph_NegativeCorrelation-02.png&w=1080&q=75)

In [83]:
#corr_value = recent_data['life_expectancy'].corr(recent_data['co2_per_capita'])
#print(f"Correlation between life expectancy and carbon emissions: {corr_value:.2f}")

## Example Plots

In [4]:
# Using only the x-channel to show distribution
x_chart = alt.Chart(data2000).mark_tick().encode(
    x='children_per_woman:Q'  # Only x-position is encoded
)

# Using only the y-channel 
y_chart = alt.Chart(data2000).mark_tick().encode(
    y='life_expectancy:Q'  # Only y-position is encoded
)

x_chart | y_chart

In [5]:
nominal_color = alt.Chart(data2000).mark_point().encode(
    x='children_per_woman:Q',
    y='life_expectancy:Q',
    color='region:N'  # Color represents region membership
)
# Color with quantitative data
quantitative_color = alt.Chart(data2000).mark_point().encode(
    x='children_per_woman:Q',
    y='life_expectancy:Q',
    color='population:Q'  # Color intensity represents population
)

nominal_color | quantitative_color

In [6]:
# Shape encoding for nominal data
shape = alt.Chart(data2000).mark_point().encode(
    x='children_per_woman:Q',
    y='life_expectancy:Q',
    shape='region:N'  # Different shapes for each region
)

# Opacity encoding
opacity = alt.Chart(data2000).mark_point(filled=True).encode(
    x='children_per_woman:Q',
    y='life_expectancy:Q',
    opacity='population:Q'  # Transparency represents population
)

shape | opacity

In [7]:
# Adding tooltips
tooltip = alt.Chart(data2000).mark_point().encode(
    x='children_per_woman:Q',
    y='life_expectancy:Q',
    tooltip=['country', 'population', 'region']  # Show these fields on hover
)

tooltip

In [133]:
if np.issubdtype(gapminder['year'].dtype, np.datetime64):
    gapminder['year'] = gapminder['year'].dt.year
subset = gapminder[gapminder.year.isin([1952,1962,1972,1982,1992,2002,2012])]

grouped_bar = alt.Chart(subset).mark_bar().encode(
    x = 'year:O',
    y = 'sum(population)',
    color = 'region:N',
    column = 'region:N'
).properties(width=90)

grouped_bar

In [102]:
stacked = alt.Chart(subset).mark_bar().encode(
    x = 'year:O',
    y = 'sum(population):Q',
    color = 'region:N'
)

stackpercent = alt.Chart(subset).mark_bar().encode(
    x = 'year:O',
    y = alt.Y('sum(population):Q', stack = 'normalize'),
    color = 'region:N'
)

stacked | stackpercent

In [131]:
alt.Chart(data2000).mark_square().encode(
    x = 'income:Q',
    y = 'life_expectancy:Q',
    color = 'region:N'
)

# Exercises

Exercise 1: Single Channel Encoding Create a chart showing the distribution of life expectancy using only tick marks along the y-axis.


In [8]:
#1
alt.Chart(data2000).mark_tick().encode(
    y = "life_expectancy"
)

Exercise 2: Basic Scatter Plot Create a scatter plot showing the relationship between population (population) and GDP per capita (gdpPercap).

In [18]:
#2
datacopy = data2000.copy()
datacopy["gdpPercap"] = datacopy["income"] / datacopy["population"]

alt.Chart(datacopy).mark_point().encode(
    x = "gdpPercap:Q",
    y = "population:Q",
    color = 'region:N',
)

Exercise 4: Bar Chart Practice Create a horizontal bar chart showing the count of countries by region, sorted from most to least countries.

In [17]:
alt.Chart(data2000).mark_bar().encode(
    y = alt.Y('region:N', sort='-x'),
    x = "count():Q"
)

Exercise 5: Data Type Experiment Create the same 1-dimensional chart using fertility data, but try it with both :Q (quantitative) and :N (nominal) data types. Compare the results.

In [19]:
withq = alt.Chart(data2000).mark_tick().encode(
    x='children_per_woman:Q'
)

withn = alt.Chart(data2000).mark_tick().encode(
    x='children_per_woman:N'
)

withq & withn

Create a scatter plot that reveals the relationship between carbon emissions and life expectancy using the 2014 data.

In the code cell below, write code that:

Create a visualization with the following specs:
- Use the **`circle`** mark
- Encode CO₂ per capita (`co2_per_capita`) on the **y channel**
- Encode life expectancy (`life_expectancy`) on the **x channel**
- Encode continent (`region`) on the **color channel**

**Add tooltips** showing `country`, `co2_per_capita`, and `life_expectancy`

In [135]:
recent_data = gm[gm.year == '2014']


alt.Chart(recent_data).mark_circle().encode(
    x = 'life_expectancy:Q',
    y = 'co2_per_capita:Q',
    color = 'region:N',
    tooltip = ['country', 'co2_per_capita', 'life_expectancy']
)

Create a temporal stacked bar chart showing how regional CO2 emissions have changed over decades.
 
The data wrangling has been provided for you. Create a visualization with the following specs:
- Use the `bar` mark
- Encode year (`year`) on the **x channel** as temporal data
- Encode sum of CO2 per capita (`sum(co2_per_capita)`) on the **y channel**
- Encode region (`region`) on the **color channel** with a better color scheme using `alt.Color('region:N', scale=alt.Scale(scheme='category10'))`
- Encode multiple fields on the **tooltip channel**: `year`, `region`, and `co2_per_capita`
- Set chart width to 600 pixels using `.properties(width=600)`



In [31]:
# Data Wrangling

# Convert year (int) back to datetime64
gm['year'] = pd.to_datetime(gm['year'], format='%Y')

# Filter for countries with CO2 data
co2_data = gm[gm.co2_per_capita.notna()]

# Aggregate by continent and year
co2_by_continent = co2_data.groupby(['year', 'region']).agg({
    'co2_per_capita': 'sum'
}).reset_index()


In [138]:
# Create  stacked chart

alt.Chart(co2_by_continent).mark_bar().encode(
    x = 'year:T',
    y = 'sum(co2_per_capita):Q',
    color = alt.Color('region:N', scale=alt.Scale(scheme='category10')),
    tooltip = ['year', 'region', 'co2_per_capita']
).properties(width=600)

#### Follow on
Update the viz above by normalizing the `y` channel to get a better view of the patterns



In [36]:
alt.Chart(co2_by_continent).mark_bar().encode(
    x = 'year:T',
    y = alt.Y('sum(co2_per_capita):Q', stack = 'normalize'),
    color = alt.Color('region:N', scale=alt.Scale(scheme='category10')),
    tooltip = ['year', 'region', 'co2_per_capita']
).properties(width=600)

## Claude Practice Questions

Create a scatter plot with the following specifications:

x-axis: life_expectancy
y-axis: income
color: region
size: population
tooltips: country, life_expectancy, income

In [108]:
alt.Chart(data2000).mark_point().encode(
    x = 'life_expectancy:Q',
    y = 'income:Q',
    color = 'region:N',
    size = 'population:Q',
    tooltip = ['country', 'life_expectancy', 'income']
)

Create a horizontal bar chart showing the average life expectancy by region, sorted from highest to lowest average life expectancy.

In [114]:
alt.Chart(data2000).mark_bar().encode(
    x = 'mean(life_expectancy):Q',
    y = alt.Y('region:N', sort = '-x'),
    color = 'region'
)

Create a chart showing the relationship between children_per_woman and life_expectancy, with separate panels for each region using the column encoding.

In [116]:
alt.Chart(data2000).mark_point().encode(
    x = 'life_expectancy:Q',
    y = 'children_per_woman:Q',
    column = 'region:N'
)

Create a visualization that combines:

A scatter plot of income vs life_expectancy for 2000 data
Color encoding by region
Size encoding by population
Custom color scheme using 'category10'
Chart dimensions of 400x300 pixels
Tooltips showing all relevant information

In [125]:
alt.Chart(subset).mark_point().encode(
    x = 'life_expectancy:Q',
    y = 'income:Q',
    size = 'population:Q',
    color = alt.Color('region', scale = alt.Scale(scheme='category10')),
    tooltip = ['region', 'income', 'life_expectancy', 'population']
).properties(width = 400, height = 300)


# -----------------------------------------------------------------------------------------

# Quiz 3: Temporal Charts + Theory

## Theory / Terms:
- **Cardinality** - number of unique values in an attribute (nominal/ordinal -> unique categories, quantitative data -> (min->max))
- **Expressiveness** - match the channel type to data characteristcs. The visual encoding should express all of, and only, the information in the dataset attributes.
    - Region being encoded for size (**BAD**, it's not an ordered attribute).
    - Categorical values should not be used for magnitude attributes.
- **Effectiveness** - for a given task, some channels are better than others so it is important to select the most effective channel for the data.
    - Think: can you distinguish between the distinct values?
        - When you want to compare between values, does it fail?
    - Color for population (continous value) so this is **BAD!!!**
- **Discriminability** - How many unique steps can we perceive? How easily can differences between attribute levels be perceived?
    - Color hue, we should use at max 5, using more confuses the perception of the graph.
    - Shapes, we should use at max 5, using more can create confusion.
- **Separability** - our ability to use this channel affected by another. Is our ability to use this channel affected by another one?
    - **Integral dimensions:** two or more attributes are perceived holistically (not independently)
        - Is another channel interfering with the users ability to determine differences?
            - Ellipses and circles: these don't stand on their own, typically grouped with another channel
            - Area and color: area influences your perception of the color, if the area is small you can't see the color
            - Opt for this when aiming for holistic impact.
            - This approach encourages viewers to perceive multiple elements as a single cohesive unit. Ensuring they are seen as one simple entitiy.
    - **Separable dimensions**: people tend to make separate judgements about each dimension (i.e. attribute)
        - Scatter plots are good to use with position and color
            - when we add size, some interference shows.
            - when we add width + height there is significant interference.
         - This is suitable when the objective is for viewers to concentrate on individual elements sequentially. It allows them to distinguish between different channels one by one. 
- **Popout effect**
    - Color is good for popout
    - Parallel processing on many individual channels
        -  Speed independent of distractor count
        -  Speed depends on channel and amoint of difference from distractors
            -  Red square vs red dot takes longer to find than blue/red circles.   
- **Grouping**
    - Can the channel show perceptual grouping of items?
        - Containment -> boxes of items (same group)
        - Similarity -> colors
        - Connection -> lines connecting multiple dots
        - Proximity -> dots that are close together
- **Accuracy**: How precisely can we tell the difference between encoded items?
    - Length is super accurate: linear
    - Factors affecting accuracy:
        - alignment
        - distractors
        - distance
        - common scale
- **Relative vs. Absolute Judgements**
    - Perceptual system mostly operates with relative judgements, not absolute.
        - This is why accuracy increases with common frame/scale and alignment
        - Weber's Law: ratio of increment to background is constant
            - Filled rectangles differ in length by 1:9, difficult judgement
            - White rectangles differ in length by 1:2, easy judgement  

## Coding Practice (from lecture):

In [142]:
import pandas as pd
import altair as alt

# If on PL use this one
filepath = 'data/owid_dataset.csv',

# If running locally on your machine use this one
filepath = 'https://raw.githubusercontent.com/kemiolamudzengi/dsci-320-datasets/main/owid_dataset.csv'

# Load the OWID energy dataset
owid_data = pd.read_csv( filepath, parse_dates=['year'])

print(f"Dataset shape: {owid_data.shape}")
print(f"Years covered: {owid_data['year'].dt.year.min()} to {owid_data['year'].dt.year.max()}")
print(f"Countries: {owid_data['country'].nunique()}")

# List of countries to explore
wind_countries = ['China', 'USA', 'Germany', 'India', 'Brazil', 'UK']
wind_data = owid_data[owid_data.country.isin(wind_countries)].copy()

Dataset shape: (1957, 132)
Years covered: 1964 to 2024
Countries: 33


<div class="alert alert-info" style="color:black; padding: 15px; border-radius: 8px; background-color:#eaf4ff;">
  <h2>VIZ TASK: Multi-line Chart for Wind Electricity</h2>
  <p>< Create a multi-line chart comparing wind electricity generation across countries.</p>

  <p>Using the <code>wind_data</code> dataset, create a visualization with:</p>
  <ul>
    <li>Use <code>mark_line</code> to represent wind generation trends</li>
    <li>Encode <code>year</code> on the <strong>x channel</strong></li>
    <li>Encode <code>wind_electricity</code> on the <strong>y channel</strong></li>
    <li>Encode <code>country</code> on the <strong>color channel</strong></li>
    <li>Include a tooltip that shows <code>Country</code>, <code>Year</code>, and <code>Wind Generation (TWh)</code> formatted to 1 decimal place (<code>.1f</code>)</li>
  </ul>

  <p><strong>Styling Specifications:</strong></p>
  <ul>
    <li><strong>Chart Properties:</strong> Width = 500px, Height = 300px, Title = "Wind Electricity Growth: Technology Adoption Leaders"</li>
    <li><strong>Mark Styling:</strong> <code>strokeWidth=1</code>, Points = size 10, filled = True</li>
    <li><strong>X Channel:</strong> Title = "Year", Format = "%Y"</li>
    <li><strong>Y Channel:</strong> Title = "Wind Electricity Generation (TWh)", Format = ".0f"</li>
    <li><strong>Legend:</strong> Title = "Country"</li>
    <li><strong>Color Scheme:</strong> Use <code>category10</code></li>
  </ul>
</div>

In [159]:
wind_graph = alt.Chart(wind_data).mark_line(point = alt.OverlayMarkDef(size = 10, filled = True, strokeWidth = 1)).encode(
    x = alt.X('year:T').title("Year").axis(format="%Y"),
    y = alt.Y('wind_electricity:Q').title("Wind Electricity Generation (TWh)").axis(format=".0f"),
    color = alt.Color('country:N').title("Country").scale(scheme = "category10"),
    tooltip = [
        alt.Tooltip('country:N'),
        alt.Tooltip('year:T'),
        alt.Tooltip('wind_electricity:Q').format(".1f")
    ]
).properties(width = 500, height = 300, title = "Wind Electricity Growth: Technology Adoption Leaders")
wind_graph

<div class="alert alert-info" style="color:black; padding: 15px; border-radius: 8px; background-color:#eaf4ff;">
  <h2>Data Task: Low-Carbon Electricity Composition</h2>

  <p><strong> TASK:</strong> Prepare the global low-carbon dataset for each technology's contribution over time.</p>

  <p><strong>Step-by-step instructions (follow in order):</strong></p>
  <ul>
    <li><strong>STEP 0: Select the low-carbon sources you will plot.</strong>
      <br>Create a list with the four columns you’ll examine: <code>['hydro_electricity', 'nuclear_electricity', 'solar_electricity', 'wind_electricity']</code>.
    </li>
    <li><strong>STEP 1: Filter the dataset to global totals.</strong>
      <br>From the full OWID table, keep only rows where <code>country == "World"</code>. Use <code>.copy()</code> to avoid chained-assignment warnings.
    </li>
    <li><strong>STEP 2: Select the columns of interest. </strong></li>
    <li><strong>STEP 3: Reshape from wide → long.</strong>
      <br>Use <code>.melt()</code> so each row represents one (year, technology, generation) triple. This long format is required for stacked area encoding.
    </li>
    <li><strong>STEP 4: Clean the technology labels for presentation.</strong>
      <br>Map raw column names to nicer labels (e.g. <code>hydro_electricity → Hydropower</code>, <code>solar_electricity → Solar PV</code>, etc.) so the legend reads professionally.
      <br><em>Hint:</em> use a dictionary and <code>.map()</code> to replace names.
    </li>
  </ul>

In [172]:
columns_to_examine = ['hydro_electricity', 'nuclear_electricity', 'solar_electricity', 'wind_electricity']

data_copy = owid_data[owid_data["country"] == "World"].copy()

selected_columns = data_copy[['year'] + columns_to_examine] #include year + other columns 

melted_df = selected_columns.melt('year') #exclude year from the melt

# STEP 4: CLEAN and REPLACE technology names
tech_names = {
    'hydro_electricity': 'Hydropower',
    'nuclear_electricity': 'Nuclear',
    'solar_electricity': 'Solar PV',
    'wind_electricity': 'Wind',
}
melted_df['technology'] = melted_df['variable'].map(tech_names)
melted_df

Unnamed: 0,year,variable,value,technology
0,1964-01-01,hydro_electricity,,Hydropower
1,1965-01-01,hydro_electricity,923.198,Hydropower
2,1966-01-01,hydro_electricity,983.817,Hydropower
3,1967-01-01,hydro_electricity,1005.742,Hydropower
4,1968-01-01,hydro_electricity,1059.289,Hydropower
...,...,...,...,...
239,2020-01-01,wind_electricity,1591.370,Wind
240,2021-01-01,wind_electricity,1856.650,Wind
241,2022-01-01,wind_electricity,2106.930,Wind
242,2023-01-01,wind_electricity,2312.130,Wind


<div class="alert alert-info" style="color:black; padding: 15px; border-radius: 8px; background-color:#eaf4ff;">
  <h2>VIZ TASK: Low-Carbon Electricity Composition</h2>

  <p>Create a stacked area chart showing the contribution of each low-carbon technology to global electricity generation.</p>

  <p>Using the <code>melted_df</code> dataset, create a visualization with:</p>
  <ul>
    <li>Use <code>mark_area</code> with stacking set to <code>'zero'</code></li>
    <li>Encode <code>year</code> on the <strong>x channel</strong></li>
    <li>Encode <code>generation</code> on the <strong>y channel</strong></li>
    <li>Encode <code>technology</code> on the <strong>color channel</strong></li>
    <li>Include a tooltip that shows <code>Year</code>, <code>Technology</code>, and <code>Generation (TWh)</code></li>
  </ul>

  <p><strong>Styling Specifications:</strong></p>
  <ul>
    <li><strong>Chart Properties:</strong> Width = 500px, Height = 300px, Title = "Global Low-Carbon Electricity Sources: Technology Composition"</li>
    <li><strong>Mark Styling:</strong> Default area mark with opacity set by stacking</li>
    <li><strong>X Channel:</strong> Title = "Year"</li>
    <li><strong>Y Channel:</strong> Title = "Low-Carbon Electricity Generation (TWh)", Format = ".0f"</li>
    <li><strong>Legend:</strong> Title = "Technology"</li>
    <li><strong>Color Scheme:</strong> Custom palette = <code>#1f77b4</code> (blue), <code>#ff7f0e</code> (orange), <code>#2ca02c</code> (green), <code>#d62728</code> (red)</li>
  </ul>

**Color Strategy for Low-Carbon Technologies:**
- **Hydropower**: Blue (#1f77b4) - water association
- **Nuclear**: Orange (#ff7f0e) - energy/power association
- **Solar**: Green (#2ca02c) - natural/renewable association
- **Wind**: Red (#d62728) - dynamic/movement association
</div>

In [183]:
stacked_area_chart = alt.Chart(melted_df).mark_area().encode(
    x = alt.X('year:T').title("Year"),
    y = alt.Y('value:Q').title("Low-Carbon Electricity Generation (TWh)").axis(format=".0f").stack('zero'),
    color = alt.Color('technology:N').title('Technology').scale(range=['#1f77b4','#ff7f0e','#2ca02c','#d62728']),
    tooltip = [
        alt.Tooltip('year:T'),
        alt.Tooltip('technology:N'),
        alt.Tooltip('value:Q')
    ]
).properties(width = 500, height = 300, title = "Global Low-Carbon Electricity Sources: Technology Composition")
stacked_area_chart

# Quiz 4: Wrap of everything so far.. + additional info

**Position: Mapping data values to spatial location along the x or y axis.**
- Design impact:
    - Reserve for most important variables = position is too valuable to waste
    - Always your first choice for quantitative comparisons
    - Combine with other channels for additional dimensions
- Problems with using it:
    - Limited real estate: Only 2 primary position channels available 
    - Scale sensitivity: requires careful attention to axis ranges & zero baselines
    - Overplotting: points can overlap and obscure patterns
- Bar Chart (length)
    - If you are calculating the values by looking at the size differences
- Bar Chart (position)
    - If you are looking at it relative to the y axis values, then its position
- Encoding: "where is this mark along the scale?"
 
**Length: Mapping data values to the linear extent of visual marks**
- Design impact:
    - Excellent for bar charts and quantitative comparisons
    - Use when position channels are occupied by other variables
- Problems with using it:
    - Bars must start from - or comparisons difficult to perceive
    - Limited space: can’t show too many bars without overcrowding
    - Accuracy is influenced by distance, alignment, distractors etc.
- Stacked bar (first group, positioned at 0 so you can use position)
    - Other groups, not starting at 0 so you use length.
- Encoding: "how long is this bar?"


**Area: Mapping data values to the 2D extent of visual marks (circle sizes, bubble areas)**
- Design impact:
    - Use sparingly and only when position/length unavailable 
    - Avoid for precise comparisons - reserve for general magnitude sense
    - Consider square root scaling to compensate for perceptual bias
- Problems with using it:
    - Systematic underestimation: humans consistently underperceive area ratios
    - Size limitations: very large differences create unusable visualizations
    - Overlapping issues: large areas obscure smaller ones

**Color hue: Mapping data values to different colors (red, blue, green, etc..**
- Design impact:
    - Perfect for nominal/categorical data - creates instant groups
    - Never use for quantitative data - no natural ordering
    - Always provide alternative encodings for accessibility
- Problems with using it
    - No natural ordering: red isn’t inherently “more” than blue
    - Accessibility issues: 8% of men have red-green color blindness
    - Cultural meanings: Colors carry unintended associations 
- Layered density chart with colors (blue, orange, red for nominal data)

**Color Lightness: Mapping data values to show how light or dark a color appears (dark blue -> light blue)**
- Design impact:
    - Good for ordinal data with few categories
    - Use high contrast for better discrimination
    - Test across devices and printing conditions
- Problems with using it:
    - Limited discriminability: only ~4-6 distinguishable values
    - Same color looks different with different backgrounds
    - Colors display differently across media
 
**Shape: Mapping data values to different geometric forms**
- Design impact:
    - Excellent for small numbers of categories (2-6 groups)
    - Combine with color for redundant encoding
    - Use familiar, simple shapes
- Problems with using it:
    - Limited discriminability: only ~6-10 distinguishable shapes
    - Size interactions: shape perception changes with size
    - Cultural interpretation: shapes have unintended meanings 


ignore whatever this is (idk how to remove this text, its really weird wtf lol):nings 
 media
l data)
r ones
e length.


## Extra Pandas

### Data Selection & Filtering
- `df['column']` - single column (Series)
- `df[['col1', 'col2']]` - multiple columns (DataFrame)
- `df[df['col'] > value]` - boolean filtering
  
- `df[(condition1) & (condition2)]` - multiple conditions
- `df['col'].isnull() / .notna()` - missing value checks
-  `df = df_subset[['col1'] + ['col2'] + list_of_cols]` - selects the columns of interest to use for df

### Data Manipulation
- `df.copy()` - Creates an independent copy to avoid warnings
- `df['new_column'] = calculation` - Creates new column using existing columns (vectorized, happens to all rows at once)
- `np.where(condition, value_if_true, value_if_false)` - creates new column based on conditions such as if col < 60 then set 'x' else set 'y'
  
- `df['col'].value_counts()` - shows how many rows are in the col
- `df.sort_values(['col1', 'col2'], ascending = [True, False])` - sorts data first by col1, then by col2 in ascending then desending order.
- `df_long = df.melt(id_vars = ['col1', 'col2'], var_name = 'energy_type', value_name = 'consumption')` - reshapes data into long format
    - id_vars = identifier variables
    - var_name = variable column
    - value_name = value column


### Grouping & Aggregation
- `df['year'].dt.year` - extracts year number from datetime column

- `df[df['col'].isin(['x','y','z'])]` - filters col1 for values == the list

- `df['col'].max()` - finds max value of column

- `df.nlargest(2, 'col')` - returns df of the top 2 largest values of col
- `df.nsmallest(1, 'col')` - returns df of the lowest value of col
- `df.groupby('col')['col2'].mean()` - divides data into groups based on unique values of col and selects col2 for each group. Then calculates the mean for each group.
- `df.groupby('col')['col2'].agg(['count', 'mean'])` - divides data into groups based on values of col then finds the count of 'col1' and mean of 'col2'
- `df.groupby('col').agg({'col1': 'mean', 'col2': 'sum'}))` - applies different functions to different columns

- `df[['col1', 'col2']].corr()` - finds correlation between 2 columns
- `df['col1'].value_counts()` - unique value counts for col1
- `df['col1'].rolling(int).rank(method = "average")` - calculates rolling averages for column
- `df1.merge(df2, on='col', suffixes=('_d1', '_d2')` - merges 2 datasets on col and includes suffixes for each column such as "age_d1", "age_d2"
- `df['new_col'] = df['old_col'].map(dict_of_names)` - rename columns based on dict of names

In [186]:
import pandas as pd
import altair as alt
filepath = 'https://raw.githubusercontent.com/kemiolamudzengi/dsci-320-datasets/main/owid_dataset.csv'

# Load the OWID energy dataset and only keep from 1974 onwards 
owid_data = (pd.read_csv(filepath, parse_dates=['year'])
             .query('year.dt.year >= 1974'))
print(owid_data.shape)

(1659, 132)


In [187]:
countries = ['Germany', 'Canada', 'Indonesia', 'Brazil', 'China']
energies = [ 'gas_consumption','oil_consumption','hydro_consumption','coal_consumption','nuclear_consumption']

# STEP 1: SELECT THE COUNTRIES OF INTEREST (select the rows of interest)
country_subset = owid_data[owid_data["country"].isin(countries)].copy()

# STEP 2: SELECT the Energy Consumptions of interest (select the columns of interest)
selected = country_subset[["year"] + ["country"] + energies]

# STEP 3: RESHAPE the data into long format
energy_consumed = selected.melt(id_vars = ["country", 'year'], #columns you DON'T want to melt
                                var_name = "energy_type", #rename variable column
                                value_name = "consumption") #rename value column

# STEP4: CLEAN energy type names
energy_type_names = {
    'gas_consumption': 'Gas',
    'oil_consumption': 'Oil',
    'hydro_consumption': 'Hydro',
    'coal_consumption': 'Coal',
    'nuclear_consumption': 'Nuclear',
}
energy_consumed['energy_type'] = energy_consumed['energy_type'].map(energy_type_names)

energy_consumed.sample(10)

Unnamed: 0,country,year,energy_type,consumption
630,China,1992-01-01,Hydro,363.017
730,Indonesia,1990-01-01,Hydro,18.035
804,Brazil,2013-01-01,Coal,191.647
846,Canada,2004-01-01,Coal,349.854
541,Brazil,2005-01-01,Hydro,907.142
663,Germany,1974-01-01,Hydro,48.995
691,Germany,2002-01-01,Hydro,64.932
1271,Indonesia,2021-01-01,Nuclear,0.0
210,Indonesia,1980-01-01,Gas,71.354
9,Brazil,1983-01-01,Gas,17.629


## Extra Altair

### Essential Encoding Channels
- `x`, `y` - Position (most important)
- `color` - Categories or intensity
- `size` - Magnitude (quantitative only)  
- `shape` - Categories only
- `opacity` - Transparency (can be put within mark_circle(__))
- `column` - Separate panels for specific variable (i.e. different plots for region)
- `alt.Tooltip('variable').title().format('.1f')` - Additional information (hover mouse to see more info)
- `alt.X/Y('variable').sort('x,y,-x,-y).axis(format='0.1f').title()` - Additional x,y edit tools
- `alt.X/Y('variable').stack('normalize')` - Stacked Bar chart for percentages
- `alt.Color('variable').scale(scheme/range='category10').legend(None).title()` - For using a specific color scheme
- `.encode().properties(width = ..., height = ...)` - Change width / height of graph
- `tooltip = [alt.Tooltip(), alt.Tooltip()]` - good for when you want to edit multiple tooltips separately
- `stroke = alt.StrokeWidth('col1').scale(range=[0.5,5])` - adds stroke based on col1
- `alt.Size('col1').scale(range=[100,1000]).legend(format='.1s')` - format numbers in legend for each size

### New Marks
- `mark_line(point = alt.OverlayMarkDef(size = 10, filled = True, strokeWidth = 1)` - Creates filled dots for each point, of size 10
- `mark_line().encode().facet(column=alt.Column('variable', title=None)).resolve_scale(y = 'independent')`. - supress auto facet header title and makes each plot have their own axis range
- `mark_text(align='left', dx=8, fontSize=11, fontWeight='bold').encode(x, y, text = alt.condition(alt.datum.period_label == "panel", 'col1', alt.value('')))` - idrk what this does, but I think it labels end points for identification..
    - `mark_text` → creates text marks instead of lines or points.
    - `align='left', dx=8` → pushes labels slightly to the right of the data point, so they don’t overlap the line.
    - `fontSize` and `fontWeight` → make the labels readable and bold.
    - `x='period_label:O'` → plots text at the categorical x-axis positions (`2005–2009` vs `2015–2019`).
    - `y='renewables_electricity:Q'` → places the text at the same height as the country’s renewable electricity value.
    - `text=alt.condition(...)` → **only shows labels for the “Post-Paris” period**, leaving the left-hand side blank. This avoids clutter.
    - `color='country:N'` → makes label colors match the country line colors.
- `alt.Chart(data)` - creates 'empty' base chart
- `alt.layer(line_layer, points_layer, labels_layer))` - combines multiple plots together
- `mark.rect()` - for building a heatmap
    - `alt.X/Y('month(date):O)`
    - `alt.X/Y('hoursminutes(date):O)`
    - `alt.X/Y('day(date):O)`
    - `alt.X/Y('monthdate(date):O)`
- `mark_area().encode()` - creates area chart (can create a stacked area chart, add color)
    - `interpolate = 'basis-open'`,
    - `line = {'color': 'darkred'}`

<div style="border-left: 5px solid #007BFF; padding: 1em; background-color: #F0F8FF;">

<h3><b>Viz Task: Faceted Energy Transition Area Chart</b></h3>

<ul>
<li>Use the <code>area</code> mark to show the share of different energy types over time.</li>
<li>Encode:
<ul>
<li><code>year</code> on the <b>x channel</b> as temporal.</li>
<li><code>percentage</code> on the <b>y channel</b> as quantitative. </li>
<li><code>energy_type</code> on the <b>color channel</b> </li>
</ul>
</li>
<li>Facet the chart by <code>country</code> into multiple small multiples, arranged in 3 columns.</li>
<li>Set the size to <b>200 × 200</b> for each chart.</li>
<li>Add a descriptive title: <i>“Primary energy consumption by source (Faceted by Country)”</i>.</li>
</ul>
</div>



In [190]:
energy_small_multiples = alt.Chart(energy_consumed).mark_line().encode(
    x=alt.X('year:T', title='Year'),
    y=alt.Y('consumption:Q', title='Consumption (TWh)'),
    color=alt.Color('energy_type:N', title='Energy Type')
).properties(
    width=200, height=200
).facet(
    column=alt.Column('country:N', title=None)  # suppress auto facet header title
).properties(
    title="Primary energy consumption by source (Faceted by Country)"
).resolve_scale(y='independent') #lets each country have their own y-axis range!
energy_small_multiples

<div style="border-left: 5px solid #007BFF; padding: 15px; background-color: #F0F8FF; border-radius: 8px;">

<h2>Renewable Electricity Slope Graph</h2>

<p><strong>Exploratory Question:</strong>
<em>How has renewable electricity generation changed across countries before and after the Paris Agreement?</em></p>

<p><strong>VIZ TASK:</strong> Create a slope graph comparing renewable electricity growth for multiple countries between two periods.</p>

<h3>Chart Specification:</h3>
<ul>
<li>Create a base chart using the <code>slope_data</code> dataset.</li>
<li>Use <code>mark_line</code> to show changes for each country over time.</li>
<li>Encode <code>period_label</code> on the <strong>x channel</strong> as ordinal.</li>
<li>Encode <code>renewables_electricity</code> on the <strong>y channel</strong> as quantitative.</li>
<li>Encode <code>country</code> on the <strong>color channel</strong>.</li>
<li>Include tooltips for <code>country</code>, <code>period_label</code>, and <code>renewables_electricity</code>.</li>
</ul>

<h3>Styling Specifications:</h3>
<ul>
<li>Line width: <code>strokeWidth=3</code>.</li>
<li>Point size: 100 pixels.</li>
<li>Font size for labels: 11px, bold, aligned left with dx=8 offset.</li>
<li>Color scheme: <code>category10</code>.</li>
</ul>

</div>


#### just data creation stuff for this question

In [193]:
def calculate_period_averages(data, attribute='renewables_electricity'):
    """Calculate average values for two comparison periods for all individual countries
       (excludes world/continent aggregates)."""

    results = []

    # Exclude non-country regions often found in OWID datasets
    exclude_list = [
        "World", "Africa", "Asia", "Europe", "European Union",
        "North America", "South America", "Oceania"
    ]
    
    period_1_years = list(range(2005, 2010))  # 2005-2009: Pre-Paris Agreement
    period_2_years = list(range(2015, 2020))  # 2015-2019: Post-Paris Agreement


    # Get only actual countries
    all_countries = data.loc[~data['country'].isin(exclude_list), 'country'].unique()

    for country in all_countries:
        country_data = data.query('country == @country')

        # Filter data for the two periods
        # Selects only the rows where the year is in the chosen period.
        # .dt.year extracts just the year part from a datetime column.
        
        period_1_data = country_data[country_data['year'].dt.year.isin(period_1_years)]
        period_2_data = country_data[country_data['year'].dt.year.isin(period_2_years)]

        if len(period_1_data) > 0 and len(period_2_data) > 0:
            
            # Calculate average for each time period
            period_1_avg = period_1_data[attribute].mean()
            period_2_avg = period_2_data[attribute].mean()

            # Calculate changes
            absolute_change = period_2_avg - period_1_avg
            percent_change = (absolute_change / period_1_avg * 100) if period_1_avg > 0 else 0

            # Store results
            results.append({
                'country': country,
                'period_1_avg': period_1_avg,
                'period_2_avg': period_2_avg,
                'absolute_change': absolute_change,
                'percent_change': percent_change
            })

    return pd.DataFrame(results)
    
renewable_change = calculate_period_averages(owid_data)

top_changers = renewable_change.nlargest(5, 'absolute_change')


# Reshape data for slope graph
slope_data = top_changers[['country', 'period_1_avg', 'period_2_avg']].melt(
    id_vars= 'country',   # FILL IN THE BLANKS
    var_name='period',
    value_name='renewables_electricity',
)

# Create meaningful period labels
period_labels = {
    'period_1_avg': '2005-2009\n(Pre-Paris)',
    'period_2_avg': '2015-2019\n(Post-Paris)'
}
slope_data['period_label'] = slope_data['period'].map(period_labels)
print(slope_data.head())

         country        period  renewables_electricity            period_label
0          China  period_1_avg                 536.254  2005-2009\n(Pre-Paris)
1  United States  period_1_avg                 374.956  2005-2009\n(Pre-Paris)
2        Germany  period_1_avg                  83.176  2005-2009\n(Pre-Paris)
3          India  period_1_avg                 130.004  2005-2009\n(Pre-Paris)
4         Brazil  period_1_avg                 382.692  2005-2009\n(Pre-Paris)


In [199]:
base = alt.Chart(slope_data)

# Lines showing change (core of slope graph)
lines = base.mark_line(strokeWidth=3).encode(
    x=alt.X('period_label:O', title='Time Period',
            axis=alt.Axis(labelAngle=0, labelFontSize=12)),
    y=alt.Y('renewables_electricity:Q', title='Renewable Electricity (TWh)',
            axis=alt.Axis(format='.0f')),
    color=alt.Color('country:N', title='Country', scale=alt.Scale(scheme='category10'), legend = None),
    tooltip=[
        alt.Tooltip('country:N', title='Country'),
        alt.Tooltip('period_label:O', title='Period'),
        alt.Tooltip('renewables_electricity:Q', title='Renewable Electricity (TWh)', format='.1f')
    ]
)

# Points at each period for precision
points = base.mark_circle(size=100, opacity = 1).encode(
    x='period_label:O',
    y='renewables_electricity:Q',
    color=alt.Color('country:N', scale=alt.Scale(scheme='category10'), legend= None),
    tooltip=[
        alt.Tooltip('country:N', title='Country'),
        alt.Tooltip('period_label:O', title='Period'),
        alt.Tooltip('renewables_electricity:Q', title='Renewable Electricity (TWh)', format='.1f')
    ]
)

# Country labels at end points for identification
labels = base.mark_text(
    align='left', dx=8, fontSize=11, fontWeight='bold'
).encode(
    x='period_label:O',
    y='renewables_electricity:Q',
    text=alt.condition(
        alt.datum.period_label == '2015-2019\\n(Post-Paris)',  # compare to the Post-Paris panel
        'country:N',
        alt.value('')
    ),
    color=alt.Color('country:N', scale=alt.Scale(scheme='category10')),
)


# Combine all layers and attach the param ONCE
slope_graph = alt.layer(lines, points, labels).properties(
    width=400,
    height=500,
    title=["Renewable Electricity Growth:", "Pre-Paris vs Post-Paris Agreement"]
).resolve_scale(
    color='independent'
)
slope_graph

In [200]:
path = 'https://raw.githubusercontent.com/kemiolamudzengi/dsci-320-datasets/main/energy_usage.csv'

data = pd.read_csv(path)

alt.Chart(data).mark_rect().encode(
    x = alt.X('monthdate(date):O',title='Day'),
    y = alt.Y('hoursminutes(date):O',title='Time of Day'),
    color = alt.Color('usage:Q', title='Usage (kW)'),
    tooltip =[
        alt.Tooltip('monthdate(date):O', title='Day'),
        alt.Tooltip('hoursminutes(date):O', title='Time'),
        alt.Tooltip('usage:Q', title='Usage (kW)')
    ]
).properties(width=700, height = 150, title = "Mike Bostocks' Household Energy Usage 2019")

# Quiz 5: EDA Charts

### Definitions:
- EDA is a process that includes:
    - detection of mistakes
    - checking of assumptions
    - preliminary selection of appropriate models
    - determining relationships among the explanatory variables
    - assessing the direction and rough size of relationships between explanatory and outcome variables.

----------------------------------------------------------------
- EDA Cycle Involves:
    - Using what you learn to refine the questions and possibly generate new questions
    - Generate summary views (both visual and numerical)
    - Generate questions about your data
    - Search for answers by transforming, modeling and then visualizing the data


----------------------------------------------------------------
- EDA Categories:
    - Medium
        - Numerical Statistical Summaries
        - Visual Data Analysis
     - Attribute
         - Unvariate - one column at a time
         - Multivariate - two or more variables at a time, looking for relationships
         - Attribute Type - categorical or quantitative.
    - Role
        - Outcome
        - Explanatory

### Code Walkthrough:

In [2]:
import pandas as pd
import altair as alt
import numpy as np

alt.renderers.enable("default")
# If running locally on your machine use this one
filepath = 'https://raw.githubusercontent.com/kemiolamudzengi/dsci-320-datasets/main/hawks.csv'

# Load the hawks dataset
hawks = pd.read_csv(filepath)

In [6]:
print(hawks.shape)
hawks.info()

(908, 20)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 908 entries, 0 to 907
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    908 non-null    int64  
 1   Month         908 non-null    int64  
 2   Day           908 non-null    int64  
 3   Year          908 non-null    int64  
 4   CaptureTime   908 non-null    object 
 5   ReleaseTime   907 non-null    object 
 6   BandNumber    908 non-null    object 
 7   Species       908 non-null    object 
 8   Age           908 non-null    object 
 9   Sex           332 non-null    object 
 10  Wing          907 non-null    float64
 11  Weight        898 non-null    float64
 12  Culmen        901 non-null    float64
 13  Hallux        902 non-null    float64
 14  Tail          908 non-null    int64  
 15  StandardTail  571 non-null    float64
 16  Tarsus        75 non-null     float64
 17  WingPitFat    77 non-null     float64
 18  KeelFat       567 no

We can see a lot of information here, how many rows / columns is in the data, what the data types are, how many non-null values, etc..

In [7]:
# sample 10 random rows to understand the data structure
hawks.sample(10)

Unnamed: 0.1,Unnamed: 0,Month,Day,Year,CaptureTime,ReleaseTime,BandNumber,Species,Age,Sex,Wing,Weight,Culmen,Hallux,Tail,StandardTail,Tarsus,WingPitFat,KeelFat,Crop
557,558,10,9,2000,14:00,,1807-82702,RT,I,,371.0,905.0,23.6,26.8,197,204.0,,,3.0,0.5
532,533,9,27,2000,12:20,,1204-45870,CH,I,M,225.0,325.0,15.8,18.7,179,187.0,,,2.0,0.5
381,382,10,1,1998,10:42,,1207-72606,RT,I,,400.0,1254.0,25.2,30.0,237,,,,,
82,83,9,8,1994,12:10,,1423-16201,SS,I,F,205.0,194.0,12.6,14.4,155,,,,,
555,556,10,8,2000,13:29,,1177-04646,RT,I,,381.0,920.0,27.0,29.6,223,226.0,,,1.0,0.25
144,145,10,13,1994,13:05,,1207-64614,RT,I,,422.0,1205.0,28.8,31.3,238,,,,,
795,796,11,8,2002,14:29,,1177-0473,RT,I,,366.0,1115.0,27.1,21.0,215,219.0,,,2.0,0.75
682,683,10,28,2001,11:37,,1207-72662,RT,A,,404.0,1300.0,29.4,32.8,224,229.0,,,4.0,0.0
24,25,10,23,1992,11:45,,877-76334,RT,I,,380.0,990.0,26.0,30.0,211,,,,,
659,660,10,17,2001,14:03,,2003-99331,SS,A,F,199.0,175.0,12.8,14.5,159,160.0,,,3.0,0.0


In [8]:
# Missing data landscape

#creates a dataframe of booleans (True where value is mising), sums up all the values which are true
missing_summary = hawks.isnull().sum()
print("Missing values per column:")

#prints the number of missing values, but ONLY for columns that actually have missing values (i.e. > 0 missing values)
print(missing_summary[missing_summary > 0])

#adds up ALL the missing counts across the columns (total # of missing entries in the whole dataset)
print(f"\nTotal missing values: {missing_summary.sum()}")

#dropna removes any row that contains at least 1 missing value... .shape[0] gives the number of remaining rows
#divided by the total dataset row count
print(f"Percentage of complete rows: {(hawks.dropna().shape[0] / hawks.shape[0] * 100):.1f}%")

Missing values per column:
ReleaseTime       1
Sex             576
Wing              1
Weight           10
Culmen            7
Hallux            6
StandardTail    337
Tarsus          833
WingPitFat      831
KeelFat         341
Crop            343
dtype: int64

Total missing values: 3286
Percentage of complete rows: 3.9%


In [9]:
#Temporary fix, remove the columns with the issues..

cols_to_drop = [
    'Unnamed: 0', 'ReleaseTime', 'StandardTail', 'Tarsus', 'Sex',
    'KeelFat', 'Crop', 'BandNumber', 'CaptureTime', 'WingPitFat', 'Day'
]

hawks_clean = (
    hawks
    .drop(columns=cols_to_drop, errors='ignore')
    .rename(columns=lambda x: x.strip().lower()) # we are used to lower-case attribute names, so drop the title case
).copy()

print(f"Cleaned dataset shape: {hawks_clean.shape}")

Cleaned dataset shape: (908, 9)


### Setting Up the Core EDA Question

> **In typical EDA workflows, we ask: "How does a quantitative variable distribute across our observed data?"**

This is one of the most fundamental questions in exploratory data analysis. Let's investigate the distribution of hawk weights.

<div style="border-left: 5px solid #007BFF; padding: 15px; background-color: #F0F8FF; border-radius: 8px;">

<h2>Hawk Weight Histogram</h2>

<p><strong>Exploratory Question:</strong>  
<em>What is the distribution of hawk weights in the dataset?</em></p>

<p><strong>VIZ TASK:</strong> Create a histogram showing the distribution of hawk weights.</p>

<h3>Chart Specification:</h3>
<ul>
<li>Use <code>mark_bar</code> with <code>binSpacing=0</code> to ensure no gaps between bins.</li>
<li>Encode <code>weight</code> on the <strong>x channel</strong> as <code>quantitative</code> with a binning specification (<code>maxbins=25</code>).</li>
<li>Encode <code>count()</code> on the <strong>y channel</strong> as <code>quantitative</code>, titled <em>Number of Hawks</em>.</li>
<li>Set the chart <strong>width</strong> to 400 pixels and <strong>height</strong> to 300 pixels.</li>
<li>Give the chart the title <em>Hawk Weight Distribution</em>.</li>
</ul>

</div>

In [10]:
alt.Chart(hawks_clean).mark_bar(binSpacing=0).encode(
    x = alt.X('weight:Q').bin(maxbins=25),
    y = alt.Y('count():Q').title("Number of Hawks"),
).properties(width = 400, height = 300, title = "Hawk Weight Distribution")

Now a question is raised, should we answer this question statistically or visually?

## The Statistical Approach 

### Fundamental Pandas EDA Techniques

Let's systematically investigate using **computational summaries**:

In [11]:
# 1. Overall distribution characteristics
print("Overall weight distribution:")
print(hawks_clean['weight'].describe())

Overall weight distribution:
count     898.000000
mean      772.080178
std       462.311760
min        56.000000
25%       185.000000
50%       970.000000
75%      1120.000000
max      2030.000000
Name: weight, dtype: float64


In [13]:
# 2. Investigate potential grouping variables
print("Available categorical variables:")

#counts how many unique categories it has
categorical_cols = hawks_clean.select_dtypes(include=['object']).columns

for col in categorical_cols:
    print(f"\n{col}: {hawks_clean[col].nunique()} unique values")
    print(hawks_clean[col].value_counts())

Available categorical variables:

species: 3 unique values
species
RT    577
SS    261
CH     70
Name: count, dtype: int64

age: 2 unique values
age
I    684
A    224
Name: count, dtype: int64


In [14]:
# 3. Group-wise statistical analysis
# Hypothesis 1: Does species explain the distribution?
species_stats = hawks_clean.groupby('species')['weight'].describe()
print("Weight statistics by species:")
print(species_stats)

Weight statistics by species:
         count         mean         std    min    25%     50%      75%     max
species                                                                       
CH        70.0   420.485714  162.031643   56.0  335.0   377.5   505.00  1119.0
RT       572.0  1094.430070  189.210250  101.0  980.0  1070.0  1210.00  2030.0
SS       256.0   147.968750   80.652675   85.0  100.0   155.0   177.75  1094.0


In [15]:
# 3. Group-wise statistical analysis
# Hypothesis 2: Does age explain the distribution?
age_stats = hawks_clean.groupby('age')['weight'].describe()
print("Weight statistics by age:")
print(age_stats)

Weight statistics by age:
     count        mean         std   min    25%    50%     75%     max
age                                                                   
A    221.0  747.366516  493.883991  56.0  185.0  960.0  1140.0  1670.0
I    677.0  780.147710  451.617738  85.0  188.0  971.0  1120.0  2030.0


### 🔍 Observation: Statistical Detective Findings

**From the statistical analysis, what do you notice?**

1. there is a high distribution of weight across all species (except for SS), (max 1119.0 min 56.0)?
2. The mean for weight stats by age are the same..
3. the mean of weights are vastly different for each species
4. std for age has a much higher value (more spread out, higher variation)


## Part 4: The Visual  Approach 

### Visual Investigation Using Faceting

Now let's investigate the **same question** using visual exploration

<br>

<div style="border-left: 5px solid #007BFF; padding: 15px; background-color: #F0F8FF; border-radius: 8px;">

<h2>Density of Hawk Weights by Species</h2>

<p><strong>Exploratory Question:</strong>  
<em>Does species explain the multimodal distribution of hawk weights?</em></p>

<p><strong>VIZ TASK:</strong> Create a density plot of hawk weights, grouped by species, to compare distributions.</p>

<h3>Chart Specification:</h3>
<ul>
<li>Transform the data using <code>transform_density</code> on <code>weight</code>, grouping by <code>species</code>.</li>
<li>Use <code>mark_area</code> for smooth density curves.</li>
<li>Encode <code>weight</code> on the <strong>x channel</strong> as <code>quantitative</code>, with the title <em>Weight (g)</em>.</li>
<li>Encode <code>density</code> on the <strong>y channel</strong> as <code>quantitative</code>, with the title <em>Density</em>.</li>
<li>Encode <code>species</code> on the <strong>color channel</strong> as <code>nominal</code>, with the legend title <em>Species</em>.</li>
<li>Set the chart <strong>width</strong> to 400 pixels and <strong>height</strong> to 200 pixels.</li>
<li>Give the chart the title <em>Density of Hawk Weights by Species</em>.</li>
</ul>

<h3>Styling Specifications:</h3>
<ul>
<li>Set <code>opacity</code> of the density areas to <code>0.7</code> for overlap visibility.</li>
</ul>

</div>


In [24]:
alt.Chart(hawks_clean).mark_area(opacity = 0.7).transform_density(
    'weight',
    as_=['weight', 'density'],
    groupby = ['species']
).encode(
    x = alt.X('weight:Q').title("Weight (g)"),
    y = alt.Y('density:Q').title("Density"),
    color = alt.Color("species:N").title("Species")
).properties(width = 400, height = 200, title = "Density of Hawk Weights by Species")

<div style="border-left: 5px solid #007BFF; padding: 15px; background-color: #F0F8FF; border-radius: 8px;">

<h2>Density of Hawk Weights by Age</h2>

<p><strong>Exploratory Question:</strong>  
<em>Does age explain the multimodal distribution of hawk weights?</em></p>

<p><strong>VIZ TASK:</strong> Create a density plot of hawk weights, grouped by age, to compare distributions across age categories.</p>

<h3>Chart Specification:</h3>
<ul>
<li>Transform the data using <code>transform_density</code> on <code>weight</code>, grouping by <code>age</code>.</li>
<li>Use <code>mark_area</code> to display smooth density curves.</li>
<li>Encode <code>weight</code> on the <strong>x channel</strong> as <code>quantitative</code>, with the axis title <em>Weight (g)</em>.</li>
<li>Encode <code>density</code> on the <strong>y channel</strong> as <code>quantitative</code>, with the axis title <em>Density</em>.</li>
<li>Encode <code>age</code> on the <strong>color channel</strong> as <code>ordinal</code>, with the legend title <em>Age</em>.</li>
<li>Set the chart <strong>width</strong> to 400 pixels and <strong>height</strong> to 200 pixels.</li>
<li>Give the chart the title <em>Density of Hawk Weights by Age</em>.</li>
</ul>

<h3>Styling Specifications:</h3>
<ul>
<li>Set <code>opacity</code> of the density areas to <code>0.7</code> for overlap visibility.</li>
</ul>

</div>


In [26]:
alt.Chart(hawks_clean).mark_area(opacity = 0.7).transform_density(
    'weight',
    as_=['weight', 'density'],
    groupby = ['age']
).encode(
    x = alt.X('weight:Q').title("Weight (g)"),
    y = alt.Y('density:Q').title("Density"),
    color = alt.Color("age:O").title("Age")
).properties(width = 400, height = 200, title = "Density of Hawk Weights by Age")

#### Note!
There are 6 common distribution shapes that exist.

<img title="Common Distribution Types" src="https://www.data-to-viz.com/graph/density_files/figure-html/unnamed-chunk-2-1.png" style="max-width: 400px;"><br/>

## Part 5: Integration - The Best of Both Worlds 

### Combining Statistical Precision with Visual Insight

In [27]:
# Start with visual exploration → Species clearly explains multimodal distribution
# Now use statistics to quantify the visual findings

species_summary = hawks_clean.groupby('species')['weight'].agg([
    'count', 'mean', 'std', 'min', 'max'
]).round(1)

print("Quantified species differences:")
print(species_summary)

Quantified species differences:
         count    mean    std    min     max
species                                     
CH          70   420.5  162.0   56.0  1119.0
RT         572  1094.4  189.2  101.0  2030.0
SS         256   148.0   80.7   85.0  1094.0


In [28]:
# The visual analysis suggested we should investigate age differences too
# Let's get statistical confirmation:

detailed_stats = hawks_clean.groupby(['species', 'age'])['weight'].agg([
    'count', 'mean', 'std'
]).round(1)

print("Detailed breakdown by species AND age:")
print(detailed_stats)

Detailed breakdown by species AND age:
             count    mean    std
species age                      
CH      A       32   450.3  197.6
        I       38   395.3  121.8
RT      A      121  1161.4  194.6
        I      451  1076.5  183.9
SS      A       68   150.4   40.3
        I      188   147.1   91.0


<div style="border-left: 5px solid #007BFF; padding: 15px; background-color: #F0F8FF; border-radius: 8px;">

<h2>Weight Distribution by Species</h2>

<p><strong>Exploratory Question:</strong>  
<em>Do different hawk species require different binning strategies when examining weight distributions?</em></p>

<p><strong>VIZ TASK:</strong> Create small multiple histograms to compare the weight distributions across species.</p>

<h3>Chart Specification:</h3>
<ul>
<li>Use <code>mark_bar</code> to create histograms of hawk weights.</li>
<li>Encode <code>weight</code> on the <strong>x channel</strong> as <code>quantitative</code>, applying binning with <code>maxbins=15</code>. Title the axis <em>Weight (g)</em>.</li>
<li>Encode <code>count()</code> on the <strong>y channel</strong> as <code>quantitative</code>, with the axis title <em>Count</em>.</li>
<li>Encode <code>species</code> on the <strong>color channel</strong> as <code>nominal</code>, with the legend title <em>Species</em>.</li>
<li>Use the <strong>column channel</strong> for <code>species</code> so that each species appears as a separate small multiple histogram, with the title <em>Species</em>.</li>
<li>Set the overall chart title to <em>Weight Distribution by Species</em>.</li>
</ul>

<h3>Styling Specifications:</h3>
<ul>
<li>Set the <strong>width</strong> of each facet to 200 pixels and <strong>height</strong> to 150 pixels.</li>
<li>Apply a distinct color for each species category (Altair’s default categorical palette).</li>
</ul>

</div>

In [35]:
# Do different species need different binning strategies?
species_histograms = alt.Chart(hawks_clean).mark_bar().encode(
    x = alt.X('weight:Q').bin(maxbins=35).title("Weight (g)"),
    y = alt.Y('count():Q').title("Count"),
    color = alt.Color('species:N').title("Species"),
).properties(width = 200, height = 150, title = "Weight Distribution by Species.").facet(column = 'species', title = "Species")

# Show the plot
species_histograms