# Pandas Documentation Guide

In [37]:
import pandas as pd
import numpy as np

## Data Loading & Exploration

- pd.DataFrame(dict) - creates df from dict.
- pd.read_csv(url, parse_dates=[['Year', 'Month', 'Day']]) - read csv and parse datetime columns
- df.info() - data types and missing values

- df.describe() - statistical summary
- df.shape - (rows, columns)
- df.columns.tolist() - column names

In [9]:
dict_data = {
    'name': ['Joe', 'Blake', 'Kyle'],
    'age': [20, 22, 19],
    'major': ['CPSC', 'MATH', 'ECON'],
    'gpa': ['2.8', '0', '4.0']
}

random_dict_df = pd.DataFrame(dict_data)
random_dict_df
print(random_dict_df.info())
print(random_dict_df.describe())
print(random_dict_df.shape)
print(random_dict_df.columns.tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      object
 1   age     3 non-null      int64 
 2   major   3 non-null      object
 3   gpa     3 non-null      object
dtypes: int64(1), object(3)
memory usage: 228.0+ bytes
None
             age
count   3.000000
mean   20.333333
std     1.527525
min    19.000000
25%    19.500000
50%    20.000000
75%    21.000000
max    22.000000
(3, 4)
['name', 'age', 'major', 'gpa']


In [38]:
url = 'https://raw.githubusercontent.com/kemiolamudzengi/dsci-320-datasets/main/world-data-gapminder.csv'
gapminder = pd.read_csv(url, parse_dates=['year'])
print(gapminder.shape)

(38982, 14)


## Data Selection & Filtering

- df['column'] - single column (Series)
- df[['col1', 'col2']] - multiple columns (DataFrame)
- df[df['col'] > value] - boolean filtering
  
- df[(condition1) & (condition2)] - multiple conditions
- df['col'].isnull() / .notna() - missing value checks

In [19]:
countries = gapminder['country']
subset = gapminder[['country', 'year', 'life_expectancy']]

combined_filter = gapminder[
    (gapminder['year'] == '1982') & 
    (gapminder['life_expectancy'] > 75)
]

#filter for new dataset with region == Europe with > 80 life exp.
europe_or_high_life = gapminder[
    (gapminder['region'] == 'Europe') | 
    (gapminder['life_expectancy'] > 80)
]

#check which columns have missing data, true = 1 false = 0
missing_data = gapminder.isnull().sum()
print("Missing values per column:", missing_data)

#filter dataset to include NON N/A rows of co2_per_cap
co2_data = gapminder[gapminder['co2_per_capita'].notna()]

#print all unique years
years = co2_data['year'].unique() 

Missing values per column: country                      0
year                         0
population                   0
region                       0
sub_region                   0
income_group                 0
life_expectancy              0
income                       0
children_per_woman           0
child_mortality              2
pop_density              26700
co2_per_capita           22697
years_in_school_men      30794
years_in_school_women    30794
dtype: int64


## Data Manipulation

- df.copy() - Creates an independent copy to avoid warnings
- df['new_column'] = calculation - Creates new column using existing columns (vectorized, happens to all rows at once)
- np.where(condition, value_if_true, value_if_false) - creates new column based on conditions such as if col < 60 then set 'x' else set 'y'
  
- df['col'].value_counts() - shows how many rows are in the col
- df.sort_values(['col1', 'col2'], ascending = [True, False] - sorts data first by col1, then by col2 in ascending then desending order.

In [23]:
#Creating new column
co2_data = co2_data.copy()
co2_data['total_co2'] = co2_data['population'] * co2_data['co2_per_capita']

#if life exp < 20 -> set category to 'very low' 
#ELSE IF life exp <= 50 -> set catory to 'medium' -> ELSE -> set category to 'very high'
co2_data['life_exp_category'] = np.where(
    co2_data['life_expectancy'] < 20, 'Very Low',
    np.where(co2_data['life_expectancy'] <= 50, 'Medium', 'Very High')
)

print(co2_data['life_exp_category'].value_counts())

#sort by region with lowest life expectancy
multi_sort_df = gapminder.sort_values(['region', 'life_expectancy'], ascending = [True, True])
print(multi_sort_df[['region', 'country', 'life_expectancy']])

life_exp_category
Very High    10070
Medium        6157
Very Low        58
Name: count, dtype: int64
        region    country  life_expectancy
35497   Africa    Tunisia             1.50
22405   Africa  Mauritius             4.00
11478   Africa   Ethiopia             4.01
11477   Africa   Ethiopia             5.02
24194   Africa    Namibia             5.19
...        ...        ...              ...
1749   Oceania  Australia            82.50
1747   Oceania  Australia            82.60
1748   Oceania  Australia            82.60
1750   Oceania  Australia            82.70
1751   Oceania  Australia            82.90

[38982 rows x 3 columns]


## Grouping & Aggregation

- df['year'].dt.year - extracts year number from datetime column

- df[df['col'].isin(['x','y','z'])] - filters col1 for values == the list

- df['col'].max() - finds max value of column

- df.nlargest(2, 'col') - returns df of the top 2 largest values of col
- df.nsmallest(1, 'col') - returns df of the lowest value of col
- df.groupby('col')['col2'].mean() - divides data into groups based on unique values of col and selects col2 for each group. Then calculates the mean for each group.
- df.groupby('col')['col2'].agg(['count', 'mean']) - divides data into groups based on values of col then finds the count of 'col1' and mean of 'col2'
- df.groupby('col').agg({'col1': 'mean', 'col2': 'sum'})) - applies different functions to different columns

- df[['col1', 'col2']].corr() - finds correlation between 2 columns
- df['col1'].value_counts() - unique value counts for col1
- df['col1'].rolling(int).rank(method = "average") - calculates rolling averages for column

- df1.merge(df2, on='col', suffixes=('_d1', '_d2') - merges 2 datasets on col and includes suffixes for each column such as "age_d1", "age_d2"

In [27]:
recent_year = co2_data['year'].dt.year.max()
print(recent_year)

#filter data from specific years
year_list = ['1952', '2004', '1977']
time_series_data = gapminder[gapminder['year'].isin(year_list)]

top2_co2 = co2_data.nlargest(2, 'co2_per_capita')
lowest_co2 = co2_data.nsmallest(1, 'co2_per_capita')

correlation = gapminder[['life_expectancy', 'income']].corr()
unique_regions = gapminder['region'].value_counts()

rolling_avg = gapminder['income'].rolling(2).rank(method = "average")

2014


  time_series_data = gapminder[gapminder['year'].isin(year_list)]


## Common Patterns

# Chaning operations together:
df = (gapminder.query('year == "2000"') #filter by year
      [['country', 'region', 'life_expectancy']].sort_values('life_expectancy', ascending=True)) #pull columns and sort by lowest life exp.
df

In [36]:
# Question: Which 2 countries had the fastest improvement in life expectancy between 1960 and 2004?

df_1960 = gapminder[gapminder['year'] == '1960']

df_2004 = gapminder[gapminder['year'] == '2004']

merged_df = df_1960.merge(
    df_2004,
    on='country',
    suffixes=('_1960', '_2004'))

merged_df['improvement_diff'] = (merged_df['life_expectancy_2004'] - merged_df['life_expectancy_1960'])

best_two = merged_df.nlargest(2, 'improvement_diff')

best_two

Unnamed: 0,country,year_1960,population_1960,region_1960,sub_region_1960,income_group_1960,life_expectancy_1960,income_1960,children_per_woman_1960,child_mortality_1960,...,income_group_2004,life_expectancy_2004,income_2004,children_per_woman_2004,child_mortality_2004,pop_density_2004,co2_per_capita_2004,years_in_school_men_2004,years_in_school_women_2004,improvement_diff
98,Maldives,1960-01-01,89900,Asia,Southern Asia,Upper middle,32.2,1090,7.02,340.0,...,Upper middle,73.8,8920,2.42,28.6,1030.0,2.15,6.54,5.97,41.6
32,China,1960-01-01,658000000,Asia,Eastern Asia,Upper middle,30.9,891,3.99,309.0,...,Upper middle,71.9,5170,1.55,26.3,140.0,3.98,9.55,8.03,41.0


## Self-Check Questions

1. **What's the difference between selecting columns with `df['col']` vs `df[['col']]`?**
    - df['col'] returns a pandas series, which is a 1d array.
    - df[['col']] returns a pandas dataframe.

2. **How do you combine multiple conditions in boolean indexing?**
    - df_new = df[(df['col1'] == "123") & (df['col2'] > 5)]
   - Remember: always use parentheses around each condition!


3. **What's the purpose of using `.copy()` when creating filtered datasets?**
    - We use .copy() to avoid warnings, it will change the main dataset if you filter on it. If you want to preserve the main dataset be sure to copy it first.

   
4. **How do you find the top N values in a column?**
   - df.nlargest(n, 'column')

5. **What's the basic pattern for groupby operations?**
    - df.groupby('col1')['col2'].agg(['count', 'mean'])
    - df.groupby('col1').agg({dict})

### **Applied Skills (Quiz-level questions)**

4. **Complex Filtering Challenge:**
   Write code to find countries that meet ALL these conditions:
   - Life expectancy greater than 70
   - Population greater than 5 million
   - From either Europe OR North America
   

In [43]:
region_filter = gapminder[
    (gapminder['region'] == 'Europe') |
    (gapminder['region'] == 'North America')]

life_and_pop_filter = region_filter[
    (region_filter['life_expectancy'] > 70) &
     (region_filter['population'] > 5000000)]

life_and_pop_filter['country'].unique()

array(['Austria', 'Belarus', 'Belgium', 'Bulgaria', 'Czech Republic',
       'Denmark', 'Finland', 'France', 'Germany', 'Greece', 'Hungary',
       'Italy', 'Netherlands', 'Norway', 'Poland', 'Portugal', 'Romania',
       'Russia', 'Serbia', 'Slovak Republic', 'Spain', 'Sweden',
       'Switzerland', 'Ukraine', 'United Kingdom'], dtype=object)

5. **Groupby Analysis Challenge:**
   For each region, calculate:
   - The count of countries
   - The average life expectancy
   - The country with the highest income (hint: use `.idxmax()`)

In [100]:
df = gapminder.groupby('region').agg({
    'country': 'count',
    'life_expectancy': 'mean',
    'income': 'idxmax'
})

highest_income = gapminder.groupby('country')['income'].idxmax()
highest_income

country
Afghanistan              218
Albania                  437
Algeria                  654
Angola                   871
Antigua and Barbuda     1083
                       ...  
Venezuela              38064
Vietnam                38324
Yemen                  38535
Zambia                 38712
Zimbabwe               38935
Name: income, Length: 178, dtype: int64

6. **Missing Data Strategy Challenge:**
   You have a dataset where:
   - 'essential_column' has 5% missing values
   - 'optional_column' has 40% missing values
   - 'analysis_column' has 15% missing values
   
   What strategy would you use for each column and why? Practice implementing your strategy in code.

- fill the missing values with averages of the essential column, or remove the rows with missing values entirely.
- might be worth excluding this column, as it is optional.
- either exclude the 15% from the analysis, or fill the missing values with averages.

In [101]:
# suppose life_expectancy is the essential column:

# life_exp_mean = gapminder['life_expectancy'].mean()
# gapminder['life_expectancy'].fillna(life_exp_mean, inplace = True)

# OR 

# gapminder['life_expectancy'].dropna(subset=['life_expectancy'])


# suppose region is the optional column

# gapminder.drop('region', axis = 1)

## Claude Practice Questions

In [52]:
#Find all countries where population is less than 20,000
population = gapminder[gapminder["population"] < 20000]
population["country"].unique()

array(['Kiribati', 'Qatar', 'Seychelles', 'Tonga'], dtype=object)

In [55]:
#Find European countries OR countries with life expectancy < 50
df = gapminder[(gapminder["country"] == "Europe") |
                (gapminder["life_expectancy"] < 50)]
df["country"].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus',
       'Czech Republic', 'Denmark', 'Djibouti', 'Dominican Republic',
       'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea',
       'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
       'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada',
       'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti',
       'Honduras', 'Hungary', 'Iceland', 'India', 'Indo

In [60]:
#Filter for countries in Asia with income > 10000

asia = gapminder[(gapminder["region"] == "Asia") &
                (gapminder["income"] > 10000)]
asia["country"].unique()

array(['Azerbaijan', 'Bahrain', 'China', 'Cyprus', 'Georgia', 'Indonesia',
       'Iran', 'Iraq', 'Israel', 'Japan', 'Kazakhstan', 'Kuwait',
       'Lebanon', 'Malaysia', 'Maldives', 'Mongolia', 'Oman', 'Qatar',
       'Saudi Arabia', 'Singapore', 'South Korea', 'Sri Lanka',
       'Thailand', 'Turkey', 'Turkmenistan', 'United Arab Emirates'],
      dtype=object)

Create a categorical column 'income_level' where income < 5000 = 'Low', 5000-15000 = 'Medium', >15000 = 'High'

In [63]:
gapminder = gapminder.copy()

gapminder["income_level"] = np.where(gapminder['income'] <= 5000, 'low',
                                     np.where(gapminder['income'] < 15000, 'medium', 'high'))
gapminder["income_level"].unique()

array(['low', 'medium', 'high'], dtype=object)

For each region, find the total population and average income

In [65]:
gapminder.groupby('region').agg({
    'population': 'sum',
    'income': 'mean'
})

Unnamed: 0_level_0,population,income
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,59192998600,1774.840095
Americas,63837885500,4581.39402
Asia,330133218800,4818.689012
Europe,98766930400,7975.743941
Oceania,2422277600,3775.721461


In [82]:
# Find the 5 countries with the biggest improvement in child mortality from 1960 to 2010

data_2010 = gapminder[gapminder["year"].dt.year == 2010]
data_1960 = gapminder[gapminder["year"].dt.year == 1960]

merged = data_1960.merge(data_2010, on = 'country', suffixes=('_1960', '_2010'))

merged["improvement"] = merged["child_mortality_1960"] - merged["child_mortality_2010"]

top5 = merged.nlargest(5, 'improvement')

top5["country"].unique()

array(['Oman', 'Yemen', 'Maldives', 'Mali', 'Iran'], dtype=object)

In [70]:
# Show countries where 'co2_per_capita' is NOT missing

co2 = gapminder[gapminder['co2_per_capita'].notna()]
co2["country"].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo, Dem. Rep.', 'Congo, Rep.',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus',
       'Czech Republic', 'Denmark', 'Djibouti', 'Dominican Republic',
       'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea',
       'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
       'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada',
       'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti',
       'Honduras', 'Hungary', 'Iceland', 'India', 'Indo

# -------------------------------------------------------------------------------------------

# Altair Documentation Guide

In [3]:
import pandas as pd
import altair as alt
#from vega_datasets import data


#filepath = "data/world-data-gapminder.csv"
filepath = 'https://raw.githubusercontent.com/kemiolamudzengi/dsci-320-datasets/main/world-data-gapminder.csv'

# Read in the data using pandas, remember to set parse_dates!
gm = pd.read_csv(filepath, parse_dates=["year"])

# Display basic information about the dataset
print(f"Dataset shape: {gm.shape}")
print(f"Years covered: {gm.year.dt.year.min()} to {gm.year.dt.year.max()}")
print(f"Number of countries: {gm.country.nunique()}")
print(f"Regions included: {', '.join(sorted(gm.region.unique()))}")
print(f"Column names: {sorted(gm.columns)}")



# Create a subset for 2000 data for clearer examples
data2000 = gm[gm.year == '2000']   # notice how we have to treat it as a string to search, if you change this to an integer it breaks
print(f"Year 2000 subset: {data2000.shape}")

Dataset shape: (38982, 14)
Years covered: 1800 to 2018
Number of countries: 178
Regions included: Africa, Americas, Asia, Europe, Oceania
Column names: ['child_mortality', 'children_per_woman', 'co2_per_capita', 'country', 'income', 'income_group', 'life_expectancy', 'pop_density', 'population', 'region', 'sub_region', 'year', 'years_in_school_men', 'years_in_school_women']
Year 2000 subset: (178, 14)


## Key Concepts

- alt.Chart([DATA]).mark_[TYPE]().encode([MAPPINGS])

### Data Type Suffixes
- `:N` - Nominal (categories, no order)
- `:O` - Ordinal (categories with order)  
- `:Q` - Quantitative (numbers, measurements)
- `:T` - Temporal (dates and times)

### Common Mark Types
- `.mark_point()` - Scatter plot points
- `.mark_circle()` - Filled circles. Best for when you want to emphasize individual data points
- `.mark_square()` - Filled squares
- `.mark_bar()` - Bar charts. Useful when showing distribution of categorical data
- `.mark_tick()` - Distribution marks. Useful when showing distribution of a single variable.

### Essential Encoding Channels
- `x`, `y` - Position (most important)
- `color` - Categories or intensity
- `size` - Magnitude (quantitative only)  
- `shape` - Categories only
- `opacity` - Transparency
- `column` - Separate panels for specific variable (i.e. different plots for region)
- `tooltip` - Additional information (hover mouse to see more info)
- `alt.X/Y/Color('variable', sort = 'x,y,-x,-y)` - Additional x,y edit tools
- `alt.X/Y('variable', stack = 'normalize')` - Stacked Bar chart for percentages
- `alt.Color('variable', scale = alt.Scale(scheme='category10')` - For using a specific color scheme
- `.encode().properties(width = ..., height = ...)` - Change width / height of graph

---

**How to tell if two variables are correlated by looking at a scatter plot**

**Positive Correlation**
- **Upward trend**: As X increases, Y tends to increase.
- **Tight clustering**: Points follow a clear upward line or curve.
- **Few outliers**: Most points conform to the pattern.

**Negative Correlation**
- **Downward trend**: As X increases, Y tends to decrease.
- **Tight clustering**: Points follow a clear downward line or curve.
- **Few outliers**: Most points fit the downward trend.

**No Correlation**
- **No clear trend**: Points are scattered randomly.
- **Wide spread**: No discernible line or curve.
- **Many outliers**: No obvious relationship between X and Y.
![Correlation](https://articles.outlier.org/_next/image?url=https%3A%2F%2Fimages.ctfassets.net%2Fkj4bmrik9d6o%2F2oArz66jpUDD00bOYo58e9%2F90ee20b033c2695c6884c5c652f75b81%2FOutlier_Graph_NegativeCorrelation-02.png&w=1080&q=75)

In [83]:
#corr_value = recent_data['life_expectancy'].corr(recent_data['co2_per_capita'])
#print(f"Correlation between life expectancy and carbon emissions: {corr_value:.2f}")

## Example Plots

In [4]:
# Using only the x-channel to show distribution
x_chart = alt.Chart(data2000).mark_tick().encode(
    x='children_per_woman:Q'  # Only x-position is encoded
)

# Using only the y-channel 
y_chart = alt.Chart(data2000).mark_tick().encode(
    y='life_expectancy:Q'  # Only y-position is encoded
)

x_chart | y_chart

In [5]:
nominal_color = alt.Chart(data2000).mark_point().encode(
    x='children_per_woman:Q',
    y='life_expectancy:Q',
    color='region:N'  # Color represents region membership
)
# Color with quantitative data
quantitative_color = alt.Chart(data2000).mark_point().encode(
    x='children_per_woman:Q',
    y='life_expectancy:Q',
    color='population:Q'  # Color intensity represents population
)

nominal_color | quantitative_color

In [6]:
# Shape encoding for nominal data
shape = alt.Chart(data2000).mark_point().encode(
    x='children_per_woman:Q',
    y='life_expectancy:Q',
    shape='region:N'  # Different shapes for each region
)

# Opacity encoding
opacity = alt.Chart(data2000).mark_point(filled=True).encode(
    x='children_per_woman:Q',
    y='life_expectancy:Q',
    opacity='population:Q'  # Transparency represents population
)

shape | opacity

In [7]:
# Adding tooltips
tooltip = alt.Chart(data2000).mark_point().encode(
    x='children_per_woman:Q',
    y='life_expectancy:Q',
    tooltip=['country', 'population', 'region']  # Show these fields on hover
)

tooltip

In [95]:
if np.issubdtype(gapminder['year'].dtype, np.datetime64):
    gapminder['year'] = gapminder['year'].dt.year
subset = gapminder[gapminder.year.isin([1952,1962,1972,1982,1992,2002,2012])]

grouped_bar = alt.Chart(subset).mark_bar().encode(
    x = 'year:O',
    y = 'sum(population)',
    color = 'region:N',
    column = 'region:N'
).properties(width=90)

grouped_bar

In [102]:
stacked = alt.Chart(subset).mark_bar().encode(
    x = 'year:O',
    y = 'sum(population):Q',
    color = 'region:N'
)

stackpercent = alt.Chart(subset).mark_bar().encode(
    x = 'year:O',
    y = alt.Y('sum(population):Q', stack = 'normalize'),
    color = 'region:N'
)

stacked | stackpercent

# Exercises

Exercise 1: Single Channel Encoding Create a chart showing the distribution of life expectancy using only tick marks along the y-axis.


In [8]:
#1
alt.Chart(data2000).mark_tick().encode(
    y = "life_expectancy"
)

Exercise 2: Basic Scatter Plot Create a scatter plot showing the relationship between population (population) and GDP per capita (gdpPercap).

In [18]:
#2
datacopy = data2000.copy()
datacopy["gdpPercap"] = datacopy["income"] / datacopy["population"]

alt.Chart(datacopy).mark_point().encode(
    x = "gdpPercap:Q",
    y = "population:Q",
    color = 'region:N',
)

Exercise 4: Bar Chart Practice Create a horizontal bar chart showing the count of countries by region, sorted from most to least countries.

In [17]:
alt.Chart(data2000).mark_bar().encode(
    y = alt.Y('region:N', sort='-x'),
    x = "count():Q"
)

Exercise 5: Data Type Experiment Create the same 1-dimensional chart using fertility data, but try it with both :Q (quantitative) and :N (nominal) data types. Compare the results.

In [19]:
withq = alt.Chart(data2000).mark_tick().encode(
    x='children_per_woman:Q'
)

withn = alt.Chart(data2000).mark_tick().encode(
    x='children_per_woman:N'
)

withq & withn

Create a scatter plot that reveals the relationship between carbon emissions and life expectancy using the 2014 data.

In the code cell below, write code that:

Create a visualization with the following specs:
- Use the **`circle`** mark
- Encode CO₂ per capita (`co2_per_capita`) on the **y channel**
- Encode life expectancy (`life_expectancy`) on the **x channel**
- Encode continent (`region`) on the **color channel**

**Add tooltips** showing `country`, `co2_per_capita`, and `life_expectancy`

In [29]:
recent_data = gm[gm.year == '2014']


alt.Chart(recent_data).mark_circle().encode(
    x = 'life_expectancy:Q',
    y = 'co2_per_capita:Q',
    color = 'region:N',
    tooltip = ['country', 'co2_per_capita', 'life_expectancy']
)

Create a temporal stacked bar chart showing how regional CO2 emissions have changed over decades.
 
The data wrangling has been provided for you. Create a visualization with the following specs:
- Use the `bar` mark
- Encode year (`year`) on the **x channel** as temporal data
- Encode sum of CO2 per capita (`sum(co2_per_capita)`) on the **y channel**
- Encode region (`region`) on the **color channel** with a better color scheme using `alt.Color('region:N', scale=alt.Scale(scheme='category10'))`
- Encode multiple fields on the **tooltip channel**: `year`, `region`, and `co2_per_capita`
- Set chart width to 600 pixels using `.properties(width=600)`



In [31]:
# Data Wrangling

# Convert year (int) back to datetime64
gm['year'] = pd.to_datetime(gm['year'], format='%Y')

# Filter for countries with CO2 data
co2_data = gm[gm.co2_per_capita.notna()]

# Aggregate by continent and year
co2_by_continent = co2_data.groupby(['year', 'region']).agg({
    'co2_per_capita': 'sum'
}).reset_index()


In [33]:
# Create  stacked chart

alt.Chart(co2_by_continent).mark_bar().encode(
    x = 'year:T',
    y = 'sum(co2_per_capita):Q',
    color = alt.Color('region:N', scale=alt.Scale(scheme='category10')),
    tooltip = ['year', 'region', 'co2_per_capita']
).properties(width=600)

#### Follow on
Update the viz above by normalizing the `y` channel to get a better view of the patterns



In [36]:
alt.Chart(co2_by_continent).mark_bar().encode(
    x = 'year:T',
    y = alt.Y('sum(co2_per_capita):Q', stack = 'normalize'),
    color = alt.Color('region:N', scale=alt.Scale(scheme='category10')),
    tooltip = ['year', 'region', 'co2_per_capita']
).properties(width=600)