### here we load the dataset and import the libraries

#### Keeping the imports and data loading in one cell means we can re-run just this cell, plus whatever is needed, instead of running every cell

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the global CO2 CSV (path is relative to the notebook's working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='dataset/visualizing_global_co2_data.csv')

### See how many distinct entries the `country` column has (it also contains regional aggregates such as 'Africa' and 'Asia')

In [2]:
# Distinct values of the 'country' column, computed once and reused for both prints.
# Note that the column also holds aggregate entries (e.g. 'Africa', 'Asia'), not only countries.
country_names = data['country'].unique()

# how many distinct entries there are
print(country_names.size)

# the entries themselves
print(country_names)

278
['Afghanistan' 'Africa' 'Africa (GCP)' 'Aland Islands' 'Albania' 'Algeria'
 'American Samoa' 'Andorra' 'Angola' 'Anguilla' 'Antarctica'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Asia' 'Asia (GCP)'
 'Asia (excl. China and India)' 'Australia' 'Austria' 'Azerbaijan'
 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Benin' 'Bermuda' 'Bhutan' 'Bolivia' 'Bonaire Sint Eustatius and Saba'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'British Virgin Islands'
 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon'
 'Canada' 'Cape Verde' 'Central African Republic' 'Central America (GCP)'
 'Chad' 'Chile' 'China' 'Christmas Island' 'Colombia' 'Comoros' 'Congo'
 'Cook Islands' 'Costa Rica' "Cote d'Ivoire" 'Croatia' 'Cuba' 'Curacao'
 'Cyprus' 'Czechia' 'Democratic Republic of Congo' 'Denmark' 'Djibouti'
 'Dominica' 'Dominican Republic' 'East Timor' 'Ecuador' 'Egypt'
 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini'
 'Ethiopia' 'Eu

### Find the year in which each country has the most data available, to help choose a year range to use

In [3]:
# Count the non-null entries of every column for each (country, year) pair
grouped = data.groupby(['country', 'year']).count()

# For each (country, year) pair, count how many columns hold exactly one non-null value.
# This is done with a vectorised comparison instead of a row-wise apply; the values are
# the same, but no Python-level function is called per row.
grouped['count_of_ones'] = grouped.eq(1).sum(axis=1)

# Reset the index so 'country' and 'year' become regular columns again
grouped_reset = grouped.reset_index()

# Within each country, order the years so the one with the highest count comes first
sorted_grouped = grouped_reset.sort_values(by=['country', 'count_of_ones'], ascending=[True, False])

# Keep only the first row per country, i.e. a year with the highest count for that country
# (if several years tie for the highest count, only one of them is kept)
highest_sum_of_ones = sorted_grouped.drop_duplicates(subset=['country'], keep='first')

# Count how many countries have each year as their best-covered year
year_counts = highest_sum_of_ones['year'].value_counts()

# Pick the year that is the best-covered year for the most countries
most_frequent_year = year_counts.idxmax()
most_frequent_year_count = year_counts.max()

print(f"The most frequent year is {most_frequent_year} with {most_frequent_year_count} occurrences.")

The most frequent year is 1990 with 197 occurrences.


### Look at the GDP: mean, minimum and maximum per country

In [43]:
# Keep just the columns needed for the GDP summary and drop every row with a missing value
data_test = data.loc[:, ['country', 'year', 'gdp']].dropna()

# Sanity check: how many distinct countries are left after dropping the missing rows
print(data_test['country'].unique().size)

# One groupby object is reused for all three per-country aggregations
gdp_by_country = data_test.groupby('country')['gdp']

# Mean GDP per country
average_gdp = gdp_by_country.mean()

# Row labels (in data_test) of the lowest and highest GDP of each country
min_gdp = gdp_by_country.idxmin()
max_gdp = gdp_by_country.idxmax()

# Look those rows up to recover the matching year, then rename the columns so the
# minimum and maximum can sit side by side in one table
summary_columns = ['country', 'year', 'gdp']
min_gdp_per_country = data_test.loc[min_gdp, summary_columns].rename(
    columns={'year': 'min_year', 'gdp': 'min_gdp'}
)
max_gdp_per_country = data_test.loc[max_gdp, summary_columns].rename(
    columns={'year': 'max_year', 'gdp': 'max_gdp'}
)

# Build a single table: start from the mean (index turned into a 'country' column),
# then attach the minimum and maximum rows by country
gdp_per_country = (
    average_gdp.reset_index()
    .rename(columns={'gdp': 'mean_gdp'})
    .merge(min_gdp_per_country, on='country')
    .merge(max_gdp_per_country, on='country')
)

# Show the combined summary
print(gdp_per_country)

166
         country      mean_gdp  min_year       min_gdp  max_year       max_gdp
0    Afghanistan  2.146437e+10      1994  7.919857e+09      2017  6.875280e+10
1        Albania  1.160662e+10      1870  4.287330e+08      2018  3.400911e+10
2        Algeria  1.651044e+11      1870  4.304640e+09      2018  5.903176e+11
3         Angola  3.792500e+10      1950  6.905244e+09      2016  1.783339e+11
4      Argentina  2.617168e+11      1850  2.193400e+09      2017  8.504822e+11
..           ...           ...       ...           ...       ...           ...
161      Vietnam  1.621886e+11      1870  8.475040e+09      2018  6.614884e+11
162        World  3.865245e+13      1820  1.138510e+12      2018  1.136302e+14
163        Yemen  4.583473e+10      1950  6.936333e+09      2010  1.181481e+11
164       Zambia  1.571914e+10      1950  2.690862e+09      2018  6.097862e+10
165     Zimbabwe  1.542784e+10      1950  3.186970e+09      1998  2.798427e+10

[166 rows x 6 columns]
