In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

# Country Configuration laden
import sys
sys.path.append('./utils')
from country_config import country_colors, g7, eu_core, extra_countries, get_selected_countries

# Plotting-Konfiguration
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

print("Bibliotheken und Country Configuration geladen.")

Bibliotheken und Country Configuration geladen.


In [2]:
# CO2 & ENERGIE DATENANALYSE

In [7]:
# OWID CO2 & Energy Dataset Understanding
df_co2_energy = pd.read_csv('../data/processed/co2_energy_data.csv')

print(f"CO2 & Energy Dataset analysiert: {df_co2_energy.shape}")
print(f"Zeitraum: {df_co2_energy['Year'].min()} - {df_co2_energy['Year'].max()}")
print(f"Länder: {df_co2_energy['Country'].nunique()}")

# Deskriptive Statistik
key_vars = ['CO2_total', 'CO2_per_capita', 'Primary_energy', 'Population', 'GDP']
print(f"\nDESKRIPTIVE STATISTIK")
desc_stats = df_co2_energy[key_vars].describe()
display(desc_stats)

# Zeitliche Datenverfügbarkeit
year_coverage = df_co2_energy.groupby('Year').size()
print(f"\nZeitliche Datenverfügbarkeit:")
print(f"Durchschnittliche Länder pro Jahr: {year_coverage.mean():.1f}")
print(f"Datenverfügbarkeit: {year_coverage.min()} - {year_coverage.max()} Länder pro Jahr")

# Top CO2-Emittenten Analyse
latest_year = df_co2_energy['Year'].max()
latest_data = df_co2_energy[df_co2_energy['Year'] == latest_year].copy()

print(f"\nCO2-EMISSIONEN ANALYSE ({latest_year})")

# Strategische Länderauswahl anwenden
selected_countries = get_selected_countries(df_co2_energy, 'CO2_total')
print(f"Strategisch ausgewählte Länder für Analyse: {len(selected_countries)}")
print(f"G7: {g7}")
print(f"EU Core: {eu_core}")
print(f"Extra Countries: {extra_countries}")

# Top 15 global vs. strategische Länder
top_emitters_global = latest_data.nlargest(15, 'CO2_total')[['Country', 'CO2_total', 'CO2_per_capita']]
print(f"\nTop 15 CO2-Emittenten global:")
display(top_emitters_global)

# Strategische Länder in den Daten prüfen
selected_data = latest_data[latest_data['Country'].isin(selected_countries)]
print(f"\nStrategische Länder in den Daten: {len(selected_data)} von {len(selected_countries)}")
if len(selected_data) > 0:
    selected_emissions = selected_data[['Country', 'CO2_total', 'CO2_per_capita']].sort_values('CO2_total', ascending=False)
    print(f"CO2-Emissionen der strategischen Länder:")
    display(selected_emissions)

# CO2-Quellen Analyse
co2_sources = ['CO2_coal', 'CO2_oil', 'CO2_gas', 'CO2_cement', 'CO2_flaring', 'CO2_other']
available_sources = [col for col in co2_sources if col in df_co2_energy.columns]

if available_sources:
    print(f"\nCO2-Emissionen nach Quellen (globale Summen):")
    source_totals = latest_data[available_sources].sum().sort_values(ascending=False)
    for source, total in source_totals.items():
        percentage = (total / source_totals.sum()) * 100
        print(f"  {source}: {total:.1f} Mt ({percentage:.1f}%)")

# Energie-Analyse
energy_countries = df_co2_energy[df_co2_energy['Primary_energy'].notna()]
print(f"\nENERGIE-ANALYSE")
print(f"Länder mit Energiedaten: {energy_countries['Country'].nunique()}")

if len(energy_countries) > 0:
    top_energy = latest_data.nlargest(10, 'Primary_energy')[['Country', 'Primary_energy', 'Energy_per_capita']]
    print(f"Top 10 Primärenergie-Verbraucher:")
    display(top_energy)

# Zeitliche Trends  
print(f"\nZEITLICHE ENTWICKLUNG")
# Globale Summen durch Aggregation aller Länder
global_trends = df_co2_energy.groupby('Year')['CO2_total'].sum()
if len(global_trends) > 0:
    co2_start = global_trends.iloc[0]
    co2_end = global_trends.iloc[-1]
    growth_rate = ((co2_end / co2_start) - 1) * 100
    
    print(f"Globale CO2-Entwicklung (aggregiert):")
    print(f"  {global_trends.index[0]}: {co2_start:.1f} Mt")
    print(f"  {global_trends.index[-1]}: {co2_end:.1f} Mt")
    print(f"  Gesamtwachstum: {growth_rate:.1f}%")

CO2 & Energy Dataset analysiert: (12862, 20)
Zeitraum: 1960 - 2018
Länder: 218

DESKRIPTIVE STATISTIK


Unnamed: 0,CO2_total,CO2_per_capita,Primary_energy,Population,GDP
count,12173.0,12102.0,8673.0,12744.0,9343.0
mean,106.05964,5.322128,589.358992,24347080.0,313115900000.0
std,512.499997,12.651802,2404.597315,101894600.0,1175574000000.0
min,0.0,0.0,0.0,552.0,70560000.0
25%,0.566,0.46225,5.683,456010.2,10055940000.0
50%,5.164,2.1185,50.25,4234795.0,35770130000.0
75%,42.162,6.74775,300.531,13494640.0,166059600000.0
max,10332.992,376.469,38409.645,1419009000.0,22294290000000.0



Zeitliche Datenverfügbarkeit:
Durchschnittliche Länder pro Jahr: 218.0
Datenverfügbarkeit: 218 - 218 Länder pro Jahr

CO2-EMISSIONEN ANALYSE (2018)
Strategisch ausgewählte Länder für Analyse: 19
G7: ['United States', 'Germany', 'France', 'Canada', 'Japan', 'United Kingdom']
EU Core: ['Spain', 'Poland', 'Netherlands', 'Sweden']
Extra Countries: ['China', 'India', 'Russia', 'Brazil', 'Vietnam', 'Malaysia', 'Singapore', 'United Arab Emirates']

Top 15 CO2-Emittenten global:


Unnamed: 0,Country,CO2_total,CO2_per_capita
2477,China,10332.992,7.282
12271,United States,5377.797,16.05
5368,India,2593.058,1.886
9498,Russia,1712.494,11.695
5840,Japan,1142.481,9.002
4542,Germany,760.046,9.117
5486,Iran,689.783,8.01
10796,South Korea,670.169,12.977
10088,Saudi Arabia,637.487,20.994
5427,Indonesia,594.101,2.201



Strategische Länder in den Daten: 19 von 19
CO2-Emissionen der strategischen Länder:


Unnamed: 0,Country,CO2_total,CO2_per_capita
2477,China,10332.992,7.282
12271,United States,5377.797,16.05
5368,India,2593.058,1.886
9498,Russia,1712.494,11.695
5840,Japan,1142.481,9.002
4542,Germany,760.046,9.117
2182,Canada,579.079,15.527
1710,Brazil,478.379,2.321
12212,United Kingdom,379.73,5.689
9262,Poland,336.069,8.788



CO2-Emissionen nach Quellen (globale Summen):
  CO2_coal: 14740.6 Mt (41.6%)
  CO2_oil: 11000.3 Mt (31.0%)
  CO2_gas: 7469.9 Mt (21.1%)
  CO2_cement: 1554.3 Mt (4.4%)
  CO2_flaring: 407.1 Mt (1.1%)
  CO2_other: 303.2 Mt (0.9%)

ENERGIE-ANALYSE
Länder mit Energiedaten: 205
Top 10 Primärenergie-Verbraucher:


Unnamed: 0,Country,Primary_energy,Energy_per_capita
2477,China,38409.645,27104.984
12271,United States,26768.986,80595.484
5368,India,9088.107,6638.485
9498,Russia,8432.208,57892.727
5840,Japan,5316.556,42109.379
2182,Canada,4090.168,110439.844
4542,Germany,3787.682,45691.598
10796,South Korea,3503.051,67787.555
1710,Brazil,3475.968,16539.109
10088,Saudi Arabia,3101.677,88573.469



ZEITLICHE ENTWICKLUNG
Globale CO2-Entwicklung (aggregiert):
  1960: 9148.8 Mt
  2018: 35475.3 Mt
  Gesamtwachstum: 287.8%


In [4]:
# WIRTSCHAFTSINDIKATOREN DATENANALYSE

In [9]:
# WDI Economic Indicators Understanding  
df_economic = pd.read_csv('../data/processed/economic_indicators.csv')

print(f"Economic Indicators Dataset analysiert: {df_economic.shape}")
print(f"Zeitraum: {df_economic['Year'].min()} - {df_economic['Year'].max()}")
print(f"Länder: {df_economic['Country'].nunique()}")

# Wirtschaftsentwicklung
econ_latest_year = df_economic['Year'].max()
econ_latest = df_economic[df_economic['Year'] == econ_latest_year]

print(f"\nWIRTSCHAFTSENTWICKLUNG ({econ_latest_year})")

# GDP-Analysen
gdp_cols = [col for col in df_economic.columns if 'GDP' in col]
print(f"Verfügbare GDP-Indikatoren: {gdp_cols}")

# Debugging: GDP-Daten prüfen
print(f"\nDebugging GDP-Daten für {econ_latest_year}:")
print(f"Gesamt Länder: {len(econ_latest)}")
print(f"Länder mit GDP-Daten: {econ_latest['GDP_current_USD'].notna().sum()}")

if 'GDP_current_USD' in df_economic.columns:
    # Alle Jahre mit GDP-Daten prüfen
    gdp_by_year = df_economic.groupby('Year')['GDP_current_USD'].count()
    print(f"GDP-Verfügbarkeit nach Jahren (letzte 5):")
    for year in sorted(gdp_by_year.index)[-5:]:
        print(f"  {year}: {gdp_by_year[year]} Länder")
    
    # Bestes Jahr für GDP-Analyse finden
    best_gdp_year = gdp_by_year.idxmax()
    best_year_data = df_economic[df_economic['Year'] == best_gdp_year]
    
    print(f"\nBestes Jahr für GDP-Analyse: {best_gdp_year} ({gdp_by_year[best_gdp_year]} Länder)")
    
    # Top 10 Volkswirtschaften aus bestem Jahr
    top_economies = best_year_data.nlargest(10, 'GDP_current_USD')[['Country', 'GDP_current_USD', 'GDP_per_capita_USD']]
    if len(top_economies) > 0:
        print(f"Top 10 Volkswirtschaften ({best_gdp_year}):")
        display(top_economies)
    else:
        print("Keine GDP-Daten verfügbar in WDI-Dataset")
else:
    print("GDP_current_USD Spalte nicht gefunden")

# Wirtschaftsstruktur
structure_cols = ['Industry_share_GDP', 'Manufacturing_share_GDP', 'Services_share_GDP']
available_structure = [col for col in structure_cols if col in df_economic.columns]

print(f"\nWIRTSCHAFTSSTRUKTUR")
if available_structure:
    structure_stats = df_economic[available_structure].describe()
    print(f"Wirtschaftsstruktur-Statistik (% des BIP):")
    display(structure_stats)

# Urbanisierung
if 'Urban_population_pct' in df_economic.columns:
    print(f"\nURBANISIERUNG")
    urban_stats = df_economic['Urban_population_pct'].describe()
    print(f"Urbanisierung - Deskriptive Statistik:")
    print(f"  Median: {urban_stats['50%']:.1f}%")
    print(f"  Durchschnitt: {urban_stats['mean']:.1f}%")
    print(f"  Spannweite: {urban_stats['min']:.1f}% - {urban_stats['max']:.1f}%")
    
    # Debugging: Urban-Daten für verschiedene Jahre prüfen
    urban_by_year = df_economic.groupby('Year')['Urban_population_pct'].count()
    best_urban_year = urban_by_year.idxmax()
    print(f"Bestes Jahr für Urbanisierungs-Analyse: {best_urban_year} ({urban_by_year[best_urban_year]} Länder)")
    
    # Extremwerte aus bestem Jahr
    best_urban_data = df_economic[df_economic['Year'] == best_urban_year]
    latest_urban = best_urban_data[best_urban_data['Urban_population_pct'].notna()]
    
    if len(latest_urban) > 0:
        print(f"Extremwerte Urbanisierung ({best_urban_year}):")
        print(f"Höchste:")
        display(latest_urban.nlargest(5, 'Urban_population_pct')[['Country', 'Urban_population_pct']])
        print(f"Niedrigste:")
        display(latest_urban.nsmallest(5, 'Urban_population_pct')[['Country', 'Urban_population_pct']])
    else:
        print("Keine Urbanisierungsdaten verfügbar")

# Handel und Globalisierung
if 'Trade_share_GDP' in df_economic.columns:
    print(f"\nHANDEL UND GLOBALISIERUNG")
    trade_stats = df_economic['Trade_share_GDP'].describe()
    print(f"Handelsintensität (% des BIP):")
    print(f"  Median: {trade_stats['50%']:.1f}%")
    print(f"  Durchschnitt: {trade_stats['mean']:.1f}%")
    print(f"  Maximum: {trade_stats['max']:.1f}%")

Economic Indicators Dataset analysiert: (12695, 12)
Zeitraum: 1960 - 2018
Länder: 220

WIRTSCHAFTSENTWICKLUNG (2018)
Verfügbare GDP-Indikatoren: ['GDP_current_USD', 'GDP_growth_annual', 'GDP_per_capita_USD', 'Industry_share_GDP', 'Services_share_GDP', 'Trade_share_GDP']

Debugging GDP-Daten für 2018:
Gesamt Länder: 1
Länder mit GDP-Daten: 0
GDP-Verfügbarkeit nach Jahren (letzte 5):
  2014: 206 Länder
  2015: 205 Länder
  2016: 203 Länder
  2017: 197 Länder
  2018: 0 Länder

Bestes Jahr für GDP-Analyse: 2005 (208 Länder)
Top 10 Volkswirtschaften (2005):


Unnamed: 0,Country,GDP_current_USD,GDP_per_capita_USD
12132,United States,13093730000000.0,44307.920585
5840,Japan,4755411000000.0,37217.648728
4332,Germany,2861410000000.0,34696.620917
12074,United Kingdom,2520702000000.0,41732.64054
2424,China,2285966000000.0,1753.417829
4042,France,2196126000000.0,34760.187766
5724,Italy,1852662000000.0,31959.262952
2076,Canada,1169358000000.0,36189.588384
10624,Spain,1157276000000.0,26510.717453
6188,"Korea, Rep.",898137200000.0,18639.522205



WIRTSCHAFTSSTRUKTUR
Wirtschaftsstruktur-Statistik (% des BIP):


Unnamed: 0,Industry_share_GDP,Services_share_GDP
count,7502.0,6839.0
mean,26.705643,50.030741
std,12.787149,13.399808
min,2.073173,4.791639
25%,18.108612,40.779807
50%,24.882311,49.878316
75%,32.064024,59.123874
max,90.512958,99.971188



URBANISIERUNG
Urbanisierung - Deskriptive Statistik:
  Median: 49.1%
  Durchschnitt: 50.5%
  Spannweite: 2.1% - 100.0%
Bestes Jahr für Urbanisierungs-Analyse: 1990 (218 Länder)
Extremwerte Urbanisierung (1990):
Höchste:


Unnamed: 0,Country,Urban_population_pct
1248,Bermuda,100.0
2119,Cayman Islands,100.0
4433,Gibraltar,100.0
7681,Monaco,100.0
8087,Nauru,100.0


Niedrigste:


Unnamed: 0,Country,Urban_population_pct
9595,Rwanda,5.416
1829,Burundi,6.271
8145,Nepal,8.854
11885,Uganda,11.076
7043,Malawi,11.56



HANDEL UND GLOBALISIERUNG
Handelsintensität (% des BIP):
  Median: 68.1%
  Durchschnitt: 78.0%
  Maximum: 860.8%


In [8]:
# Vollständigkeit (also nicht fehlend)
co2_completeness = df_co2.notna().mean().sort_values(ascending=False) * 100
print("Vollständigkeit der CO2/Energie-Features:")
print(co2_completeness.round(1))


Vollständigkeit der CO2/Energie-Features:
Country                      100.0
ISO3                         100.0
Year                         100.0
Population                    99.1
CO2_total                     94.6
CO2_per_capita                94.1
CO2_oil                       94.0
CO2_cement                    91.9
GDP                           72.6
CO2_intensity                 71.8
CO2_intensity_luc             70.9
Primary_energy                67.4
Energy_per_capita             67.1
CO2_per_energy                66.7
CO2_coal                      63.8
CO2_flaring                   60.4
Energy_intensity              55.3
CO2_gas                       49.5
CO2_intensity_consumption     26.7
CO2_other                     10.9
dtype: float64


In [9]:
# Vollständigkeit berechnen (%)
econ_completeness = df_economic.notna().mean().sort_values(ascending=False) * 100

# Ergebnis anzeigen
print("Vollständigkeit im Wirtschaftsindikatoren-Datensatz (in %):")
print(econ_completeness.round(1))

Vollständigkeit im Wirtschaftsindikatoren-Datensatz (in %):
Country                 100.0
ISO3                    100.0
Year                    100.0
Population_total         99.7
Urban_population_pct     99.1
CO2_per_capita_WDI       78.9
GDP_current_USD          75.4
GDP_per_capita_USD       75.3
GDP_growth_annual        72.1
Trade_share_GDP          66.8
Industry_share_GDP       59.1
Services_share_GDP       53.9
dtype: float64


In [10]:
import pandas as pd

# Daten laden
df_co2 = pd.read_csv('../data/processed/co2_energy_data.csv')
df_economic = pd.read_csv('../data/processed/economic_indicators.csv')

# Ziel-Länder
target_countries = [
    'China', 'United States', 'India', 'Germany', 'Japan',
    'France', 'Brazil', 'United Kingdom', 'South Korea', 'Russia'
]

# Vollständigkeit je Land und Feature (%)
co2_availability = df_co2.groupby('Country').apply(lambda g: g.notna().mean() * 100)
econ_availability = df_economic.groupby('Country').apply(lambda g: g.notna().mean() * 100)

# Ziel-Länder extrahieren
co2_selected = co2_availability.loc[co2_availability.index.intersection(target_countries)].round(1)
econ_selected = econ_availability.loc[econ_availability.index.intersection(target_countries)].round(1)

# Anzeigen
print("CO₂-Datenverfügbarkeit pro Land (in %):")
display(co2_selected)

print("\nWirtschaftsindikatoren pro Land (in %):")
display(econ_selected)

CO₂-Datenverfügbarkeit pro Land (in %):


  co2_availability = df_co2.groupby('Country').apply(lambda g: g.notna().mean() * 100)
  econ_availability = df_economic.groupby('Country').apply(lambda g: g.notna().mean() * 100)


Unnamed: 0_level_0,Country,ISO3,Year,Population,GDP,CO2_total,CO2_per_capita,CO2_coal,CO2_oil,CO2_gas,CO2_cement,CO2_flaring,CO2_other,Primary_energy,Energy_per_capita,Energy_intensity,CO2_intensity,CO2_per_energy,CO2_intensity_luc,CO2_intensity_consumption
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Brazil,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,49.2,91.5,91.5,91.5,100.0,91.5,100.0,49.2
China,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,81.4,100.0,91.5,91.5,91.5,100.0,91.5,100.0,49.2
France,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,81.4,49.2,91.5,91.5,91.5,100.0,91.5,100.0,49.2
Germany,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,54.2,49.2,91.5,91.5,91.5,100.0,91.5,100.0,49.2
India,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,98.3,100.0,96.6,0.0,91.5,91.5,91.5,100.0,91.5,100.0,49.2
Japan,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,83.1,49.2,91.5,91.5,91.5,100.0,91.5,100.0,49.2
Russia,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,49.2,57.6,57.6,57.6,100.0,57.6,100.0,49.2
South Korea,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,64.4,100.0,42.4,49.2,91.5,91.5,91.5,100.0,91.5,100.0,49.2
United Kingdom,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,49.2,91.5,91.5,91.5,100.0,91.5,100.0,49.2
United States,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,91.5,91.5,91.5,100.0,91.5,100.0,49.2



Wirtschaftsindikatoren pro Land (in %):


Unnamed: 0_level_0,Country,ISO3,Year,CO2_per_capita_WDI,GDP_current_USD,GDP_growth_annual,GDP_per_capita_USD,Industry_share_GDP,Population_total,Services_share_GDP,Trade_share_GDP,Urban_population_pct
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Brazil,100.0,100.0,100.0,93.2,98.3,96.6,98.3,98.3,98.3,100.0,98.3,98.3
China,100.0,100.0,100.0,94.8,100.0,98.3,100.0,100.0,100.0,24.1,100.0,100.0
France,100.0,100.0,100.0,94.8,100.0,98.3,100.0,100.0,100.0,82.8,100.0,100.0
Germany,100.0,100.0,100.0,41.4,82.8,81.0,82.8,46.6,100.0,46.6,82.8,100.0
India,100.0,100.0,100.0,94.8,100.0,98.3,100.0,100.0,100.0,100.0,100.0,100.0
Japan,100.0,100.0,100.0,94.8,100.0,98.3,100.0,39.7,100.0,39.7,100.0,100.0
United Kingdom,100.0,100.0,100.0,94.8,100.0,98.3,100.0,48.3,100.0,48.3,100.0,100.0
United States,100.0,100.0,100.0,94.8,100.0,98.3,100.0,34.5,100.0,34.5,100.0,100.0


In [19]:
import pandas as pd

# Daten laden
df_co2 = pd.read_csv('../data/processed/co2_energy_data.csv')
df_economic = pd.read_csv('../data/processed/economic_indicators.csv')

# Nur Daten für China
china_co2 = df_co2[df_co2['Country'] == 'China']
china_econ = df_economic[df_economic['Country'] == 'China']

# Vollständigkeit berechnen
co2_comp = china_co2.notna().mean() * 100
econ_comp = china_econ.notna().mean() * 100

# Nur Features mit ≥90% anzeigen
co2_comp_filtered = co2_comp[co2_comp >= 90].round(1)
econ_comp_filtered = econ_comp[econ_comp >= 90].round(1)

# In DataFrames umwandeln
df_co2_final = pd.DataFrame(co2_comp_filtered, columns=["Vollständigkeit_CO2 (%)"])
df_econ_final = pd.DataFrame(econ_comp_filtered, columns=["Vollständigkeit_Economic (%)"])

# Ausgabe
print("CO₂-Daten (≥90% vollständig):")
display(df_co2_final)

print("Wirtschaftsindikatoren (≥90% vollständig):")
display(df_econ_final)


CO₂-Daten (≥90% vollständig):


Unnamed: 0,Vollständigkeit_CO2 (%)
Country,100.0
ISO3,100.0
Year,100.0
Population,100.0
GDP,100.0
CO2_total,100.0
CO2_per_capita,100.0
CO2_coal,100.0
CO2_oil,100.0
CO2_gas,100.0


Wirtschaftsindikatoren (≥90% vollständig):


Unnamed: 0,Vollständigkeit_Economic (%)
Country,100.0
ISO3,100.0
Year,100.0
CO2_per_capita_WDI,94.8
GDP_current_USD,100.0
GDP_growth_annual,98.3
GDP_per_capita_USD,100.0
Industry_share_GDP,100.0
Population_total,100.0
Trade_share_GDP,100.0


In [20]:
import pandas as pd

# Datensätze laden
df_co2 = pd.read_csv('../data/processed/co2_energy_data.csv')
df_economic = pd.read_csv('../data/processed/economic_indicators.csv')

# Nur China-Daten extrahieren
china_co2 = df_co2[df_co2['Country'] == 'China']
china_econ = df_economic[df_economic['Country'] == 'China']

# Relevante Spalten aus beiden Datensätzen
co2_columns = [
    'Year', 'Population', 'GDP', 'CO2_total', 'CO2_per_capita', 'CO2_coal', 'CO2_oil', 'CO2_gas', 
    'CO2_cement', 'CO2_other', 'Primary_energy', 'Energy_per_capita', 'Energy_intensity', 
    'CO2_intensity', 'CO2_per_energy', 'CO2_intensity_luc'
]

econ_columns = [
    'Year', 'Population_total', 'GDP_current_USD', 'GDP_growth_annual', 'GDP_per_capita_USD',
    'Industry_share_GDP', 'Trade_share_GDP', 'Urban_population_pct'
]

# Relevante Spalten extrahieren
china_co2_filtered = china_co2[co2_columns]
china_econ_filtered = china_econ[econ_columns]

# DataFrames über 'Year' mergen
df_china = pd.merge(china_co2_filtered, china_econ_filtered, on='Year', how='inner')

# Doppelte/Redundante Spalten entfernen
df_china = df_china.drop(columns=['Population_total', 'GDP_current_USD'])

# Nach Jahr sortieren
df_china = df_china.sort_values('Year').reset_index(drop=True)

# Ergebnis anzeigen
print("Vorhersage-DataFrame für China:")
display(df_china)

Vorhersage-DataFrame für China:


Unnamed: 0,Year,Population,GDP,CO2_total,CO2_per_capita,CO2_coal,CO2_oil,CO2_gas,CO2_cement,CO2_other,...,Energy_per_capita,Energy_intensity,CO2_intensity,CO2_per_energy,CO2_intensity_luc,GDP_growth_annual,GDP_per_capita_USD,Industry_share_GDP,Trade_share_GDP,Urban_population_pct
0,1960,654802100.0,705093000000.0,798.8,1.22,748.376,22.992,1.993,6.514,18.925,...,,,1.133,,1.722,,89.520542,44.391538,8.7341,16.203
1,1961,655773500.0,577128400000.0,570.63,0.87,522.475,22.336,2.818,2.585,20.417,...,,,0.989,,2.593,-27.27,75.805838,31.932159,7.360221,16.708
2,1962,665073700.0,616503000000.0,459.618,0.691,411.925,21.764,2.323,2.497,21.108,...,,,0.746,,2.375,-5.58,70.909412,31.311306,6.960936,17.226
3,1963,684552600.0,705534400000.0,456.779,0.667,406.129,23.27,1.957,3.355,22.069,...,,,0.647,,2.251,10.3,74.313643,33.068974,6.865337,17.757
4,1964,704798300.0,804505000000.0,460.637,0.654,401.574,27.729,2.03,5.032,24.271,...,,,0.573,,1.797,18.18,85.498555,35.328934,6.633104,18.299
5,1965,723415700.0,896126800000.0,500.286,0.692,432.993,34.999,2.11,6.801,23.383,...,2122.266,1.714,0.558,0.326,2.005,16.95,98.486778,35.092272,6.828143,18.086
6,1966,742402200.0,927339400000.0,549.459,0.74,469.765,44.547,2.568,8.387,24.192,...,2246.869,1.8,0.593,0.329,1.691,10.65,104.324566,37.877905,6.729496,17.915
7,1967,761026000.0,933378400000.0,460.226,0.605,383.599,42.499,2.799,6.085,25.244,...,1972.478,1.608,0.493,0.307,1.271,-5.77,96.589532,33.886969,6.253483,17.785
8,1968,781011000.0,912372800000.0,495.507,0.634,412.72,48.651,2.686,5.253,26.197,...,1943.555,1.662,0.543,0.327,1.452,-4.1,91.472718,31.110601,6.220974,17.656
9,1969,802140300.0,1006176000000.0,607.683,0.758,501.177,66.85,3.759,7.613,28.285,...,2302.365,1.834,0.604,0.329,2.12,16.94,100.129903,35.419427,5.453063,17.528


In [37]:
import pandas as pd

# Daten laden
df_co2 = pd.read_csv('../data/processed/co2_energy_data.csv')
df_economic = pd.read_csv('../data/processed/economic_indicators.csv')

# Nur Daten für Germany
china_co2 = df_co2[df_co2['Country'] == 'Germany']
china_econ = df_economic[df_economic['Country'] == 'Germany']

# Vollständigkeit berechnen
co2_comp = china_co2.notna().mean() * 100
econ_comp = china_econ.notna().mean() * 100

# Nur Features mit ≥90% anzeigen
co2_comp_filtered = co2_comp[co2_comp >= 90].round(1)
econ_comp_filtered = econ_comp[econ_comp >= 90].round(1)

# In DataFrames umwandeln
df_co2_final = pd.DataFrame(co2_comp_filtered, columns=["Vollständigkeit_CO2 (%)"])
df_econ_final = pd.DataFrame(econ_comp_filtered, columns=["Vollständigkeit_Economic (%)"])

# Ausgabe
print("CO₂-Daten (≥90% vollständig):")
display(df_co2_final)

print("Wirtschaftsindikatoren (≥90% vollständig):")
display(df_econ_final)

CO₂-Daten (≥90% vollständig):


Unnamed: 0,Vollständigkeit_CO2 (%)
Country,100.0
ISO3,100.0
Year,100.0
Population,100.0
GDP,100.0
CO2_total,100.0
CO2_per_capita,100.0
CO2_coal,100.0
CO2_oil,100.0
CO2_gas,100.0


Wirtschaftsindikatoren (≥90% vollständig):


Unnamed: 0,Vollständigkeit_Economic (%)
Country,100.0
ISO3,100.0
Year,100.0
Population_total,100.0
Urban_population_pct,100.0


In [38]:
import pandas as pd

# Lokale Pfade zu deinen Dateien
df_co2 = pd.read_csv('../data/processed/co2_energy_data.csv')
df_economic = pd.read_csv('../data/processed/economic_indicators.csv')

# Zielländer
selected_countries = [
    "China", "USA", "India", "Germany", "Japan", "France", "Brazil", 
    "United Kingdom", "Vietnam", "Spain", "Indonesia", "Canada"
]

# Filter auf die Zielländer
df_co2_sel = df_co2[df_co2['Country'].isin(selected_countries)]
df_econ_sel = df_economic[df_economic['Country'].isin(selected_countries)]

# Prozentuale Vollständigkeit je Feature pro Land
co2_availability = df_co2_sel.groupby("Country").apply(lambda g: g.notna().mean() * 100)
econ_availability = df_econ_sel.groupby("Country").apply(lambda g: g.notna().mean() * 100)

# Durchschnittliche Verfügbarkeit über alle Länder hinweg
mean_co2 = co2_availability.mean().sort_values(ascending=False)
mean_econ = econ_availability.mean().sort_values(ascending=False)

# Nur die Features mit >=90% mittlerer Vollständigkeit
co2_features_90 = mean_co2[mean_co2 >= 90]
econ_features_90 = mean_econ[mean_econ >= 90]

# Ausgabe zur Übersicht
print("CO₂-Features mit ≥90% durchschnittlicher Verfügbarkeit über alle Länder:")
print(co2_features_90)
print("\nWirtschafts-Features mit ≥90% durchschnittlicher Verfügbarkeit über alle Länder:")
print(econ_features_90)

CO₂-Features mit ≥90% durchschnittlicher Verfügbarkeit über alle Länder:
Country              100.000000
ISO3                 100.000000
Year                 100.000000
Population           100.000000
GDP                  100.000000
CO2_total            100.000000
CO2_per_capita       100.000000
CO2_coal             100.000000
CO2_oil              100.000000
CO2_cement           100.000000
CO2_intensity        100.000000
CO2_intensity_luc    100.000000
CO2_gas               99.845917
CO2_per_energy        91.525424
Primary_energy        91.525424
Energy_per_capita     91.525424
Energy_intensity      91.525424
dtype: float64

Wirtschafts-Features mit ≥90% durchschnittlicher Verfügbarkeit über alle Länder:
Country                 100.000000
ISO3                    100.000000
Year                    100.000000
Urban_population_pct     99.845917
Population_total         99.845917
Trade_share_GDP          94.203284
GDP_per_capita_USD       93.262845
GDP_current_USD          93.262845
GDP_gr

  co2_availability = df_co2_sel.groupby("Country").apply(lambda g: g.notna().mean() * 100)
  econ_availability = df_econ_sel.groupby("Country").apply(lambda g: g.notna().mean() * 100)


In [1]:
### DataFrame für China

In [4]:
import pandas as pd
df_co2_energy = pd.read_csv('../data/processed/co2_energy_data.csv')
df_economic = pd.read_csv('../data/processed/economic_indicators.csv')

# 1. Auf China filtern
df_co2_china = df_co2_energy[df_co2_energy['Country'] == 'China']
df_econ_china = df_economic[df_economic['Country'] == 'China']

# 2. Mergen nach Jahr (inner join)
df_china = pd.merge(df_co2_china, df_econ_china, on=['Country', 'ISO3', 'Year'], how='inner')

# 3. Nur relevante Spalten auswählen (Ausgewählt nach Vollständigkeit)
columns_to_keep = [
    'Year', 'CO2_total', 'CO2_coal', 'CO2_oil', 'CO2_gas', 'CO2_cement',
    'Primary_energy', 'Energy_intensity', 'Population', 'GDP',
    'Urban_population_pct', 'Trade_share_GDP', 'GDP_growth_annual'
]
df_china = df_china[columns_to_keep].dropna()

# 4. Nach Jahr sortieren
df_china = df_china.sort_values('Year').reset_index(drop=True)

df_china.to_csv('../data/processed/china_model_dataset.csv', index=False)
print("Datenset wurde erfolgreich erstellt")

Datenset wurde erfolgreich erstellt
