In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/Share of Organic Agricultural land.csv')
df

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value,Flag,Flag Description
0,RL,Land Use,4,Afghanistan,5110,Area,6671,Agriculture area under organic agric.,2020,2020,1000 ha,0.0977,I,Value imputed by a receiving agency
1,RL,Land Use,882,Samoa,7208,Share in Agricultural land,6671,Agriculture area under organic agric.,2022,2022,%,95.4700,E,Estimated value
2,RL,Land Use,4,Afghanistan,5110,Area,6671,Agriculture area under organic agric.,2021,2021,1000 ha,0.0977,I,Value imputed by a receiving agency
3,RL,Land Use,882,Samoa,7208,Share in Agricultural land,6671,Agriculture area under organic agric.,2023,2023,%,84.5900,E,Estimated value
4,RL,Land Use,4,Afghanistan,5110,Area,6671,Agriculture area under organic agric.,2022,2022,1000 ha,0.0977,I,Value imputed by a receiving agency
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1928,RL,Land Use,716,Zimbabwe,7208,Share in Agricultural land,6671,Agriculture area under organic agric.,2023,2023,%,0.0000,E,Estimated value
1929,RL,Land Use,716,Zimbabwe,5110,Area,6672,Agriculture area certified organic,2020,2020,1000 ha,1.0400,I,Value imputed by a receiving agency
1930,RL,Land Use,716,Zimbabwe,5110,Area,6672,Agriculture area certified organic,2021,2021,1000 ha,1.0199,I,Value imputed by a receiving agency
1931,RL,Land Use,716,Zimbabwe,5110,Area,6672,Agriculture area certified organic,2022,2022,1000 ha,1.0199,I,Value imputed by a receiving agency


### Normalization Step: Add ISO-alpha3 Code

In [None]:
# --- Normalization: Add ISO-alpha3 column ---

# 1. Load UNSD Methodology for mapping
try:
    unsd_df = pd.read_csv('data/UNSD — Methodology.csv', sep=';')
except FileNotFoundError:
    unsd_df = pd.read_csv('../data/UNSD — Methodology.csv', sep=';')

# 2. Create mappings
unsd_df['Country or Area'] = unsd_df['Country or Area'].astype(str).str.strip()
name_to_iso3 = unsd_df.set_index('Country or Area')['ISO-alpha3 Code'].to_dict()

m49_to_iso3 = {}
for idx, row in unsd_df.iterrows():
    try:
        # Handle potential non-numeric or NaN M49 codes
        m49_code = int(row['M49 Code'])
        m49_to_iso3[m49_code] = row['ISO-alpha3 Code']
    except (ValueError, TypeError):
        continue

# 3. Apply mapping
print("Applying ISO-alpha3 normalization...")
if 'Area Code (M49)' in df.columns:
    def get_iso(val):
        try:
            return m49_to_iso3.get(int(val), None)
        except:
            return None
    df['iso_alpha3'] = df['Area Code (M49)'].apply(get_iso)
    print("  Mapped M49 codes to 'iso_alpha3'.")
else:
    print("  Warning: Source column 'Area Code (M49)' not found.")

# Check results
missing_iso = df['iso_alpha3'].isna().sum()
if missing_iso > 0:
    print(f"  Warning: {missing_iso} rows have missing ISO-alpha3 codes.")
    print(df[df['iso_alpha3'].isna()][['Area Code (M49)']].head())


In [3]:
uniques = df.nunique()
cols_to_drop = uniques[uniques <= 1].index
df = df.drop(columns=cols_to_drop)
df

Unnamed: 0,Area Code (M49),Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value,Flag,Flag Description
0,4,Afghanistan,5110,Area,6671,Agriculture area under organic agric.,2020,2020,1000 ha,0.0977,I,Value imputed by a receiving agency
1,882,Samoa,7208,Share in Agricultural land,6671,Agriculture area under organic agric.,2022,2022,%,95.4700,E,Estimated value
2,4,Afghanistan,5110,Area,6671,Agriculture area under organic agric.,2021,2021,1000 ha,0.0977,I,Value imputed by a receiving agency
3,882,Samoa,7208,Share in Agricultural land,6671,Agriculture area under organic agric.,2023,2023,%,84.5900,E,Estimated value
4,4,Afghanistan,5110,Area,6671,Agriculture area under organic agric.,2022,2022,1000 ha,0.0977,I,Value imputed by a receiving agency
...,...,...,...,...,...,...,...,...,...,...,...,...
1928,716,Zimbabwe,7208,Share in Agricultural land,6671,Agriculture area under organic agric.,2023,2023,%,0.0000,E,Estimated value
1929,716,Zimbabwe,5110,Area,6672,Agriculture area certified organic,2020,2020,1000 ha,1.0400,I,Value imputed by a receiving agency
1930,716,Zimbabwe,5110,Area,6672,Agriculture area certified organic,2021,2021,1000 ha,1.0199,I,Value imputed by a receiving agency
1931,716,Zimbabwe,5110,Area,6672,Agriculture area certified organic,2022,2022,1000 ha,1.0199,I,Value imputed by a receiving agency


In [4]:
df = df.drop(columns=['Element Code','Item Code','Year Code','Flag', 'Flag Description'])
df

Unnamed: 0,Area Code (M49),Area,Element,Item,Year,Unit,Value
0,4,Afghanistan,Area,Agriculture area under organic agric.,2020,1000 ha,0.0977
1,882,Samoa,Share in Agricultural land,Agriculture area under organic agric.,2022,%,95.4700
2,4,Afghanistan,Area,Agriculture area under organic agric.,2021,1000 ha,0.0977
3,882,Samoa,Share in Agricultural land,Agriculture area under organic agric.,2023,%,84.5900
4,4,Afghanistan,Area,Agriculture area under organic agric.,2022,1000 ha,0.0977
...,...,...,...,...,...,...,...
1928,716,Zimbabwe,Share in Agricultural land,Agriculture area under organic agric.,2023,%,0.0000
1929,716,Zimbabwe,Area,Agriculture area certified organic,2020,1000 ha,1.0400
1930,716,Zimbabwe,Area,Agriculture area certified organic,2021,1000 ha,1.0199
1931,716,Zimbabwe,Area,Agriculture area certified organic,2022,1000 ha,1.0199


In [5]:
df = df[df['Element'] == 'Share in Agricultural land'].reset_index(drop=True)
df

Unnamed: 0,Area Code (M49),Area,Element,Item,Year,Unit,Value
0,882,Samoa,Share in Agricultural land,Agriculture area under organic agric.,2022,%,95.47
1,882,Samoa,Share in Agricultural land,Agriculture area under organic agric.,2023,%,84.59
2,882,Samoa,Share in Agricultural land,Agriculture area under organic agric.,2020,%,82.96
3,882,Samoa,Share in Agricultural land,Agriculture area under organic agric.,2021,%,82.96
4,438,Liechtenstein,Share in Agricultural land,Agriculture area under organic agric.,2023,%,31.18
...,...,...,...,...,...,...,...
669,512,Oman,Share in Agricultural land,Agriculture area under organic agric.,2021,%,0.00
670,512,Oman,Share in Agricultural land,Agriculture area under organic agric.,2022,%,0.00
671,512,Oman,Share in Agricultural land,Agriculture area under organic agric.,2023,%,0.00
672,894,Zambia,Share in Agricultural land,Agriculture area under organic agric.,2020,%,0.00


In [6]:
uniques = df.nunique()
cols_to_drop = uniques[uniques <= 1].index
df = df.drop(columns=cols_to_drop)
df

Unnamed: 0,Area Code (M49),Area,Year,Value
0,882,Samoa,2022,95.47
1,882,Samoa,2023,84.59
2,882,Samoa,2020,82.96
3,882,Samoa,2021,82.96
4,438,Liechtenstein,2023,31.18
...,...,...,...,...
669,512,Oman,2021,0.00
670,512,Oman,2022,0.00
671,512,Oman,2023,0.00
672,894,Zambia,2020,0.00


In [7]:
df.rename(columns={'Value': 'Share of Organic Agricultural land (%)'}, inplace=True)
df

Unnamed: 0,Area Code (M49),Area,Year,Share of Organic Agricultural land (%)
0,882,Samoa,2022,95.47
1,882,Samoa,2023,84.59
2,882,Samoa,2020,82.96
3,882,Samoa,2021,82.96
4,438,Liechtenstein,2023,31.18
...,...,...,...,...
669,512,Oman,2021,0.00
670,512,Oman,2022,0.00
671,512,Oman,2023,0.00
672,894,Zambia,2020,0.00


In [8]:
df = df.sort_values(by='Area')
df

Unnamed: 0,Area Code (M49),Area,Year,Share of Organic Agricultural land (%)
643,4,Afghanistan,2020,0.00
646,4,Afghanistan,2023,0.00
645,4,Afghanistan,2022,0.00
644,4,Afghanistan,2021,0.00
552,8,Albania,2022,0.06
...,...,...,...,...
672,894,Zambia,2020,0.00
641,716,Zimbabwe,2021,0.01
640,716,Zimbabwe,2020,0.01
642,716,Zimbabwe,2022,0.01


In [9]:
df.rename(columns={'Area': 'Country Name'}, inplace=True)
df

Unnamed: 0,Area Code (M49),Country Name,Year,Share of Organic Agricultural land (%)
643,4,Afghanistan,2020,0.00
646,4,Afghanistan,2023,0.00
645,4,Afghanistan,2022,0.00
644,4,Afghanistan,2021,0.00
552,8,Albania,2022,0.06
...,...,...,...,...
672,894,Zambia,2020,0.00
641,716,Zimbabwe,2021,0.01
640,716,Zimbabwe,2020,0.01
642,716,Zimbabwe,2022,0.01
