#### Import required libraries

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Set seeds for reproducibility (important for portfolio).
# Both generators are seeded because this notebook draws from NumPy's global
# RNG (np.random.*) as well as the standard-library `random` module.
np.random.seed(42)
random.seed(42)

#### Define Constants 

In [4]:
# --- Dataset configuration ---

# Total number of synthetic survey records to generate
n_rows = 1500

# Sampling frame: countries and their selection probabilities, kept as
# parallel lists (population / weights) for random.choices()
countries = ['Ethiopia', 'Kenya', 'Zambia']
country_weights = [0.50, 0.30, 0.20]

# Regions that can be drawn for each country
regions = dict(
    Ethiopia=['Oromia', 'Amhara'],
    Kenya=['Rift Valley', 'Western', 'Central'],
    Zambia=['Central', 'Eastern', 'Southern'],
)

# Realistic yield bounds in kg/ha (smallholder rainfed), as (min, max) per crop
yield_ranges = dict(
    Maize=(600, 3500),
    Teff=(400, 1800),
    Sorghum=(500, 2200),
    Beans=(400, 1500),
    Wheat=(800, 3000),
)

# Crop names, in the same order as the yield table above
crops = list(yield_ranges)

# Field agents (enumerators), KoBo/SurveyCTO style: ENUM001 .. ENUM050
agents = [f'ENUM{i:03d}' for i in range(1, 51)]

#### GPS Generator Function 

In [5]:
def random_gps(country):
    if country == 'Ethiopia':
        lat = np.random.uniform(6.0, 14.0)  # Oromia/Amhara focus
        lon = np.random.uniform(34.0, 42.0)
    elif country == 'Kenya':
        lat = np.random.uniform(-1.0, 1.5)   # Rift Valley / Western / Central
        lon = np.random.uniform(34.0, 38.0)
    else:  # Zambia
        lat = np.random.uniform(-18.0, -8.0)  # Central/Eastern/Southern
        lon = np.random.uniform(25.0, 33.0)
    
    # Simulate ~6% invalid entries (typos, poor signal)
    if random.random() < 0.06:
        if random.random() < 0.5:
            lat += np.random.uniform(-20, 20)   # nonsense latitude
        else:
            lon += np.random.uniform(-40, 40)   # nonsense longitude
    return round(lat, 6), round(lon, 6)

#### Main Data Generation Loop

In [6]:
# Accumulates one dict per synthetic survey record.
data = []

# Fixed anchor for survey dates. Anchoring on datetime.now() would make the
# generated dates drift from run to run even though both RNGs are seeded, so
# the dataset would not be reproducible. Adjust this date to move the window.
reference_date = datetime(2026, 2, 1)

for i in range(n_rows):
    # Country selection (weighted), then a region within that country
    country = random.choices(countries, weights=country_weights)[0]
    region = random.choice(regions[country])

    # District & village - simple realistic placeholders
    district = f'{region[:3]}-Dist{random.randint(1, 40):02d}'
    village = f'Vil-{random.randint(100, 999)}'

    # Survey date: recent (2023–2026), 30 to 1095 days before the anchor date
    days_ago = random.randint(30, 1095)
    survey_date = reference_date - timedelta(days=days_ago)

    lat, lon = random_gps(country)

    # Crop (Teff only in Ethiopia)
    if country == 'Ethiopia':
        crop = random.choice(crops)
    else:
        crop = random.choice([c for c in crops if c != 'Teff'])

    # Yield: normal draw centred on the midpoint of the crop's range with a
    # standard deviation of one eighth of the range width, floored at 0.
    min_y, max_y = yield_ranges[crop]
    yield_val = np.random.normal((min_y + max_y)/2, (max_y - min_y)/8)
    yield_val = max(0, round(yield_val, 0))

    # Missing yield ~12%
    if random.random() < 0.12:
        yield_val = np.nan

    # Outliers ~4% (a missing value stays NaN after the multiplication)
    if random.random() < 0.04:
        yield_val *= random.uniform(2.5, 5.0)

    # Insurance: premium is only charged to insured farmers
    insured = random.choices(['Yes', 'No'], weights=[0.40, 0.60])[0]
    premium_usd = round(random.uniform(8, 35), 2) if insured == 'Yes' else 0.00

    # Claims can only trigger for insured farmers (18% chance)
    claim_triggered = 'Yes' if (insured == 'Yes' and random.random() < 0.18) else 'No'
    payout_usd = 0.00
    if claim_triggered == 'Yes':
        payout_usd = round(random.uniform(100, 500), 2)
    # Rare errors: payout without trigger
    if random.random() < 0.015 and claim_triggered == 'No':
        payout_usd = round(random.uniform(50, 350), 2)

    agent = random.choice(agents)

    # Farmer ID - some duplicates ~5%
    farmer_id = f'F-{country[:3]}-{random.randint(10000, 99999)}'
    if random.random() < 0.05 and data:
        # Reuse the ID of a random earlier record. Picking the record directly
        # selects the same index as choosing from a freshly built list of all
        # IDs, without rebuilding that list on every duplicate.
        farmer_id = random.choice(data)['farmer_id']

    # Optional enumerator note (often blank)
    notes = random.choices(
        ['', 'Drought reported', 'Good rains this season', 'GPS weak signal'],
        weights=[0.70, 0.10, 0.10, 0.10]
    )[0]

    row = {
        'farmer_id': farmer_id,
        'country': country,
        'region': region,
        'district': district,
        'village': village,
        'survey_date': survey_date.strftime('%Y-%m-%d'),
        'gps_latitude': lat,
        'gps_longitude': lon,
        'crop': crop,
        'farm_size_ha': round(np.random.uniform(0.3, 5.0), 2),
        'yield_last_season_kg_ha': yield_val,
        'insured': insured,
        'premium_paid_usd': premium_usd,
        'claim_triggered': claim_triggered,
        'payout_amount_usd': payout_usd,
        'field_agent_id': agent,
        'enumerator_notes': notes
    }
    data.append(row)

#### Create DataFrame and shuffle

In [7]:
# Build the frame from the generated records, shuffle the row order with a
# fixed seed, and renumber the index from 0.
df = (
    pd.DataFrame(data)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"Dataset generated: {df.shape[0]} rows, {df.shape[1]} columns")

Dataset generated: 1500 rows, 17 columns


#### Inspection

In [8]:
# Quick view: display the first 8 rows of the shuffled dataset
df.head(8)

Unnamed: 0,farmer_id,country,region,district,village,survey_date,gps_latitude,gps_longitude,crop,farm_size_ha,yield_last_season_kg_ha,insured,premium_paid_usd,claim_triggered,payout_amount_usd,field_agent_id,enumerator_notes
0,F-Zam-77258,Zambia,Central,Cen-Dist35,Vil-508,2024-07-27,-16.324335,27.482468,Sorghum,3.78,1657.0,No,0.0,No,0.0,ENUM034,
1,F-Eth-32245,Ethiopia,Oromia,Oro-Dist12,Vil-917,2024-06-02,11.331258,36.283233,Wheat,4.84,1646.0,No,0.0,No,0.0,ENUM001,
2,F-Eth-36922,Ethiopia,Oromia,Oro-Dist03,Vil-912,2023-02-24,12.494158,39.251829,Beans,4.66,,Yes,26.02,No,0.0,ENUM039,
3,F-Zam-18665,Zambia,Eastern,Eas-Dist19,Vil-267,2023-08-26,-11.197722,25.333383,Wheat,0.7,1714.0,Yes,13.61,No,0.0,ENUM022,
4,F-Zam-83207,Zambia,Eastern,Eas-Dist13,Vil-186,2023-09-15,-13.900951,31.718418,Wheat,2.1,2060.0,No,0.0,No,0.0,ENUM002,
5,F-Eth-55890,Ethiopia,Oromia,Oro-Dist33,Vil-885,2024-12-19,8.088388,36.747094,Sorghum,4.22,,No,0.0,No,0.0,ENUM012,GPS weak signal
6,F-Ken-90010,Kenya,Rift Valley,Rif-Dist40,Vil-562,2024-06-27,0.26114,35.248759,Maize,2.31,2638.0,No,0.0,No,0.0,ENUM032,
7,F-Eth-85452,Ethiopia,Oromia,Oro-Dist05,Vil-706,2025-03-04,7.434026,39.551597,Teff,1.38,1163.0,Yes,29.52,No,0.0,ENUM022,


#### Summary statistics

In [9]:
# Summary of every column: include='all' adds count/unique/top/freq for the
# text columns alongside the numeric statistics.
df.describe(include='all')

Unnamed: 0,farmer_id,country,region,district,village,survey_date,gps_latitude,gps_longitude,crop,farm_size_ha,yield_last_season_kg_ha,insured,premium_paid_usd,claim_triggered,payout_amount_usd,field_agent_id,enumerator_notes
count,1500,1500,1500,1500,1500,1500,1500.0,1500.0,1500,1500.0,1331.0,1500,1500.0,1500,1500.0,1500,1500.0
unique,1421,3,7,271,739,806,,,5,,,2,,2,,50,4.0
top,F-Eth-65723,Ethiopia,Oromia,Oro-Dist08,Vil-943,2025-10-17,,,Maize,,,No,,No,,ENUM043,
freq,3,748,393,19,8,6,,,343,,,912,,1400,,45,1065.0
mean,,,,,,,2.461512,35.633324,,2.6399,1677.139177,,8.5087,,22.66024,,
std,,,,,,,9.079917,5.066329,,1.372582,984.733981,,11.647753,,82.523963,,
min,,,,,,,-31.357092,-2.718131,,0.31,546.0,,0.0,,0.0,,
25%,,,,,,,-0.659641,34.35989,,1.45,1087.5,,0.0,,0.0,,
50%,,,,,,,4.630633,36.209725,,2.61,1511.0,,0.0,,0.0,,
75%,,,,,,,9.750881,38.060027,,3.8225,1986.5,,17.7825,,0.0,,


In [10]:
# Missing values: number of null entries per column
df.isnull().sum()

farmer_id                    0
country                      0
region                       0
district                     0
village                      0
survey_date                  0
gps_latitude                 0
gps_longitude                0
crop                         0
farm_size_ha                 0
yield_last_season_kg_ha    169
insured                      0
premium_paid_usd             0
claim_triggered              0
payout_amount_usd            0
field_agent_id               0
enumerator_notes             0
dtype: int64

In [12]:
# Frequency checks on the key categorical columns
country_share = df['country'].value_counts(normalize=True)   # proportions
crop_counts = df['crop'].value_counts()                      # raw counts
insured_share = df['insured'].value_counts(normalize=True)   # proportions

print("Country distribution:\n", country_share)
print("\nCrop distribution:\n", crop_counts)
print("\nInsured distribution:\n", insured_share)

Country distribution:
 country
Ethiopia    0.498667
Kenya       0.304667
Zambia      0.196667
Name: proportion, dtype: float64

Crop distribution:
 crop
Maize      343
Sorghum    339
Beans      336
Wheat      331
Teff       151
Name: count, dtype: int64

Insured distribution:
 insured
No     0.608
Yes    0.392
Name: proportion, dtype: float64


#### Export

In [13]:
# Write the dataset to CSV (without the index column) and report where it went
output_file = 'farmer_survey_2026.csv'
df.to_csv(output_file, index=False)
print(
    f"\nDataset saved to: {output_file}",
    "Ready for Phase 2: Data Cleaning & Quality Checks",
    sep="\n",
)


Dataset saved to: farmer_survey_2026.csv
Ready for Phase 2: Data Cleaning & Quality Checks
