In [1]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
import altair as alt
import geopandas as gpd
import matplotlib.pyplot as plt
import json
import altair as alt
import dtmapi
import statsmodels.api as sm
from statsmodels.formula.api import ols
from fuzzywuzzy import fuzz, process
from hdx.utilities.easy_logging import setup_logging
from hdx.api.configuration import Configuration
from hdx.data.dataset import Dataset
import os



### IOM DTM

In [3]:
Configuration.create(hdx_site="prod", user_agent="PP434", hdx_read_only=True)

'https://data.humdata.org'

In [4]:
datasets_ipc = Dataset.read_from_hdx("sudan-displacement-situation-countrywide-idps-iom-dtm")
resources = datasets_ipc.get_resources()

for res in resources:
    url, path = res.download('data_processing/dtm/')
    print(f"Resource URL {url} downloaded to {path}")

Resource URL https://data.humdata.org/dataset/44594ae2-dde9-417f-acae-523bc012c162/resource/d641cae6-ba35-416d-9a4f-731d886315d6/download/dtm_sdn_smu-bi-weekly-13-_-17122024_v02_public_hdx.xlsx downloaded to data_processing/dtm/Sudan - Countrywide Mobility Update 13 (Bi-Weekly)1.xlsx
Resource URL https://data.humdata.org/dataset/44594ae2-dde9-417f-acae-523bc012c162/resource/4ea6ae19-4080-405b-bf8a-3003e8ed5341/download/dtm_sdn_smu-bi-weekly-12-_-03122024_v02_public_hdx.xlsx downloaded to data_processing/dtm/Sudan - Countrywide Mobility Update 12 (Bi-Weekly)1.xlsx
Resource URL https://data.humdata.org/dataset/44594ae2-dde9-417f-acae-523bc012c162/resource/2a0a6ab7-722a-4aea-bd31-dd0d949ea7dd/download/dtm_sdn_smu-bi-weekly-11-_-20112024_v02_public_hdx.xlsx downloaded to data_processing/dtm/Sudan - Countrywide Mobility Update 11 (Bi-Weekly)1.xlsx
Resource URL https://data.humdata.org/dataset/44594ae2-dde9-417f-acae-523bc012c162/resource/7a7d1e2f-6411-4e0a-b51c-f1a05a1ed5ca/download/dtm_sdn

In [5]:
idp_admin_data = dtmapi.get_idp_admin2_data(CountryName='Sudan',  FromReportingDate='2021-01-01', ToReportingDate='2025-01-01', to_pandas=True)

In [6]:
# read data 
idp_adm1_2024 = pd.read_excel('data_processing/dtm/Sudan - Countrywide Mobility Update 13 (Bi-Weekly).xlsx', sheet_name='MASTER LIST (ADMIN1)', skiprows=2)
idp_adm2_2024 = pd.read_excel('data_processing/dtm/Sudan - Countrywide Mobility Update 13 (Bi-Weekly).xlsx', sheet_name='MASTER LIST (ADMIN2)', skiprows=2)

rename_dict = {
    'STATE OF DISPLACEMET': 'adm1_name',
    'STATE CODE': 'adm1_pcode',
    'LOCALITY OF DISPLACEMENT': 'adm2_name',
    'LOCALITY_CODE': 'adm2_pcode',
    'LOCALITY_ CODE': 'adm2_pcode',
    'HHs': 'affected_idps_hh',
    'IDPs': 'affected_idps_ind'
}


idp_adm1_2024.rename(columns=rename_dict, inplace=True)
#idp_adm1_2023.rename(columns=rename_dict, inplace=True)
#idp_adm2_2023.rename(columns=rename_dict, inplace=True)
#idp_adm2_2024.rename(columns=rename_dict, inplace=True)

In [7]:
idp_origin_adm1_2024 = idp_adm1_2024.iloc[1:]
#idp_origin_adm1_2023 = idp_adm1_2023.iloc[1:]
#idp_origin_adm2_2023 = idp_adm2_2023.iloc[1:]
#idp_origin_adm2_2024 = idp_adm2_2024.iloc[1:]

In [8]:
idp_origin_adm1_2024.loc[19] = idp_origin_adm1_2024.iloc[:, 2:].sum()
idp_origin_adm1_2024
idp_origin_adm1_2024.loc[19, 'adm1_name'] = 'Total'

In [9]:
state_columns = idp_adm1_2024.columns[4:-2].tolist()
idp_adm1_2024_state = idp_origin_adm1_2024.melt(
        id_vars=['adm1_name', 'adm1_pcode',  'affected_idps_ind', 'affected_idps_hh'],
        value_vars=state_columns,
        var_name='state_origin',
        value_name='affected_idps_state'
    )

In [10]:
idp_origin_2024 = idp_adm1_2024_state[idp_adm1_2024_state['adm1_name'] == 'Total']

### IPC 

GeoJson data

In [11]:
api_key = '3bdbeef9-fc4c-4315-a665-cc56deb11be6'
base_url = 'https://api.ipcinfo.org/'

# Create directory if it doesn't exist
os.makedirs('data_processing/ipc', exist_ok=True)

# Fetch IPC analyses
ipc = requests.get(f'{base_url}analyses?country=SD&type=A&periods=C&key={api_key}').json()

for item in ipc:
    # Try fetching current data ('C') first
    response = requests.get(f"{base_url}areas/{item['id']}/C?format=geojson&key={api_key}")
    
    if response.status_code == 200:
        save_name = f"data_processing/ipc/ipc_geojson_C_{item['id']}.json"
    else:
        print(f"Current data not available for item {item['id']}, fetching projections (P)")
        response = requests.get(f"{base_url}areas/{item['id']}/P?format=geojson&key={api_key}")
        save_name = f"data_processing/ipc/ipc_geojson_P_{item['id']}.json"
    
    if response.status_code == 200:
        try:
            ipc_geojson = response.json()
            with open(save_name, 'w') as f:
                json.dump(ipc_geojson, f)
            print(f"Saved {save_name}")
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for item {item['id']}: {e}")
        except Exception as e:
            print(f"Error saving file for item {item['id']}: {e}")
    else:
        print(f"Failed to fetch data for item {item['id']}: HTTP {response.status_code}")


Saved data_processing/ipc/ipc_geojson_C_14192889.json
Saved data_processing/ipc/ipc_geojson_C_18151797.json
Current data not available for item 19191589, fetching projections (P)
Saved data_processing/ipc/ipc_geojson_P_19191589.json
Saved data_processing/ipc/ipc_geojson_C_24004104.json
Saved data_processing/ipc/ipc_geojson_C_25857808.json
Saved data_processing/ipc/ipc_geojson_C_58836462.json
Current data not available for item 64768802, fetching projections (P)
Failed to fetch data for item 64768802: HTTP 404
Saved data_processing/ipc/ipc_geojson_C_68887616.json
Current data not available for item 74795267, fetching projections (P)
Saved data_processing/ipc/ipc_geojson_P_74795267.json


In [12]:
#ipc_2024 = requests.get(f"https://api.ipcinfo.org/areas/68887616/C?format=json&key=").json()
#ipc_2024_proj = requests.get(f"https://api.ipcinfo.org/areas/68887616/C?format=json&key=").json()

HDX API

In [14]:
datasets_ipc = Dataset.read_from_hdx("sudan-acute-food-insecurity-country-data")
resources = datasets_ipc.get_resources()

ipc_sdn_adm1 = [res for res in resources if res.get('description') == 'All IPC level 1 data in long form with HXL tags']
ipc_sdn_adm2 = [res for res in resources if res.get('description') == 'All IPC area data in long form with HXL tags']


url1 = ipc_sdn_adm1[0]['alt_url']
url2 = ipc_sdn_adm2[0]['alt_url']

# Download the file
response = requests.get(url1)
with open('data_processing/ipc_sdn_adm1.csv', 'wb') as f:
	f.write(response.content)
	
response = requests.get(url2)
with open('data_processing/ipc_sdn_adm2.csv', 'wb') as f:
	f.write(response.content)



In [15]:
ipc_sdn_adm2 = pd.read_csv('data_processing/ipc_sdn_adm2.csv')
ipc_sdn_adm2 = ipc_sdn_adm2.iloc[1:]
ipc_sdn_adm1 = pd.read_csv('data_processing/ipc_sdn_adm1.csv')
ipc_sdn_adm1 = ipc_sdn_adm1.iloc[1:]

In [16]:
ipc_sdn_adm2 = ipc_sdn_adm2[~ipc_sdn_adm2['Area'].str.contains('IDPs')]
ipc_sdn_adm2 = ipc_sdn_adm2[~ipc_sdn_adm2['Area'].str.contains('IDP')]
ipc_sdn_adm2 = ipc_sdn_adm2[~ipc_sdn_adm2['Area'].str.contains('Refugees')]
ipc_sdn_adm2 = ipc_sdn_adm2[~ipc_sdn_adm2['Area'].str.contains('refugees')]

ipc_sdn_adm1 = ipc_sdn_adm1[~ipc_sdn_adm1['Level 1'].str.contains('IDPs')]
ipc_sdn_adm1 = ipc_sdn_adm1[~ipc_sdn_adm1['Level 1'].str.contains('IDP')]
ipc_sdn_adm1 = ipc_sdn_adm1[~ipc_sdn_adm1['Level 1'].str.contains('Refugees')]
ipc_sdn_adm1 = ipc_sdn_adm1[~ipc_sdn_adm1['Level 1'].str.contains('refugees')]

### ACLED

In [17]:
acled = pd.read_csv('data_processing/ACLED_2020-01-01-2024-12-31-Sudan.csv')

In [18]:
acled = acled[acled['year'] != 2020]
acled['event_date'] = pd.to_datetime(acled['event_date'])
acled['month_year'] = acled['event_date'].dt.to_period('M')
acled['week'] = acled['event_date'].dt.to_period('W').apply(lambda r: r.start_time)

In [19]:
#Monthly Data
acled_monthly= acled.groupby(['month_year','event_type']).agg({'fatalities':'sum', 'event_date':'count'}).reset_index()

#remove riots
acled_monthly = acled_monthly[acled_monthly['event_type'] != 'Riots']
acled_monthly = acled_monthly[acled_monthly['event_type'] != 'Strategic developments']
acled_monthly['event_type'] = acled_monthly['event_type'].replace('Explosions/Remote violence', 'Explosions')

acled_monthly.to_csv('data_processing/acled_monthly.csv', index=False)
acled_monthly['month_year'] = acled_monthly['month_year'].astype(str)

#Weekly Data
#acled_weekly = acled.groupby(['week', 'event_type']).agg({'fatalities':'sum', 'event_date':'count'}).reset_index()
#acled_weekly['week'] = acled_weekly['week'].astype(str)

### COMBINED Analysis

In [20]:
acled_admin2 = acled[(acled['month_year'] >= '2023-04')& (acled['month_year'] <= '2024-05')]
acled_admin2= acled_admin2.groupby('admin2').agg({'fatalities': 'sum', 'event_id_cnty': 'count'}).reset_index()
acled_admin2.reset_index(drop=True, inplace=True)

acled_admin1 = acled[(acled['month_year'] >= '2023-04') & (acled['month_year'] <= '2024-05')]
acled_admin1 = acled_admin1.groupby('admin1').agg({'fatalities': 'sum', 'event_id_cnty': 'count'}).reset_index()
acled_admin1.reset_index(drop=True, inplace=True)

In [21]:
ipc_sdn_adm2_2024 = ipc_sdn_adm2[(ipc_sdn_adm2['Validity period'] == 'current') & (ipc_sdn_adm2['Date of analysis'].str.contains('Apr 2024'))]
ipc_sdn_adm2_2024 = ipc_sdn_adm2_2024[ipc_sdn_adm2_2024['Phase']== '3+']

#splitting areas
ipc_sdn_adm2_2024 = ipc_sdn_adm2_2024.assign(Area=ipc_sdn_adm2_2024['Area'].str.split(r'[,&]')).explode('Area').reset_index(drop=True)
ipc_sdn_adm2_2024['Area'] = ipc_sdn_adm2_2024['Area'].str.strip()


ipc_sdn_adm1_2024 = ipc_sdn_adm1[(ipc_sdn_adm1['Validity period'] == 'current') & (ipc_sdn_adm1['Date of analysis'].str.contains('Apr 2024'))]
ipc_sdn_adm1_2024 = ipc_sdn_adm1_2024[ipc_sdn_adm1_2024['Phase']== '3+']

In [22]:
idp_admin2_2024_may = idp_admin_data[(idp_admin_data['yearReportingDate'] == 2024) & (idp_admin_data['monthReportingDate'] == 5) & (idp_admin_data['operation'] == 'Armed Clashes in Sudan (Overview)') ]
idp_admin2_2024_may = idp_admin2_2024_may.dropna(subset=['admin2Name'])

idp_admin1_2024_may = idp_admin_data[(idp_admin_data['yearReportingDate'] == 2024) & (idp_admin_data['monthReportingDate'] == 5) & (idp_admin_data['operation'] == 'Armed Clashes in Sudan (Overview)') ]
idp_admin1_2024_may = idp_admin1_2024_may.groupby('admin1Name').agg({'numPresentIdpInd':'sum'}).reset_index()

Matching Admin 2 columns names

In [23]:
def clean_location(location):
    """Clean location names by removing common prefixes/suffixes and standardizing format."""
    location = str(location).lower().strip()
    prefixes_to_remove = ['al ', 'el ', 'ar ', 'at ', 'um ', 'ad ', 'as ', 'aj ']
    for prefix in prefixes_to_remove:
        if location.startswith(prefix):
            location = location[len(prefix):]
    return location.strip()

def create_lookup_dict(locations):
    """Create a dictionary mapping cleaned names to original names."""
    return {clean_location(loc): loc for loc in locations if pd.notna(loc)}

def find_best_match(source_loc, target_dict, threshold=80):
    """Find the best matching location from target_dict for source_loc."""
    source_cleaned = clean_location(source_loc)
    best_match = None
    best_score = 0
    best_original = None
    
    for target_cleaned, target_original in target_dict.items():
        # Try different fuzzy matching algorithms
        ratio = fuzz.ratio(source_cleaned, target_cleaned)
        partial_ratio = fuzz.partial_ratio(source_cleaned, target_cleaned)
        token_sort_ratio = fuzz.token_sort_ratio(source_cleaned, target_cleaned)
        
        # Use the highest score among different matching methods
        score = max(ratio, partial_ratio, token_sort_ratio)
        
        if score > best_score and score >= threshold:
            best_score = score
            best_match = target_cleaned
            best_original = target_original
    
    return best_original, best_score

def match_locations(iom_locations, acled_locations, ipc_locations):
    """Match locations from IOM and ACLED to IPC locations."""
    results = []
    ipc_lookup = create_lookup_dict(ipc_locations)
    
    # Process IOM locations
    for iom_loc in iom_locations:
        if pd.isna(iom_loc):
            continue
        ipc_match, iom_score = find_best_match(iom_loc, ipc_lookup)
        if ipc_match:  # Only add if we found a match
            results.append({
                'Source': 'IOM',
                'Original_Location': iom_loc,
                'IPC_Original_Name': ipc_match,
                'Match_Score': iom_score
            })
    
    # Process ACLED locations
    for acled_loc in acled_locations:
        if pd.isna(acled_loc):
            continue
        ipc_match, acled_score = find_best_match(acled_loc, ipc_lookup)
        if ipc_match:  # Only add if we found a match
            results.append({
                'Source': 'ACLED',
                'Original_Location': acled_loc,
                'IPC_Original_Name': ipc_match,
                'Match_Score': acled_score
            })
    
    # Convert results to DataFrame
    df_results = pd.DataFrame(results)
    
    # Sort by match score descending
    df_results = df_results.sort_values('Match_Score', ascending=False)
    
    return df_results

# Extract unique locations from each dataset
iom_locations = idp_admin2_2024_may['admin2Name'].dropna().unique()
acled_locations = acled_admin2['admin2'].dropna().unique()
ipc_locations = ipc_sdn_adm2_2024['Area'].dropna().unique()

# Perform matching
matches_df = match_locations(iom_locations, acled_locations, ipc_locations)

# Create mapping dictionaries for each source
iom_mapping = matches_df[matches_df['Source'] == 'IOM'].set_index('Original_Location')['IPC_Original_Name'].to_dict()
acled_mapping = matches_df[matches_df['Source'] == 'ACLED'].set_index('Original_Location')['IPC_Original_Name'].to_dict()

# Apply mappings to original dataframes
idp_admin2_2024_may['admin2Name_IPC'] = idp_admin2_2024_may['admin2Name'].map(iom_mapping)
acled_admin2['admin2_IPC'] = acled_admin2['admin2'].map(acled_mapping)

In [24]:
# List of admin2Name_IPC values to be set to NaN
admin2_to_nan = [
    'Al Ganab', 'Kas', 'Reifi Shamal Ad Delta', 'Ar Rahad', 'As Salam - WK', 
    'Reifi Nahr Atbara', 'Gharb Bara', 'Foro Baranga', 'Reifi Khashm Elgirba', 
    'Wasat Jabal Marrah', 'Gharb Jabal Marrah', 'Al Kamlin', 'Reifi Gharb Kassala', 
    'Al Lait', 'Atbara', 'Sharg Sennar'
]

# Update the dataframe
idp_admin2_2024_may.loc[idp_admin2_2024_may['admin2Name'].isin(admin2_to_nan), 'admin2Name_IPC'] = np.nan

# if admin2Name = Bara, then admin2Name_IPC = Bara
idp_admin2_2024_may.loc[idp_admin2_2024_may['admin2Name'] == 'Bara', 'admin2Name_IPC'] = 'Bara'

In [25]:
# List of admin2Name_IPC values to be set to NaN
admin2_to_nan = ['Al Buram', 'Kas', 'Gharb Jabal Marrah',
'Wasat Jabal Marrah' , 'Reifi Gharb Kassala', 'Reifi Shamal Ad Delta', 'Sharg Sennar']

# Update the dataframe
acled_admin2.loc[acled_admin2['admin2'].isin(admin2_to_nan), 'admin2Name_IPC'] = np.nan


In [26]:
# Save mapping results
matches_df.to_csv('data_processing/location_matches_with_originals.csv', index=False)
idp_admin2_2024_may.to_csv('data_processing/idp_admin2_2024_may_mapped.csv', index=False)
acled_admin2.to_csv('data_processing/acled_admin2_mapped.csv', index=False)


### IPC + ACLED

In [27]:
#Adm 1
acled_admin1['admin1'] = acled_admin1['admin1'].replace('Al Jazirah', 'Aj Jazirah')
acled_ipc_adm1 = pd.merge(acled_admin1, ipc_sdn_adm1_2024, left_on=['admin1'], right_on=['Level 1'], how='inner')

In [28]:
#Adm 2
acled_ipc_adm2 = pd.merge(acled_admin2, ipc_sdn_adm2_2024, left_on=['admin2_IPC'], right_on=['Area'], how='inner')

In [29]:
#save acled_ipc_adm2 to csv
acled_ipc_adm2.to_csv('data_processing/acled_ipc_adm2.csv', index=False)

### IPC + IDP

In [30]:
#Adm 2
idp_ipc_adm2 = pd.merge(idp_admin2_2024_may, ipc_sdn_adm2_2024, left_on=['admin2Name_IPC'], right_on=['Area'], how='inner')
idp_ipc_adm1 = pd.merge(idp_admin1_2024_may, ipc_sdn_adm1_2024, left_on=['admin1Name'], right_on=['Level 1'], how='inner')

In [31]:
acled_idp_ipc_adm2 = pd.merge(acled_ipc_adm2, idp_ipc_adm2, left_on=['Area'], right_on=['Area'], how='inner')

acled_idp_ipc_adm2 = acled_idp_ipc_adm2[['Area', 'Level 1_x','fatalities', 'event_id_cnty', 'numPresentIdpInd', 'Number_x']]
acled_idp_ipc_adm2.rename(columns={'Number_x':'phase3plus'}, inplace=True)
acled_idp_ipc_adm2.rename(columns={'Level 1_x':'Level 1'}, inplace=True)

In [32]:
#reset the Area combinations
acled_idp_ipc_adm2 = acled_idp_ipc_adm2.groupby('phase3plus').agg({
    'Area': lambda x: ', '.join(x),
    'fatalities': 'sum',
    'event_id_cnty': 'sum',
    'numPresentIdpInd': 'sum',
    'Level 1': 'first'
}).reset_index()


acled_idp_ipc_adm2.to_csv('data_processing/acled_idp_ipc_adm2.csv', index=False)
acled_idp_ipc_adm2.to_csv('data/acled_idp_ipc_adm2.csv', index=False)

In [33]:
acled_idp_ipc_adm1 = pd.merge(acled_ipc_adm1, idp_ipc_adm1, left_on=['Level 1'], right_on=['Level 1'], how='outer')
acled_idp_ipc_adm1 = pd.merge(acled_idp_ipc_adm1, idp_origin_2024, left_on=['Level 1'], right_on=['state_origin'], how='outer')

acled_idp_ipc_adm1 = acled_idp_ipc_adm1[['Level 1', 'fatalities', 'event_id_cnty', 'numPresentIdpInd', 'Number_x', 'affected_idps_state']]
acled_idp_ipc_adm1.rename(columns={'Number_x':'phase3plus'}, inplace=True)
acled_idp_ipc_adm1.to_csv('data_processing/acled_idp_ipc_adm1.csv', index=False)
acled_idp_ipc_adm1.to_csv('data/acled_idp_ipc_adm1.csv', index=False)

In [40]:
#Admin 1 correlations
print(acled_idp_ipc_adm1['phase3plus'].corr(acled_idp_ipc_adm1['event_id_cnty']))
print(acled_idp_ipc_adm1['phase3plus'].corr(acled_idp_ipc_adm1['numPresentIdpInd']))
print(acled_idp_ipc_adm1['phase3plus'].corr(acled_idp_ipc_adm1['fatalities']))

0.6452683691222222
0.17711988087371816
0.48353427830695067


In [41]:
#Admin 1 correlations
print(acled_idp_ipc_adm2['phase3plus'].corr(acled_idp_ipc_adm2['event_id_cnty']))
print(acled_idp_ipc_adm2['phase3plus'].corr(acled_idp_ipc_adm2['numPresentIdpInd']))
print(acled_idp_ipc_adm2['phase3plus'].corr(acled_idp_ipc_adm2['fatalities']))

0.6090965614633176
0.2465361220056956
0.44934711278346723
