In [61]:
pip install pandas openpyxl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [62]:
import pandas as pd
import re
import warnings

In [63]:

# Step 1: Read the "Procurement practices" sheet from the source workbook.
# The path is relative, so it is resolved against the kernel's current working directory.
file_path = "./IDH Sustainable Procurement Practices_Updated 5th Mar'25.xlsx"
# `xls` is intentionally left open: the "Dashboard" sheet is read from this same handle later on.
xls = pd.ExcelFile(file_path)
sheet_name = "Procurement practices"
df = pd.read_excel(xls, sheet_name=sheet_name)

# Build the column labels from the row at position 1, keeping only the first
# line of each (possibly multi-line) header cell.
header_labels = df.iloc[1].astype(str).str.split("\n").str[0].tolist()

# Rename the columns at positions 1-3 (the 2nd to 4th columns).
# This is done on a plain list before the labels are assigned: a pandas Index is
# immutable, and writing through `df.columns.values[...]` mutates its backing
# array behind its back, which can leave cached lookups stale.
header_labels[1:4] = ["Area", "Procurement Processes", "Practice"]

# Drop the two leading rows (everything up to and including the header row),
# then attach the cleaned labels.
df = df[2:].reset_index(drop=True)
df.columns = header_labels

# Exclude columns AJ to AO if they exist.
# NOTE(review): at this point the labels are header texts taken from the sheet,
# not Excel column letters, so this only fires if two headers are literally
# named 'AJ' and 'AO'. If the intent is to drop Excel columns AJ:AO, this needs a
# positional selection instead -- confirm against the workbook.
columns_to_exclude = df.loc[:, 'AJ':'AO'].columns.tolist() if 'AJ' in df.columns and 'AO' in df.columns else []
df = df.drop(columns=columns_to_exclude, errors='ignore')

# Remove columns and rows that are entirely empty
df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')

# Drop the first remaining row when it is a legend/sub-header row, i.e. when any
# of its cells contains one of the marker phrases below. The pattern is a regex;
# the "|" separates the two alternative phrases.
legend_pattern = "1 = High // 5 = Low|Implementation Cost / Effort"
if df.iloc[0].astype(str).str.contains(legend_pattern, na=False, regex=True).any():
    df = df.iloc[1:].reset_index(drop=True)

# Long-text columns to render as HTML: sheet header -> snake_case name used in
# the exported CSV. The listing order is the order the columns are processed in.
columns_to_convert = dict(
    [
        ("Intervention Definition", "intervention_definition"),
        ("Enabling Conditions", "enabling_conditions"),
        ("Business Rationale", "business_rationale"),
        ("Farmer Rationale", "farmer_rationale"),
        ("Risks & Trade Offs", "risks_n_trade_offs"),
        ("Intervention Impact Income", "intervention_impact_income"),
        ("Intervention Impact Environment", "intervention_impact_env"),
        ("Source / Evidence", "source_or_evidence"),
    ]
)

In [64]:
# First http(s) URL in a line; a URL is taken to run until the next whitespace.
_URL_PATTERN = re.compile(r"https?://\S+")
# Leading list number such as "1. " or "12. ".
_NUMBERED_PREFIX = re.compile(r"^\d+\.\s")
# A line containing any of these phrases is rendered as an <h3> heading.
_HEADING_KEYWORDS = ("Definition:", "Enabling conditions", "Business Rationale", "Farmer Rationale", "Risks:")


def text_to_html(text):
    """Convert a multi-line plain-text cell into a simple HTML fragment.

    The text is split on newlines and every stripped line is rendered by the
    first rule that matches, in this order:

    1. Numbered line ("N. ..."): an ``<a>`` link followed by ``<br/>`` when the
       line contains an http(s) URL (the link text is the whole line, the
       ``href`` is the URL itself), otherwise ``<strong>``.
    2. Line containing a heading keyword: ``<h3>`` with every ":" removed.
    3. Line containing an http(s) URL: ``<p>`` with each URL wrapped in ``<a>``.
    4. Line containing "Additional Details:": ``<b>``.
    5. Anything else (including blank lines): ``<p>``.

    Args:
        text: Cell value. Missing values (None/NaN) and blank strings yield "".
            Non-string values are converted with ``str()`` first.

    Returns:
        The rendered lines joined with "\\n". Text is inserted as-is; it is not
        HTML-escaped.
    """
    if pd.isna(text):
        return ""
    # Cells may hold numbers rather than strings; coerce so .strip()/.split() work.
    text = str(text)
    if text.strip() == "":
        return ""

    formatted_lines = []
    for raw_line in text.split("\n"):
        line = raw_line.strip()

        if _NUMBERED_PREFIX.match(line):
            # Pull the URL out by pattern rather than slicing a fixed 3-character
            # prefix, so multi-digit numbers and surrounding words do not end up
            # inside the href.
            url_match = _URL_PATTERN.search(line)
            if url_match:
                formatted_lines.append(
                    f'<a href="{url_match.group(0)}" target="_blank">{line}</a><br/>'
                )
            else:
                formatted_lines.append(f"<strong>{line}</strong>")
        elif any(keyword in line for keyword in _HEADING_KEYWORDS):
            formatted_lines.append(f"<h3>{line.replace(':', '')}</h3>")
        elif _URL_PATTERN.search(line):
            linked = _URL_PATTERN.sub(r'<a href="\g<0>" target="_blank">\g<0></a>', line)
            formatted_lines.append(f"<p>{linked}</p>")
        elif "Additional Details:" in line:
            formatted_lines.append(f"<b>{line}</b>")
        else:
            formatted_lines.append(f"<p>{line}</p>")

    return "\n".join(formatted_lines)

In [65]:
# Render as HTML every long-text column that is actually present in the sheet.
present_headers = [header for header in columns_to_convert if header in df.columns]
df_selected = df.loc[:, present_headers].copy()
for header in df_selected.columns:
    df_selected[header] = df_selected[header].apply(text_to_html)

# Switch both frames over to the snake_case column names.
df_selected = df_selected.rename(columns=columns_to_convert)
df = df.rename(columns=columns_to_convert)

# Overwrite the raw text in the main frame with its HTML rendering; rows line up
# because df_selected was copied from df and shares its index.
for snake_name in df_selected.columns:
    df[snake_name] = df_selected[snake_name]

# Step 3: Write the result to CSV (path is relative to the working directory).
output_csv_path = "../../master/procurement_practices.csv"
df.to_csv(output_csv_path, index=False, encoding="utf-8")

In [66]:
### Assessment Questions

# Silence only the UserWarnings raised from inside openpyxl (such as the
# unsupported "Data Validation extension" notice) rather than every UserWarning
# for the rest of the session. `module` is a regex matched against the start of
# the name of the module that issues the warning.
warnings.filterwarnings("ignore", category=UserWarning, module=r"openpyxl")

# Load the "Dashboard" sheet
df_dashboard = pd.read_excel(xls, sheet_name="Dashboard")

# Keep the columns at positions 1, 3 and 5, from the row at position 2 onwards,
# and give them readable names. The explicit copy keeps the relabelling from
# touching df_dashboard.
df_dashboard_cleaned = df_dashboard.iloc[2:, [1, 3, 5]].copy()
df_dashboard_cleaned.columns = ["Question", "Select answer", "Definitions"]

# Remove rows in which all three columns are empty
df_dashboard_cleaned = df_dashboard_cleaned.dropna(how='all')

# Find the first row whose question mentions "Top procurement practices"
is_top_practices_row = df_dashboard_cleaned["Question"].str.contains(
    "Top procurement practices", na=False, case=False)
top_procurement_index = df_dashboard_cleaned[is_top_practices_row].index.min()

# Keep only the rows above that marker row. `index.min()` yields NaN when nothing
# matched, and NaN is truthy, so the "not found" case needs an explicit check.
if pd.notna(top_procurement_index):
    df_filtered = df_dashboard_cleaned.loc[df_dashboard_cleaned.index < top_procurement_index]
else:
    df_filtered = df_dashboard_cleaned

# Split "Definitions" on "; " and give every entry its own row
df_expanded = df_filtered.assign(Definitions=df_filtered["Definitions"].str.split("; ")).explode("Definitions")

# Split each entry at its first ":" into the answer label and its description.
# This replaces the "Select answer" values read from the sheet. The reindex
# guarantees both columns exist even when no entry contains a ":".
definition_parts = df_expanded["Definitions"].str.split(":", n=1, expand=True).reindex(columns=[0, 1])
df_expanded["Select answer"] = definition_parts[0]
df_expanded["Description"] = definition_parts[1]


def _strip_if_str(value):
    """Strip surrounding whitespace from strings; return any other value unchanged."""
    return value.strip() if isinstance(value, str) else value


# Remove the old "Definitions" column and trim whitespace. Stripping per value
# (instead of `.str.strip()` on object-dtype columns) leaves non-string cells
# intact and does not depend on how pandas labels the string dtype.
df_expanded = df_expanded.drop(columns=["Definitions"]).apply(lambda column: column.map(_strip_if_str))

# Save the final cleaned data
df_expanded.to_csv("../../master/procurement_questions.csv", index=False)