# Ethiopian Food Security Classification Notebook
This notebook loads data and prepares an Ethiopian food security
classification workflow.

In [None]:
# Core data-handling libraries; `pd` and `np` are the aliases used by the cells below.
import pandas as pd
import numpy as np
# Printed only after the imports above have succeeded.
print('Notebook loaded successfully')

In [None]:
# Location of the food-price CSV. Set the WFP_FOOD_PRICES_CSV environment
# variable to point at another copy of the file; when it is unset the
# original hard-coded path is used, so existing setups keep working.
import os

DATA_PATH = os.environ.get(
    "WFP_FOOD_PRICES_CSV",
    r"C:/Users/DELL/Documents/NLP/wfp_food_prices_eth.csv",
)

# Load CSV file
df = pd.read_csv(DATA_PATH)

# View first 5 rows
print(df.head())

# List the column names and the (rows, columns) shape
print(df.columns)
print(df.shape)

In [None]:
# Remove the metadata row, i.e. the row whose 'date' cell is the literal '#date'.
# .copy() makes the filtered result an independent frame: the column
# assignments below would otherwise be made on a slice of the loaded frame,
# which pandas reports with SettingWithCopyWarning.
df = df[df['date'] != '#date'].copy()

# Convert the date column to datetime
df['date'] = pd.to_datetime(df['date'])

# Convert the price columns to numeric dtype
for price_col in ('price', 'usdprice'):
    df[price_col] = pd.to_numeric(df[price_col])

In [None]:
# Number of records per region (admin1). The f-string is delimited with double
# quotes because reusing the same quote character inside the replacement field
# is a SyntaxError on Python versions before 3.12.
print(f" regional distribution:{df['admin1'].value_counts()}")

# Number of records per commodity (displayed as the cell's output)
df['commodity'].value_counts()

In [None]:
# =============================================================
# COMPLETE DATA CLEANING AND NLP PREPARATION PIPELINE
# =============================================================

import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer


# -----------------------------
# 2. Remove duplicates
# -----------------------------
df = df.drop_duplicates()

# -----------------------------
# 3. Check missing values
# -----------------------------
missing_values = df.isna().sum()
print("Missing values per column:\n", missing_values)

# -----------------------------
# 4. Clean & normalize text columns
# -----------------------------
text_cols = ['commodity', 'category', 'market', 'admin1', 'admin2', 'unit']

for column in text_cols:
    # cast to string, then lowercase and trim surrounding whitespace
    df[column] = df[column].astype(str).str.lower().str.strip()

# -----------------------------
# 5. Remove special characters from commodity names
# -----------------------------
# Everything that is not an ASCII letter or a space is dropped.
non_letters = re.compile(r'[^a-zA-Z ]')
df['commodity_clean'] = df['commodity'].map(lambda name: non_letters.sub('', name))

# -----------------------------
# 6. Tokenize commodity names (split on whitespace)
# -----------------------------
df['tokens'] = df['commodity_clean'].str.split()

# -----------------------------
# 7. Word frequency analysis (top 10 words in commodity names)
# -----------------------------
all_words = Counter()
for name_tokens in df['tokens']:
    all_words.update(name_tokens)
print("Top 10 words in commodity names:\n", all_words.most_common(10))

# -----------------------------
# 8. Automatic standardization using keyword rules
# -----------------------------

# Lowercase, trim, then drop every character that is not a letter or a space.
df['commodity_clean'] = (
    df['commodity']
    .str.lower()
    .str.strip()
    .apply(lambda x: re.sub(r'[^a-zA-Z ]', '', x))
)

# Keywords in priority order: when a name contains several of them, the one
# listed first wins (same precedence as an if/elif chain in this order).
STANDARD_COMMODITY_KEYWORDS = [
    # cereals
    'maize', 'sorghum', 'teff', 'wheat',
    # livestock
    'goat', 'sheep',
    # other foods
    'butter', 'beans', 'potatoes', 'kocho', 'wage',
]

# Names that match no keyword keep their cleaned form.
commodity_standard = df['commodity_clean'].copy()

# Vectorised replacement for a row-by-row loop: apply the keywords from lowest
# to highest priority so a higher-priority match overwrites a lower one.
for keyword in reversed(STANDARD_COMMODITY_KEYWORDS):
    # plain substring test (regex=False), mirroring `keyword in name`
    has_keyword = df['commodity_clean'].str.contains(keyword, regex=False, na=False)
    commodity_standard = commodity_standard.mask(has_keyword, keyword)

df['commodity_standard'] = commodity_standard

# Check the result
print(df[['commodity', 'commodity_clean', 'commodity_standard']].head(10))
print("\nStandardized commodity counts:\n", df['commodity_standard'].value_counts().head(10))

# -----------------------------
# 9. Commodity counts
# -----------------------------
commodity_counts = df['commodity_standard'].value_counts()
top_commodities = commodity_counts.iloc[:10]
print("\nTop Commodities:\n", top_commodities)

# -----------------------------
# 10. Category counts
# -----------------------------
category_counts = df['category'].value_counts()
print("\nCategory Distribution:\n", category_counts)

# -----------------------------
# 11. Vectorize commodity names for NLP
# -----------------------------
# One TF-IDF row per record, built from the standardized commodity name.
vectorizer = TfidfVectorizer()
standard_names = df['commodity_standard']
X_text = vectorizer.fit_transform(standard_names)

learned_terms = vectorizer.get_feature_names_out()
print("\nTF-IDF vector shape:", X_text.shape)
print("Example feature names:", learned_terms[:10])

# -----------------------------
# 12. Summary
# -----------------------------
print("Text columns cleaned, tokenized, and vectorized its ready for NLP analysis.")

In [None]:
# ============================================================
# NEXT STAGE ANALYSIS
# ============================================================

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def plot_bar(series, title):
    """Show ``series`` as a bar chart in a figure of its own.

    Parameters
    ----------
    series : pandas.Series
        Values to plot; the index supplies the x tick labels.
    title : str
        Title placed above the chart.
    """
    plt.figure()
    series.plot(kind='bar')
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# ===============================
# 1. Commodity demand analysis
# ===============================
# "Demand" here is the number of records per standardized commodity.
commodity_demand = df['commodity_standard'].value_counts()
plot_bar(commodity_demand.head(10), "Top 10 Most Demanded Commodities")

# ===============================
# 2. Category consumption patterns
# ===============================
category_demand = df['category'].value_counts()
plot_bar(category_demand, "Demand by Food Category")

# ===============================
# 3. Average price by commodity
# ===============================
# Defined here because no earlier cell creates `price_stats`; without it the
# two charts below fail with NameError on a fresh kernel.
price_stats = df.groupby('commodity_standard')['price'].agg(['mean', 'std', 'count'])
mean_price_sorted = price_stats['mean'].sort_values()

# Most expensive commodities
plot_bar(mean_price_sorted.tail(10), "Most Expensive Commodities")

# Cheapest commodities
plot_bar(mean_price_sorted.head(10), "Most Affordable Commodities")

# =========================================================
# 4. PRICE VOLATILITY (FOOD RISK INDICATOR)
# =========================================================
# Volatility = standard deviation of price per standardized commodity.
# NOTE(review): prices are pooled across markets and units here — confirm
# that this is the intended definition.
volatility = df.groupby('commodity_standard')['price'].std().sort_values(ascending=False)

print("\n MOST VOLATILE (RISKY) FOODS:\n")
print(volatility.head(10))

plot_bar(volatility.head(10), "Price Volatility Risk")

# =========================================================
# 5. INFLATION / PRICE SPIKE DETECTION
# =========================================================
# pct_change compares each row with the previous row of its group, so the
# rows must be in date order and a group must be a single price series.
# Grouping by commodity alone on unsorted rows compares different markets and
# units with each other, which produces meaningless "changes".
# NOTE(review): if the data has further series-defining columns (e.g. a price
# type), add them to SERIES_KEYS.
SERIES_KEYS = ['commodity', 'market', 'unit']
SPIKE_THRESHOLD = 0.30  # flag increases above 30 %

# mergesort is stable, so rows sharing a date keep their original order.
# The result is aligned back to `df` by index.
df['price_change'] = (
    df.sort_values('date', kind='mergesort')
      .groupby(SERIES_KEYS)['price']
      .pct_change()
)

spikes = df[df['price_change'] > SPIKE_THRESHOLD]

print("\n PRICE SPIKES (>30% increase):")
print(spikes[['commodity_standard', 'price_change']].head())

# =========================================================
# 6. FOOD SECURITY RISK INDEX
# =========================================================

# risk score = average price x volatility
risk_df = pd.DataFrame({
    'avg_price': df.groupby('commodity_standard')['price'].mean(),
    'volatility': volatility
})

risk_df['risk_score'] = risk_df['avg_price'] * risk_df['volatility']
risk_df = risk_df.sort_values('risk_score', ascending=False)

print("\n FOOD SECURITY RISK FOODS:\n")
print(risk_df.head(10))

# ===============================
# 7. Seasonal price trends
# ===============================
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month

# Mean price per calendar month (1-12), pooled over all years.
monthly_price = df.groupby('month')['price'].mean()

plt.figure()
monthly_price.plot(marker='o')
plt.title("Seasonal Price Trend")
plt.xlabel("Month")
plt.ylabel("Average Price")
plt.tight_layout()
plt.show()

# ===============================
# 8. Market distribution analysis
# ===============================
market_counts = df['market'].value_counts().head(10)
plot_bar(market_counts, "Top Markets by Activity")

# ===============================
# 9. Average price by category
# ===============================
# Defined here because no earlier cell creates `category_price`; without it
# the chart below fails with NameError on a fresh kernel.
category_price = df.groupby('category')['price'].mean().sort_values(ascending=False)
plot_bar(category_price, 'Average Price by Category')

# ===============================
# 10. Food price volatility (risk indicator)
# ===============================
# Same per-commodity standard deviation as section 4; reused, not recomputed.
price_volatility = volatility

print("\nMost Volatile Commodities:\n")
print(price_volatility.head(10))

# ===============================
# 11. Insight summary
# ===============================
print("\nKEY INSIGHTS")
print("• High demand commodities indicate staple foods.")
print("• Category demand shows nutrition dependence.")
print("• Seasonal peaks may indicate drought or shortages.")
print("• Volatile prices signal food security risks.")
print("• Clusters reveal similar food groups and substitutes.")

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Number of clusters to fit on the TF-IDF commodity vectors.
num_clusters = 10

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it unset gives different clusterings
# depending on the installed version (and a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

# One cluster label per record, in the same row order as X_text / df.
df['commodity_cluster'] = kmeans.fit_predict(X_text)

# Silhouette score of the fitted labels on the same TF-IDF matrix.
score = silhouette_score(X_text, df['commodity_cluster'])
print("Silhouette Score:", score)

# Show the last 15 records with their assigned cluster
df[['commodity_standard', 'commodity_cluster']].tail(15)

In [None]:
# For each cluster, print the standardized names of its first 10 records.
for cluster_id in range(num_clusters):
    print(f"\nCluster {cluster_id}:")
    in_cluster = df['commodity_cluster'] == cluster_id
    sample_names = df.loc[in_cluster, 'commodity_standard'].iloc[:10].tolist()
    print(sample_names)

In [None]:
import pandas as pd

# For every cluster: join the standardized names of its records, split the
# text on whitespace and keep the five most frequent words.
cluster_words = [
    [
        word
        for word, _count in Counter(
            " ".join(
                df.loc[df['commodity_cluster'] == cluster_id, 'commodity_standard']
            ).split()
        ).most_common(5)
    ]
    for cluster_id in range(num_clusters)
]

# One row per cluster, one column per rank (most frequent word first).
row_labels = [f"Cluster {cluster_id}" for cluster_id in range(num_clusters)]
pd.DataFrame(cluster_words, index=row_labels)

In [None]:
import seaborn as sns

# Record counts per (region, standardized commodity); combinations that never
# occur are filled with 0 so the heatmap grid is complete.
market_counts = (
    df.groupby(['admin1', 'commodity_standard'])
      .size()
      .unstack(fill_value=0)
)

fig, ax = plt.subplots(figsize=(16, 6))
sns.heatmap(market_counts, cmap='YlGnBu', ax=ax)
ax.set_title('Commodity Distribution by Region')
plt.show()