# Assigning Classes to FCS and HDDS for Classification Task
In this Python notebook, we will assign classes for FCS and HDDS, we set three classes Poor, Boordeline, and Acceptance. The first thing is to check the distribution of the samples of the household survey data and assign new thresholds based on the Percentile Approach since we can not use the initial threshold set by WFP. 

## Import Libraries

In this section, we import essential libraries and modules required for data preprocessing, analysis, and visualization tasks. These libraries provide robust functionalities and tools that streamline the data analysis workflow and enable us to manipulate and explore the dataset efficiently.

In [1]:
import pandas as pd
from pandas import read_csv
from pandas import read_excel
import numpy as np
import seaborn as sns
import folium
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import plotly.express as px

In [None]:
def concat_and_save_to_csv(df_list, file_name='concatenated_data.csv'):
    """
    Concatenates a list of DataFrames into a single DataFrame and saves it to a CSV file.

    Parameters:
        df_list (list): List of DataFrames to concatenate.
        file_name (str): Name of the CSV file to save the concatenated DataFrame. Default is 'concatenated_data.csv'.
    """
    # Concatenate the DataFrames
    concatenated_df = pd.concat(df_list, ignore_index=True)

    # Save the concatenated DataFrame to a CSV file
    concatenated_df.to_csv(file_name, index=False)

    print(f"Concatenated DataFrame saved to {file_name}")

In [7]:
def checking_classification_thresholds(df, column='fcs'):
    """
    Prints classification thresholds based on percentiles of a column in a DataFrame.

    Parameters:
        df (DataFrame): The DataFrame for which to print classification thresholds.
        column (str): The name of the column used for computing thresholds. Default is 'fcs'.
    """
    # Calculate the percentiles with one decimal place
    lower_bound = np.round(np.percentile(df[column], 33.33), 1)
    upper_bound = np.round(np.percentile(df[column], 66.67), 1)
    
    # Print classification thresholds
    print(f"FCS Thresholds:\n"
          f"  - Poor: < {lower_bound}\n"
          f"  - Borderline: {lower_bound} - {upper_bound}\n"
          f"  - Acceptance: >= {upper_bound}")

In [2]:
def assign_class(df, column_name, thresholds):
    """
    Assigns a class to each row in the DataFrame based on the specified column and custom thresholds.

    Parameters:
        df (DataFrame): The DataFrame to which the class will be assigned.
        column_name (str): The name of the column used for classification.
        thresholds (dict): A dictionary containing custom lower and upper bounds for classification.
                           Example: {'Poor': (lower_bound_poor, upper_bound_poor),
                                     'Borderline': (lower_bound_borderline, upper_bound_borderline)}

    Returns:
        DataFrame: The original DataFrame with additional columns 'fcg' and 'class' assigned.
    """
    # Sort the thresholds by the lower bound
    sorted_thresholds = sorted(thresholds.items(), key=lambda x: x[1][0])
    
    # Define classification function based on custom thresholds
    def classify(value):
        for label, (lower, upper) in sorted_thresholds:
            if value < lower:
                return label, 1  
            elif lower <= value < upper:
                return label, sorted_thresholds.index((label, (lower, upper))) + 1
        return 'Acceptance', len(sorted_thresholds) + 1  
    
    # Apply classification function to the specified column
    df['fcg'], df['class'] = zip(*df[column_name].apply(classify))
    
    return df

In [3]:
def assign_class_hdds_and_save(input_file, column_name, thresholds):
    """
    Assigns a class_hdds to each row in the DataFrame based on the specified column and custom thresholds and saves the DataFrame as an Excel file.

    Parameters:
        input_file (str): The path to the input Excel file.
        column_name (str): The name of the column used for classification.
        thresholds (dict): A dictionary containing custom lower and upper bounds for classification.
                           Example: {'Poor': (lower_bound_poor, upper_bound_poor),
                                     'Borderline': (lower_bound_borderline, upper_bound_borderline)}

    Returns:
        DataFrame: The original DataFrame with an additional column 'class_hdds' assigned.
    """
    # Load the Excel file or csv file
    df = pd.read_excel(input_file)
    df = pd.read_csv(input_file)

    # Sort the thresholds by the lower bound
    sorted_thresholds = sorted(thresholds.items(), key=lambda x: x[1][0])
    
    # Define classification function based on custom thresholds
    def classify(value):
        for idx, (label, (lower, upper)) in enumerate(sorted_thresholds):
            if lower <= value < upper:
                return idx + 1
        return len(sorted_thresholds) + 1  # For values above all defined thresholds
    
    # Apply classification function to the specified column
    df['class_hdds'] = df[column_name].apply(classify)
    
    # Generate the output file name
    base, ext = os.path.splitext(input_file)
    output_file = f"{base}_classified{ext}"
    
    # Save the DataFrame as an Excel file
    df.to_excel(output_file, index=False)
    
    return df


# Tanzania Data Proprocessing

#### Merge the All Dataset Files into A Single File

In [11]:
#define data directory
tz_dir = 'Tanzania/data/'

#load the data files
df1 = read_csv(tz_dir + 'tz_2010_2011_final.csv', delimiter = ',')
df2 = read_csv(tz_dir + 'tz_2012_2013_final.csv', delimiter = ',')
df3 = read_csv(tz_dir + 'tz_2014_2015_final.csv', delimiter = ',')
df4 = read_csv(tz_dir + 'tz_2019_2020_final.csv', delimiter = ',')
df5 = read_csv(tz_dir + 'tz_2020_2022_final.csv', delimiter = ',')
df6 = read_csv(tz_dir + 'tz_2023_final.csv', delimiter = ',')

#create a list of dataframes
df_list = [df1, df2, df3, df4, df5, df6]

#call a function to merge the files
#concat_and_save_to_csv(df_list, file_name='Tanzania/data/all_tz_data.csv')

#### Checking the Lower and Upper bound to classify the FCS

In [21]:
#load the merged file
data = read_csv(tz_dir + 'all_tz_data.csv', delimiter = ',')
data.tail()

Unnamed: 0,year,region,district,fcs,hdds,count
878,2023,Tanga,Korogwe Town,20.5,4.0,1
879,2023,Tanga,Lushoto,62.43,8.43,7
880,2023,Tanga,Muheza,74.33,8.33,6
881,2023,Tanga,Pangani,67.25,8.25,4
882,2023,Tanga,Tanga,79.33,8.5,6


In [22]:
checking_classification_thresholds(data,'fcs')

FCS Thresholds:
  - Poor: < 50.6
  - Borderline: 50.6 - 59.3
  - Acceptance: >= 59.3


#### Assign Classes based on threshold

The threshold are like FCS Thresholds:
  - Poor: < 50.6
  - Borderline: 50.6 - 59.3
  - Acceptance: >= 59.3
  
Let then modify a litle bit then we make like:
  - Poor: < 50.5
  - Borderline: 50.5 - 59.5
  - Acceptance: >= 59.5

In [23]:
thresholds = {'Poor': (0, 50.5), 'Borderline': (50.5, 59.5)}
data_final = assign_class(data, 'fcs', thresholds)

In [24]:
data_final.head()

Unnamed: 0,year,region,district,fcs,hdds,count,fcg,class
0,2010/2011,Arusha,Arusha,65.29,8.14,7,Acceptance,3
1,2010/2011,Arusha,Arusha Urban,68.51,8.38,40,Acceptance,3
2,2010/2011,Arusha,Karatu,52.47,7.4,15,Borderline,2
3,2010/2011,Arusha,Meru,69.47,8.15,20,Acceptance,3
4,2010/2011,Arusha,Monduli,52.05,6.16,19,Borderline,2


In [25]:
data_final['fcg'].value_counts()

fcg
Borderline    304
Poor          292
Acceptance    287
Name: count, dtype: int64

In [26]:
data_final['class'].value_counts()

class
2    304
1    292
3    287
Name: count, dtype: int64

In [27]:
#save the final data
data_final.to_csv(tz_dir+ 'final_data/tz_data_final.csv', index=False)

### Assign HDDS class

In [28]:
checking_classification_thresholds(data,'hdds') #checking classification thresholod for hdds

FCS Thresholds:
  - Poor: < 6.8
  - Borderline: 6.8 - 7.5
  - Acceptance: >= 7.5


In [30]:
input_file = 'Tanzania/rep_epa_2011-2023.xlsx'
thresholds = {
    'Poor': (0, 6.8),
    'Borderline': (6.8, 7.5),
    'Acceptance': (7.5, 12)
}
df_classified = assign_class_hdds_and_save(input_file, 'hdds', thresholds)

In [31]:
df_classified.head()

Unnamed: 0,year,region,district,fcs,hdds,count,fcg,class,DISTRICT_ID,class_hdds
0,2011,Arusha,Arusha,65.29,8.14,7,Acceptance,3,1,3
1,2011,Arusha,Arusha Urban,68.51,8.38,40,Acceptance,3,2,3
2,2011,Arusha,Karatu,52.47,7.4,15,Borderline,2,3,2
3,2011,Arusha,Meru,69.47,8.15,20,Acceptance,3,4,3
4,2011,Arusha,Monduli,52.05,6.16,19,Borderline,2,5,1


## ## Final Processing of Rwanda Data 

#### Merge the All Dataset Files into A Single File

In [14]:
#define data directory
rw_dir = 'Rwanda/data/'

#load data
df1 = read_csv(rw_dir + 'rw_2006_final.csv', delimiter = ',')
df2 = read_csv(rw_dir + 'rw_2012_final.csv', delimiter = ',')
df3 = read_csv(rw_dir + 'rw_2015_final.csv', delimiter = ',')
df4 = read_csv(rw_dir + 'rw_2018_final.csv', delimiter = ',')
df5 = read_csv(rw_dir + 'rw_2021_final.csv', delimiter = ',')

#dataframe list
df_list = [df1, df2, df3, df4, df5]

#merge the dataset in a single file
#concat_and_save_to_csv(df_list, file_name='Rwanda/data/all_rw_data.csv')

#### Checking the Lower and Upper bound to classify the FCS

In [32]:
data = read_csv(rw_dir + 'all_rw_data.csv', delimiter = ',')

In [33]:
data.head()

Unnamed: 0,year,province,district,fcs,hdds,count
0,2006,Amajyaruguru,Burera,34.11,3.71,59
1,2006,Amajyaruguru,Gakenke,42.05,4.15,61
2,2006,Amajyaruguru,Gicumbi,39.73,3.7,66
3,2006,Amajyaruguru,Musanze,37.83,4.05,108
4,2006,Amajyaruguru,Rulindo,44.52,4.54,46


In [34]:
checking_classification_thresholds(data,'fcs')

FCS Thresholds:
  - Poor: < 42.1
  - Borderline: 42.1 - 47.1
  - Acceptance: >= 47.1


#### Assign Classes based on threshold

The threshold are like FCS Thresholds:
   - Poor: < 42.1
  - Borderline: 42.1 - 47.1
  - Acceptance: >= 47.1
  
Let then modify a litle bit then we make like:
  - Poor: < 42
  - Borderline: 42 - 47
  - Acceptance: >= 47

In [66]:
thresholds = {'Poor': (0, 42), 'Borderline': (42, 47)}
data_final = assign_class(data, 'fcs', thresholds)

In [67]:
data_final.head()

Unnamed: 0,year,province,district,fcs,hdds,count,fcg,class
0,2006,Amajyaruguru,Burera,34.11,3.71,59,Poor,1
1,2006,Amajyaruguru,Gakenke,42.05,4.15,61,Borderline,2
2,2006,Amajyaruguru,Gicumbi,39.73,3.7,66,Poor,1
3,2006,Amajyaruguru,Musanze,37.83,4.05,108,Poor,1
4,2006,Amajyaruguru,Rulindo,44.52,4.54,46,Borderline,2


#### Dataset of 2010 - 2011

In [68]:
data_final['fcg'].value_counts()

fcg
Acceptance    54
Borderline    49
Poor          44
Name: count, dtype: int64

In [69]:
data_final['class'].value_counts()

class
3    54
2    49
1    44
Name: count, dtype: int64

In [70]:
#save the final data
data_final.to_csv(rw_dir+ 'final_data/rw_data_final.csv', index=False)

#### Assign the HDDS class

In [35]:
checking_classification_thresholds(data,'hdds') #checking classification thresholod for hdds

FCS Thresholds:
  - Poor: < 4.8
  - Borderline: 4.8 - 5.5
  - Acceptance: >= 5.5


In [36]:
input_file = 'Rwanda/rep_epa_2006-2021.xlsx'
thresholds = {
    'Poor': (0, 4.8),
    'Borderline': (4.8, 5.5),
    'Acceptance': (5.5, 12)
}
df_classified = assign_class_hdds_and_save(input_file, 'hdds', thresholds)

In [37]:
df_classified.head()

Unnamed: 0,year,province,district,fcs,hdds,count,fcg,class,DISTRICT_ID,class_hdds
0,2006,Amajyaruguru,Burera,34.11,3.71,59,Poor,1,1,1
1,2006,Amajyaruguru,Gakenke,42.05,4.15,61,Borderline,2,2,1
2,2006,Amajyaruguru,Gicumbi,39.73,3.7,66,Poor,1,3,1
3,2006,Amajyaruguru,Musanze,37.83,4.05,108,Poor,1,4,1
4,2006,Amajyaruguru,Rulindo,44.52,4.54,46,Borderline,2,5,1


#### Assign FCS and HDDS for burkina Faso 

In [4]:
input_file = 'rep_epa_2009-2018.xlsx'

In [5]:
df = pd.read_excel(input_file)
df.head()

Unnamed: 0,REGION,PROVINCE,COMMUNE,ID_COM,ANNEE,fcs,hdds,sca_inf,fcg,class_fcs,class_hdds,count
0,BOUCLE DU MOUHOUN,BALE,BAGASSI,1,2009,56.28,5.92,0.0,Borderline,2,3,25
1,BOUCLE DU MOUHOUN,BALE,BAGASSI,1,2010,65.733333,6.866667,0.0,Acceptance,3,3,30
2,BOUCLE DU MOUHOUN,BALE,BAGASSI,1,2011,58.552632,6.526316,0.0,Borderline,2,3,19
3,BOUCLE DU MOUHOUN,BALE,BAGASSI,1,2012,56.42,6.08,0.0,Borderline,2,3,25
4,BOUCLE DU MOUHOUN,BALE,BAGASSI,1,2013,66.62,6.4,0.0,Acceptance,3,3,25


In [9]:
checking_classification_thresholds(df,'hdds') #checking classification thresholod for hdds

FCS Thresholds:
  - Poor: < 4.9
  - Borderline: 4.9 - 5.9
  - Acceptance: >= 5.9


In [52]:
thresholds = {
    'Poor': (0, 4.9),
    'Borderline': (4.9, 5.9),
    'Acceptance': (5.9, 12)
}
df_classified = assign_class_hdds_and_save(input_file, 'sda', thresholds)

In [53]:
df_classified.head()

Unnamed: 0,REGION,PROVINCE,COMMUNE,ID_COM,ANNEE,sca,sda,sca_inf,count,fcg,class,class_hdds
0,BOUCLE DU MOUHOUN,BALE,BAGASSI,1,2009,56.28,5.92,0.0,25,Borderline,2,3
1,BOUCLE DU MOUHOUN,BALE,BAGASSI,1,2010,65.733333,6.866667,0.0,30,Acceptance,3,3
2,BOUCLE DU MOUHOUN,BALE,BAGASSI,1,2011,58.552632,6.526316,0.0,19,Borderline,2,3
3,BOUCLE DU MOUHOUN,BALE,BAGASSI,1,2012,56.42,6.08,0.0,25,Borderline,2,3
4,BOUCLE DU MOUHOUN,BALE,BAGASSI,1,2013,66.62,6.4,0.0,25,Acceptance,3,3


In [48]:
data_final.to_excel(input_file, index=False)