# **Cities/Regions Encoding**

### **What to Expect?**

This notebook does the following:
 - Encode the cities
 - Encode the regions

This is done by creating two maps:
 - one for cities encoding
 - one for regions encoding

and then replacing the string values with their encodings.

## **0. Prerequisites**

In [18]:
## necessary imports here
import pandas as pd
import json
import os

In [19]:
## helper functions/consts

# Default project root; kept identical to the path the notebook has always used so
# existing calls resolve to the same files. Override via `projectRoot` on other machines.
DEFAULT_PROJECT_ROOT = "C:\\Users\\mohamedanas.neji\\OneDrive - Medius\\Desktop\\Housing_pricing\\"


def getFilePath(fileName, isSaveMapContext, projectRoot=DEFAULT_PROJECT_ROOT):
    """Build the full path of a project file by plain string concatenation.

    Parameters
    ----------
    fileName : str
        Name of the file, appended as-is to the chosen directory.
    isSaveMapContext : bool
        If truthy, the file is located in the ``utils\\`` directory (where the
        encoding maps are saved); otherwise in ``data\\raw\\``.
    projectRoot : str, optional
        Root directory of the project. It must already end with a path
        separator because no separator is inserted. Defaults to
        ``DEFAULT_PROJECT_ROOT``.

    Returns
    -------
    str
        ``projectRoot`` + sub-directory + ``fileName``.
    """
    subDirectory = "utils\\" if isSaveMapContext else "data\\raw\\"
    return f"{projectRoot}{subDirectory}{fileName}"

## Raw CSV file name -> name of the column holding the city in that file.
##  - dataset_clean.csv has both "location" and "city"; "city" is more precise,
##    so "location" will be removed in a separate notebook.
##  - tunisia-real-estate.csv has duplicated columns (Delegation and Locality),
##    so "Locality" will be removed in another notebook.
##  - tunisie_annonce_data (1).csv doesn't have a column for cities, so it is not listed.
FILES_TO_CITIES_COLUMN = dict(
    [
        ("dataset_clean.csv", "city"),
        ("Property Prices in Tunisia.csv", "region"),
        ("tayara.csv", "region"),
        ("TechnocasaDataset.csv", "Subtitle"),
        ("tunisia-real-estate.csv", "Delegation"),
    ]
)

## Raw CSV file name -> name of the column holding the region in that file.
## TechnocasaDataset.csv doesn't have a column for the region, so it is not listed.
FILES_TO_REGION_COLUMN = dict(
    [
        ("dataset_clean.csv", "governorate"),
        ("Property Prices in Tunisia.csv", "city"),
        ("tayara.csv", "city"),
        ("tunisia-real-estate.csv", "Governorate"),
        ("tunisie_annonce_data (1).csv", "Gouvernorat"),
    ]
)

## **1. Cities Encoding**

### a- Create the cities encoding map

In [20]:
# Build the city -> integer encoding from every file listed in FILES_TO_CITIES_COLUMN.
# Side effects: a source CSV is rewritten in place when rows were dropped or names were
# normalised, and the mapping is saved to city_mapping.json if that file does not exist yet.
all_cities = set()

# Spelling variant -> canonical city name, so the same city gets a single code.
value_changes_map = {"mohamadia": "mohamedia", "mannouba": "manouba", "mégrine": "megrine", "kabbaria": "kabaria", "ouerdia": "el ouardia", "el omrane superieur": "omrane superieur"}

for file, cityColumn in FILES_TO_CITIES_COLUMN.items():
    filePath = getFilePath(file, isSaveMapContext=False)
    data = pd.read_csv(filePath)
    has_changed = False

    # Drop rows with a missing city. The explicit .copy() gives an independent frame,
    # so the column assignment below does not raise SettingWithCopyWarning.
    nan_count = data[cityColumn].isna().sum()
    data_cleaned = data.dropna(subset=[cityColumn]).copy()
    if nan_count > 0:
        has_changed = True
        print(f"File '{file}' has {len(data) - len(data_cleaned)} rows removed due to missing '{cityColumn}' values.")

    # Some cities need to be renamed so that they have the same name in every file.
    for value_to_replace, replacement_value in value_changes_map.items():
        if value_to_replace in data_cleaned[cityColumn].values:
            has_changed = True
            data_cleaned[cityColumn] = data_cleaned[cityColumn].replace(value_to_replace, replacement_value)
            print(f"Replaced '{value_to_replace}' with '{replacement_value}' in '{file}'.")

    if has_changed:
        # Update the CSV with the cleaned data (overwrites the source file).
        data_cleaned.to_csv(filePath, index=False)

    # Add this file's unique cities to the global set.
    all_cities.update(data_cleaned[cityColumn].unique())

print(all_cities)

# Create a consistent mapping: sorting makes the codes independent of set ordering.
all_cities = sorted(all_cities)
city_to_num = {city: idx for idx, city in enumerate(all_cities)}

print(city_to_num)

## Save the mapping in a json file
# NOTE: an existing file is never overwritten, so the JSON on disk can differ from the
# in-memory `city_to_num`; delete the file manually to regenerate it.
JsonFileName = "city_mapping.json"
JsonFilePath = getFilePath(JsonFileName, isSaveMapContext=True)
if not os.path.exists(JsonFilePath):
    with open(JsonFilePath, "w") as json_file:
        json.dump(city_to_num, json_file, indent=4)
    print(f"JSON file '{JsonFilePath}' created.")
else:
    print(f"JSON file '{JsonFilePath}' already exists.")

Replaced 'mohamadia' with 'mohamedia' in 'dataset_clean.csv'.
Replaced 'mégrine' with 'megrine' in 'dataset_clean.csv'.
Replaced 'el omrane superieur' with 'omrane superieur' in 'dataset_clean.csv'.
Replaced 'mohamadia' with 'mohamedia' in 'tunisia-real-estate.csv'.
Replaced 'mannouba' with 'manouba' in 'tunisia-real-estate.csv'.
Replaced 'kabbaria' with 'kabaria' in 'tunisia-real-estate.csv'.
Replaced 'ouerdia' with 'el ouardia' in 'tunisia-real-estate.csv'.
{'bab bhar', 'manouba ville', 'ben arous', 'cite el khadra', 'denden', 'lac 2', 'oued ellil', 'sidi thabet', 'ksar said', 'manouba', 'jardins de carthage', 'hammam lif', 'medina jedida', 'kalaat landalous', 'rades', 'chotrana 3', 'la goulette', 'ennasr', 'borj louzir', 'ariana', 'menzah 9', 'menzah 7', 'tunis', 'sidi daoud', 'soukra', 'borj cedria', 'carthage', 'omrane superieur', 'megrine', 'manar 1', 'chotrana 2', 'raoued', 'menzah', 'mutuelleville', 'cite olympique', 'sidi bou said', 'ouardia', 'fouchana', 'menzah 6', 'ezzahra'

### b- Encode the cities

In [21]:
# Replace the city names of every file with their integer codes from `city_to_num`
# and overwrite the CSV in place.
for file, cityColumn in FILES_TO_CITIES_COLUMN.items():
    filePath = getFilePath(file, isSaveMapContext=False)

    data = pd.read_csv(filePath)

    # Series.map yields NaN for every value that is not a key of the mapping.
    encoded_cities = data[cityColumn].map(city_to_num)

    # Guard: if a non-missing value has no code (e.g. the file was already encoded by a
    # previous run), writing would wipe the column, so leave the file untouched.
    unmapped_mask = encoded_cities.isna() & data[cityColumn].notna()
    if unmapped_mask.any():
        unmapped_values = data.loc[unmapped_mask, cityColumn].unique()
        print(f"Skipping '{file}': {len(unmapped_values)} distinct value(s) in '{cityColumn}' are not in the city mapping (file already encoded?).")
        continue

    # Replace city names with numbers
    data[cityColumn] = encoded_cities

    # Save the processed file
    data.to_csv(filePath, index=False)


## **2. Regions Encoding**

### a- Create the regions encoding map

In [22]:
# Build the region -> integer encoding from every file listed in FILES_TO_REGION_COLUMN.
# Side effects: a source CSV is rewritten in place when rows with a missing region were
# dropped, and the mapping is saved to region_mapping.json if that file does not exist yet.
all_regions = set()
for file, regionColumn in FILES_TO_REGION_COLUMN.items():
    filePath = getFilePath(file, isSaveMapContext=False)

    data = pd.read_csv(filePath)

    nan_count = data[regionColumn].isna().sum()
    data_cleaned = data
    has_changed = False
    if nan_count > 0:
        has_changed = True
        # Remove rows with NaN in the region column
        data_cleaned = data.dropna(subset=[regionColumn])
        print(f"File '{file}' has {len(data) - len(data_cleaned)} rows removed due to missing '{regionColumn}' values.")

    if has_changed:
        # Update the CSV with the cleaned data (overwrites the source file).
        data_cleaned.to_csv(filePath, index=False)

    # Add this file's unique regions to the global set.
    all_regions.update(data_cleaned[regionColumn].unique())

print(all_regions)

# Create a consistent mapping: sorting makes the codes independent of set ordering.
all_regions = sorted(all_regions)
region_to_num = {region: idx for idx, region in enumerate(all_regions)}

print(region_to_num)

## Save the mapping in a json file
# NOTE: an existing file is never overwritten, so a stale or incorrect
# region_mapping.json must be deleted manually before re-running this cell.
JsonFileName = "region_mapping.json"
JsonFilePath = getFilePath(JsonFileName, isSaveMapContext=True)
if not os.path.exists(JsonFilePath):
    with open(JsonFilePath, "w") as json_file:
        # Dump the region mapping (not the city mapping) into region_mapping.json.
        json.dump(region_to_num, json_file, indent=4)
    print(f"JSON file '{JsonFilePath}' created.")
else:
    print(f"JSON file '{JsonFilePath}' already exists.")

File 'tayara.csv' has 1 rows removed due to missing 'city' values.
{'ariana', 'ben arous', 'manouba', 'tunis'}
{'ariana': 0, 'ben arous': 1, 'manouba': 2, 'tunis': 3}
JSON file 'C:\Users\mohamedanas.neji\OneDrive - Medius\Desktop\Housing_pricing\utils\region_mapping.json' created.


### b- Encode the regions

In [23]:
# Replace the region names of every file with their integer codes from `region_to_num`
# and overwrite the CSV in place.
for file, regionColumn in FILES_TO_REGION_COLUMN.items():
    filePath = getFilePath(file, isSaveMapContext=False)

    data = pd.read_csv(filePath)

    # Series.map yields NaN for every value that is not a key of the mapping.
    encoded_regions = data[regionColumn].map(region_to_num)

    # Guard: if a non-missing value has no code (e.g. the file was already encoded by a
    # previous run), writing would wipe the column, so leave the file untouched.
    unmapped_mask = encoded_regions.isna() & data[regionColumn].notna()
    if unmapped_mask.any():
        unmapped_values = data.loc[unmapped_mask, regionColumn].unique()
        print(f"Skipping '{file}': {len(unmapped_values)} distinct value(s) in '{regionColumn}' are not in the region mapping (file already encoded?).")
        continue

    # Replace region names with numbers
    data[regionColumn] = encoded_regions

    # Save the processed file
    data.to_csv(filePath, index=False)