# **Cities/Regions Encoding**

### **What to Expect ?**

This notebook aims to do the following:
 - Encode the cities
 - Encode the regions

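This is done by building two maps:
 - one for the city encoding (cities are ranked by their average price)
 - one for the region encoding (regions are numbered in alphabetical order)

and then replacing the string values in the data files with their encodings.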

## **0. Prerequisites**

In [1]:
## necessary imports here
import pandas as pd
import json
import os
import numpy as np

In [2]:
## helper functions/consts

def getFilePath(fileName, isSaveMapContext):
    """Return the absolute path of `fileName` inside the project.

    Args:
        fileName: Name of the file (with extension) to locate.
        isSaveMapContext: When truthy the file is placed in the `utils`
            folder (where the encoding maps are saved); otherwise it is
            placed in the `data\\raw` folder (where the datasets are read).

    Returns:
        The folder prefix immediately followed by `fileName`.
    """
    if isSaveMapContext:
        baseFolder = "C:\\Users\\mohamedanas.neji\\OneDrive - Medius\\Desktop\\Housing_pricing\\utils\\"
    else:
        baseFolder = "C:\\Users\\mohamedanas.neji\\OneDrive - Medius\\Desktop\\Housing_pricing\\data\\raw\\"
    return "{}{}".format(baseFolder, fileName)

## Maps each raw data file to the name of the column that holds its city.
## dataset_clean.csv has both location and city; city is more precise, so location will be removed in a separate notebook.
## tunisia-real-estate.csv has duplicated columns (Delegation and Locality), so Locality will be removed in another notebook.
## "tunisie_annonce_data (1).csv" doesn't have a column for cities, so it is left out of this map.
## NOTE: for "Property Prices in Tunisia_Cleaned.csv" and "tayara.csv" the column named "region" is used as the city here,
## while their column named "city" is used as the region in FILES_TO_REGION_COLUMN below.
FILES_TO_CITIES_COLUMN = {
    "dataset_clean.csv": "city", 
    "Property Prices in Tunisia_Cleaned.csv": "region",
    "tayara.csv": "region",
    "TechnocasaDataset.csv": "Subtitle",
    "tunisia-real-estate.csv": "Delegation"
}

## Maps each raw data file to the name of the column that holds its region.
## TechnocasaDataset.csv doesn't have a column for regions, so it is left out of this map.
FILES_TO_REGION_COLUMN = {
    "dataset_clean.csv": "governorate", 
    "Property Prices in Tunisia_Cleaned.csv": "city",
    "tayara.csv": "city",
    "tunisia-real-estate.csv": "Governorate",
    "tunisie_annonce_data (1).csv": "Gouvernorat"
}

## Maps each raw data file to the name of the column that holds its price.
FILES_TO_PRICE_COLUMN = {
    "dataset_clean.csv": "price_tnd", 
    "Property Prices in Tunisia_Cleaned.csv": "price",
    "tayara.csv": "price",
    "tunisia-real-estate.csv": "Price",
    "tunisie_annonce_data (1).csv": "Price",
    "TechnocasaDataset.csv": "Price"
}

## **1. Cities Encoding**

### a- Create the cities encoding map

In [3]:
# Build the city encoding: clean the city column of every raw file, gather the
# prices observed for each city, then rank the cities by their average price
# (rank 1 = most expensive).
city_prices_map = {}
# Spelling variants mapped to the single canonical name used across the files.
value_changes_map = {"mohamadia": "mohamedia", "mannouba": "manouba", "mégrine": "megrine", "kabbaria": "kabaria", "ouardia": "el ouardia","ouerdia": "el ouardia", "el omrane superieur": "omrane superieur"}
for file, cityColumn in FILES_TO_CITIES_COLUMN.items():
    filePath = getFilePath(file, isSaveMapContext=False)
    priceColumn = FILES_TO_PRICE_COLUMN[file]
    data = pd.read_csv(filePath)

    # Remove rows with NaN in the city column. The explicit copy makes the
    # column assignment below act on an independent frame instead of on `data`
    # itself or on a derived frame (which can trigger SettingWithCopyWarning).
    data_cleaned = data.dropna(subset=[cityColumn]).copy()
    removed_rows = len(data) - len(data_cleaned)
    has_changed = removed_rows > 0
    if has_changed:
        print(f"File '{file}' has {removed_rows} rows removed due to missing '{cityColumn}' values.")

    # Some cities need to be renamed so that every file uses the same name.
    # Only the variants actually present in this file are applied and reported.
    present_values = set(data_cleaned[cityColumn].unique())
    applied_changes = {
        value_to_replace: replacement_value
        for value_to_replace, replacement_value in value_changes_map.items()
        if value_to_replace in present_values
    }
    if applied_changes:
        has_changed = True
        data_cleaned[cityColumn] = data_cleaned[cityColumn].replace(applied_changes)
        for value_to_replace, replacement_value in applied_changes.items():
            print(f"Replaced '{value_to_replace}' with '{replacement_value}' in '{file}'.")

    if has_changed:
        # Update the CSV with cleaned data
        data_cleaned.to_csv(filePath, index=False)

    # Add the prices of each city to the dictionary. `sort=False` keeps cities
    # in order of first appearance. Missing prices are dropped so that a single
    # NaN cannot turn a city's whole average into NaN.
    for city, prices in data_cleaned.groupby(cityColumn, sort=False)[priceColumn]:
        city_prices_map.setdefault(city, []).extend(prices.dropna().tolist())

# get the average of prices for each city (NaN when a city has no usable price)
city_to_avg_prices = {
    city: float(np.average(prices)) if prices else float("nan")
    for city, prices in city_prices_map.items()
}
print(city_to_avg_prices)

# Create a consistent mapping: priced cities sorted by valuation descending,
# followed by the cities without any usable price so that they still get a code.
priced_cities = [(city, avg) for city, avg in city_to_avg_prices.items() if not np.isnan(avg)]
unpriced_cities = [(city, avg) for city, avg in city_to_avg_prices.items() if np.isnan(avg)]
sorted_cities = sorted(priced_cities, key=lambda x: x[1], reverse=True) + unpriced_cities
city_to_num = {city: idx for idx, (city, _) in enumerate(sorted_cities, start=1)}  # Start encoding from 1

print(city_to_num)

## Save the mapping in a json file
## NOTE: an existing file is left untouched, so it may not match the mapping computed in this run.
JsonFileName = "city_mapping.json"
JsonFilePath = getFilePath(JsonFileName, isSaveMapContext=True)
if not os.path.exists(JsonFilePath):
    with open(JsonFilePath, "w") as json_file:
        json.dump(city_to_num, json_file, indent=4)
    print(f"JSON file '{JsonFilePath}' created.")
else:
    print(f"JSON file '{JsonFilePath}' already exists.")

Replaced 'mohamadia' with 'mohamedia' in 'dataset_clean.csv'.
Replaced 'mégrine' with 'megrine' in 'dataset_clean.csv'.
Replaced 'el omrane superieur' with 'omrane superieur' in 'dataset_clean.csv'.
Replaced 'ouardia' with 'el ouardia' in 'Property Prices in Tunisia_Cleaned.csv'.
Replaced 'mannouba' with 'manouba' in 'tunisia-real-estate.csv'.
Replaced 'kabbaria' with 'kabaria' in 'tunisia-real-estate.csv'.
Replaced 'ouerdia' with 'el ouardia' in 'tunisia-real-estate.csv'.
{'menzah': 460251.18483412324, 'marsa': 517180.875739645, 'bardo': 349817.0731707317, 'mornaguia': 245000.0, 'soukra': 453755.64416058396, 'kram': 601161.125, 'mnihla': 319704.54545454547, 'ariana ville': 347088.90995260666, 'omrane': 349384.6153846154, 'tunis': 334365.04761904763, 'ezzahra': 292863.1263157895, 'manouba': 217466.43835616438, 'megrine': 395213.9534883721, 'mornag': 208782.60869565216, 'el ouardia': 285014.81481481483, 'omrane superieur': 268500.0, 'boumhel el bassatine': 328196.77419354836, 'ettadhame

### b- Encode the cities

In [4]:
# Replace the city names of every file with their numeric code from `city_to_num`.
for file, cityColumn in FILES_TO_CITIES_COLUMN.items():
    filePath = getFilePath(file, isSaveMapContext=False)

    data = pd.read_csv(filePath)

    # Replace city names with numbers
    encoded_cities = data[cityColumn].map(city_to_num)

    # `map` yields NaN for every value missing from the mapping. Because the
    # file is overwritten in place, saving such a result would destroy the
    # column (this is what happens when the cell is re-run on a file that has
    # already been encoded), so the file is left untouched instead.
    unmapped_count = int((encoded_cities.isna() & data[cityColumn].notna()).sum())
    if unmapped_count > 0:
        print(f"Skipped '{file}': {unmapped_count} value(s) in '{cityColumn}' are not in the city mapping (already encoded?).")
        continue

    data[cityColumn] = encoded_cities

    # Save the processed file
    data.to_csv(filePath, index=False)


## **2. Regions Encoding**

### a- Create the regions encoding map

In [22]:
# Build the region encoding: collect the distinct region names of every raw
# file and number them in alphabetical order, starting from 0.
all_regions = set()
for file, regionColumn in FILES_TO_REGION_COLUMN.items():
    filePath = getFilePath(file, isSaveMapContext=False)
    data = pd.read_csv(filePath)

    # Remove rows with NaN in the region column
    data_cleaned = data.dropna(subset=[regionColumn])
    removed_rows = len(data) - len(data_cleaned)
    if removed_rows > 0:
        print(f"File '{file}' has {removed_rows} rows removed due to missing '{regionColumn}' values.")
        # Update the CSV with cleaned data
        data_cleaned.to_csv(filePath, index=False)

    # Add unique regions to the set
    all_regions.update(data_cleaned[regionColumn].unique())

print(all_regions)

# Create a consistent mapping
all_regions = sorted(all_regions)  # Sort for consistency
region_to_num = {region: idx for idx, region in enumerate(all_regions)}

print(region_to_num)

## Save the mapping in a json file
## NOTE: an existing file is left untouched, so a stale or incorrect
## region_mapping.json must be deleted by hand before re-running this cell.
JsonFileName = "region_mapping.json"
JsonFilePath = getFilePath(JsonFileName, isSaveMapContext=True)
if not os.path.exists(JsonFilePath):
    with open(JsonFilePath, "w") as json_file:
        # Write the region mapping (not the city mapping) into region_mapping.json.
        json.dump(region_to_num, json_file, indent=4)
    print(f"JSON file '{JsonFilePath}' created.")
else:
    print(f"JSON file '{JsonFilePath}' already exists.")

File 'tayara.csv' has 1 rows removed due to missing 'city' values.
{'ariana', 'ben arous', 'manouba', 'tunis'}
{'ariana': 0, 'ben arous': 1, 'manouba': 2, 'tunis': 3}
JSON file 'C:\Users\mohamedanas.neji\OneDrive - Medius\Desktop\Housing_pricing\utils\region_mapping.json' created.


### b- Encode the regions

In [23]:
# Replace the region names of every file with their numeric code from `region_to_num`.
for file, regionColumn in FILES_TO_REGION_COLUMN.items():
    filePath = getFilePath(file, isSaveMapContext=False)

    data = pd.read_csv(filePath)

    # Replace region names with numbers
    encoded_regions = data[regionColumn].map(region_to_num)

    # `map` yields NaN for every value missing from the mapping. Because the
    # file is overwritten in place, saving such a result would destroy the
    # column (this is what happens when the cell is re-run on a file that has
    # already been encoded), so the file is left untouched instead.
    unmapped_count = int((encoded_regions.isna() & data[regionColumn].notna()).sum())
    if unmapped_count > 0:
        print(f"Skipped '{file}': {unmapped_count} value(s) in '{regionColumn}' are not in the region mapping (already encoded?).")
        continue

    data[regionColumn] = encoded_regions

    # Save the processed file
    data.to_csv(filePath, index=False)