## This notebook includes the following:
 - renaming existing columns
 - deleting columns that don't exist in the provided template
 - adding new columns from the template that don't exist in the file (Property Prices in Tunisia_Cleaned.csv)
 - reordering the columns to match the template

This is one step toward making all datasets follow the same template, in order to make merging the datasets easier.

In [29]:
import pandas as pd

In [28]:
## Import the dataset-path helper from the project's file_utils module
import sys

# Append the parent directory (resolved relative to the current working
# directory) to the module search path before importing from `utils` below.
# NOTE(review): presumably the `utils` package lives one level above the
# folder this notebook is run from — confirm.
sys.path.append('../')

from utils.file_utils.file_utils import getUnprocessedDataSetFilePath

In [36]:
# Load the CSV file
import os

# Root of the repository, handed to the path helper to locate the dataset.
# The default is the original machine-specific location; set the
# HOUSING_PRICING_REPO environment variable to run this notebook from another
# checkout without editing this cell.
REPO_PATH = os.environ.get(
    "HOUSING_PRICING_REPO",
    "C:\\Users\\mohamedanas.neji\\OneDrive - Medius\\Desktop\\Housing_pricing",
)
FILE_PATH = getUnprocessedDataSetFilePath("Property Prices in Tunisia_Cleaned.csv", REPO_PATH)

df = pd.read_csv(FILE_PATH)

# Display the first 5 rows of the dataframe
df.head()

Unnamed: 0,category,room_count,bathroom_count,size,type,price,region
0,Appartements,2.0,2.0,113.0,À Vendre,170000.0,28
1,Appartements,3.0,2.0,138.0,À Vendre,340000.0,52
2,Appartements,2.0,1.0,80.0,À Vendre,230000.0,31
3,Appartements,4.0,2.0,200.0,À Vendre,440000.0,31
4,Appartements,2.0,1.0,89.0,À Vendre,136000.0,83


In [37]:
# Map the source file's column names onto the names used by the template
column_renames = dict(
    size='surface',
    region='city',
    room_count='rooms',
    bathroom_count='bathrooms',
)
df = df.rename(columns=column_renames)

# Preview the first rows with the new column names
df.head()

Unnamed: 0,category,rooms,bathrooms,surface,type,price,city
0,Appartements,2.0,2.0,113.0,À Vendre,170000.0,28
1,Appartements,3.0,2.0,138.0,À Vendre,340000.0,52
2,Appartements,2.0,1.0,80.0,À Vendre,230000.0,31
3,Appartements,4.0,2.0,200.0,À Vendre,440000.0,31
4,Appartements,2.0,1.0,89.0,À Vendre,136000.0,83


In [38]:
# Keep only the columns that are part of the template; everything else is dropped
is_template_column = df.columns.isin(['surface', 'city', 'rooms', 'bathrooms', 'price'])
columns_to_drop = df.columns[~is_template_column].tolist()
df = df.drop(columns=columns_to_drop)

# Preview the remaining columns
df.head()

Unnamed: 0,rooms,bathrooms,surface,price,city
0,2.0,2.0,113.0,170000.0,28
1,3.0,2.0,138.0,340000.0,52
2,2.0,1.0,80.0,230000.0,31
3,4.0,2.0,200.0,440000.0,31
4,2.0,1.0,89.0,136000.0,83


In [39]:
# Template columns that the source file does not provide
new_columns = [
    'parking',
    'pool',
    'vue_panoramique',
    'jardin',
    'climatisation',
    'chauffage_central',
    'ascenseur',
]
# dict.fromkeys maps every new column name to None, so each column is added empty
df = df.assign(**dict.fromkeys(new_columns))

# Preview the dataframe with the added (empty) columns
df.head()

Unnamed: 0,rooms,bathrooms,surface,price,city,parking,pool,vue_panoramique,jardin,climatisation,chauffage_central,ascenseur
0,2.0,2.0,113.0,170000.0,28,,,,,,,
1,3.0,2.0,138.0,340000.0,52,,,,,,,
2,2.0,1.0,80.0,230000.0,31,,,,,,,
3,4.0,2.0,200.0,440000.0,31,,,,,,,
4,2.0,1.0,89.0,136000.0,83,,,,,,,


In [40]:
# Column layout expected by the template
column_order = [
    'surface',
    'city',
    'rooms',
    'bathrooms',
    'parking',
    'pool',
    'vue_panoramique',
    'jardin',
    'climatisation',
    'chauffage_central',
    'ascenseur',
    'price',
]
# Select every row with the columns in template order (a missing label raises KeyError)
df = df.loc[:, column_order]

# Preview the final layout
df.head()

Unnamed: 0,surface,city,rooms,bathrooms,parking,pool,vue_panoramique,jardin,climatisation,chauffage_central,ascenseur,price
0,113.0,28,2.0,2.0,,,,,,,,170000.0
1,138.0,52,3.0,2.0,,,,,,,,340000.0
2,80.0,31,2.0,1.0,,,,,,,,230000.0
3,200.0,31,4.0,2.0,,,,,,,,440000.0
4,89.0,83,2.0,1.0,,,,,,,,136000.0


In [41]:
## Write the reshaped dataframe back to FILE_PATH without the index column
## (FILE_PATH is the path the data was loaded from, so the source file is overwritten)
df.to_csv(path_or_buf=FILE_PATH, index=False)