# Imports

In [1]:
import pandas as pd # to interact with dataframes 
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder #for preprocessing
import joblib #to save encoders and models 
import os #to interact with hardware and create folders 

# Constants

In [2]:
# Output locations used by the preprocessing step (all relative to the working directory)
_ENCODER_ROOT = 'Objects/Encoders/'  # common parent folder of every saved encoder
LABEL_ENCODER_PATH = _ENCODER_ROOT + 'LabelEncoder/'  # locality label encoders
ORDINAL_ENCODER_PATH = _ENCODER_ROOT + 'OrdinalEncoder/'  # ordinal encoders, one subfolder per column
DESTINATION = 'Data/'  # folder that receives the per-city CSV files

# Reading the Data

In [3]:
# load the cleaned listings for all cities into a single dataframe
data = pd.read_csv(filepath_or_buffer='_All_Cities_Cleaned.csv')

In [4]:
# preview the first 5 rows to check the columns and value formats
data.head(n=5)

Unnamed: 0,seller_type,bedroom,layout_type,property_type,locality,price,area,furnish_type,bathroom,city
0,OWNER,2.0,BHK,Apartment,Bodakdev,20000.0,1450.0,Furnished,2.0,Ahmedabad
1,OWNER,1.0,RK,Studio Apartment,CG Road,7350.0,210.0,Semi-Furnished,1.0,Ahmedabad
2,OWNER,3.0,BHK,Apartment,Jodhpur,22000.0,1900.0,Unfurnished,3.0,Ahmedabad
3,OWNER,2.0,BHK,Independent House,Sanand,13000.0,1285.0,Semi-Furnished,2.0,Ahmedabad
4,OWNER,2.0,BHK,Independent House,Navrangpura,18000.0,1600.0,Furnished,2.0,Ahmedabad


In [5]:
# Split the combined frame into one independent frame per city.
# .copy() detaches each subset from `data`, so later changes to a city frame
# are never written back to (or warned about as a view of) the combined frame.
ahmedabad = data.loc[data['city'].eq('Ahmedabad')].copy()
bangalore = data.loc[data['city'].eq('Bangalore')].copy()
chennai = data.loc[data['city'].eq('Chennai')].copy()
delhi = data.loc[data['city'].eq('Delhi')].copy()
hyderabad = data.loc[data['city'].eq('Hyderabad')].copy()
kolkata = data.loc[data['city'].eq('Kolkata')].copy()
mumbai = data.loc[data['city'].eq('Mumbai')].copy()
pune = data.loc[data['city'].eq('Pune')].copy()

In [6]:
# Upper-case city labels; each one becomes a key of df_dict.
# NOTE(review): 'AHEMDABAD' is spelled differently from the 'Ahmedabad' value in the data.
# It is left unchanged here because preproccess() puts these keys into the names of the
# files it saves - confirm nothing depends on the current spelling before correcting it.
cities = [
    'AHEMDABAD',
    'BANGALORE',
    'CHENNAI',
    'DELHI',
    'HYDERABAD',
    'KOLKATA',
    'MUMBAI',
    'PUNE',
]
# The per-city dataframes, listed in the same order as `cities`
df_list = [
    ahmedabad,
    bangalore,
    chennai,
    delhi,
    hyderabad,
    kolkata,
    mumbai,
    pune,
]
# Pair every label with its dataframe (pairing is purely positional)
df_dict = {name: frame for name, frame in zip(cities, df_list)}

In [7]:
def _save_encoder(encoder, directory, filename):
    """Dump ``encoder`` with joblib to ``directory/filename``, creating ``directory`` first if needed."""
    # exist_ok=True replaces a separate os.path.exists() check, so a folder that
    # appears between the check and the creation can no longer make this raise
    os.makedirs(directory, exist_ok=True)
    joblib.dump(encoder, os.path.join(directory, filename))


def _price_ordered_encoder(df, column):
    """Return an unfitted OrdinalEncoder whose categories for ``column`` are sorted by ascending mean PRICE."""
    categories = (
        df.groupby(by=[column])['PRICE']
        .mean()
        .sort_values(ascending=True)
        .index.values.tolist()
    )
    return OrdinalEncoder(categories=[categories])


def preproccess(df_dict):
    """Encode, filter and save every city dataframe in ``df_dict``.

    For each ``(city, dataframe)`` pair the function:

    1. drops the ``city`` column and duplicated rows;
    2. renames the remaining columns (by position) to upper-case names;
    3. ordinal-encodes ``FURNISH TYPE`` with the fixed order
       Unfurnished < Semi-Furnished < Furnished;
    4. ordinal-encodes ``SELLER TYPE``, ``LAYOUT TYPE`` and ``PROPERTY TYPE``
       with categories ordered by their ascending mean ``PRICE``;
    5. upper-cases and label-encodes ``LOCALITY``;
    6. keeps only rows whose ``PRICE`` lies within 1.5 * IQR of the quartiles;
    7. writes the result to ``DESTINATION/<city>.csv`` (without the index).

    Every fitted encoder is dumped with joblib under ``ORDINAL_ENCODER_PATH``
    or ``LABEL_ENCODER_PATH``, with the city key as file-name prefix.

    The input dataframes are NOT modified: all work happens on a copy, so the
    function can be called again on the same ``df_dict`` (the previous in-place
    version failed on a second call because ``city`` was already dropped).

    NOTE(review): the encoders are fitted before the outlier rows are removed,
    so outlier prices still influence the price-based category order. This
    ordering of steps is kept from the original implementation.

    Parameters
    ----------
    df_dict : dict
        Maps a city key (used in output file names) to that city's dataframe.
        After removing ``city`` each frame must have exactly nine columns in
        the order listed in ``column_names`` below, because the rename is
        positional.

    Returns
    -------
    None
        Results are written to disk only.

    Raises
    ------
    KeyError
        If a dataframe has no ``city`` column.
    ValueError
        If a dataframe does not have exactly nine columns once ``city`` is
        removed (raised by the positional rename).
    """
    column_names = ['SELLER TYPE','BEDROOM','LAYOUT TYPE','PROPERTY TYPE','LOCALITY','PRICE','AREA','FURNISH TYPE','BATHROOM']
    # (column to encode, folder/file-name stem) for the price-ordered encoders
    price_ordered_columns = (
        ('SELLER TYPE', 'seller_type'),
        ('LAYOUT TYPE', 'layout_type'),
        ('PROPERTY TYPE', 'property_type'),
    )

    for city, raw_df in df_dict.items():
        # Handling duplicates and dropping irrelevant columns.
        # Non-inplace calls plus an explicit copy keep the caller's frame untouched.
        df = raw_df.drop(columns=['city']).drop_duplicates().copy()

        # Converting columns to upper case (positional rename)
        df.columns = column_names

        # Handling furniture type: fixed, domain-defined order
        furnish_enc = OrdinalEncoder(categories=[['Unfurnished','Semi-Furnished','Furnished']])
        df['FURNISH TYPE'] = furnish_enc.fit_transform(df[['FURNISH TYPE']])
        _save_encoder(
            furnish_enc,
            os.path.join(ORDINAL_ENCODER_PATH, 'furniture_encoders'),
            f'{city}_furnish_type_encoder.pkl',
        )

        # Handling seller_type, layout_type, property_type:
        # categories are ranked by mean price, cheapest first. Each ranking only
        # groups PRICE by its own column, so encoding one column does not affect
        # the ranking computed for the next one.
        for column, folder in price_ordered_columns:
            encoder = _price_ordered_encoder(df, column)
            df[column] = encoder.fit_transform(df[[column]])
            _save_encoder(
                encoder,
                os.path.join(ORDINAL_ENCODER_PATH, folder),
                f'{city}_{folder}_encoder.pkl',
            )

        # Handling locality: upper case for consistency, then label-encode
        df['LOCALITY'] = df['LOCALITY'].str.upper()
        locality_enc = LabelEncoder()
        df['LOCALITY'] = locality_enc.fit_transform(df['LOCALITY'])
        _save_encoder(locality_enc, LABEL_ENCODER_PATH, f'{city}_locality_encoder.pkl')

        # Removing outliers: keep prices inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
        q1 = df['PRICE'].quantile(0.25)
        q3 = df['PRICE'].quantile(0.75)
        iqr = q3 - q1
        lower_limit = q1 - 1.5*iqr
        upper_limit = q3 + 1.5*iqr
        df = df[(df['PRICE']>=lower_limit)&(df['PRICE']<=upper_limit)]

        # Saving the dataframe.
        # Create the folder named by DESTINATION (not a hard-coded 'Data/'), so the
        # directory that is created is always the one the CSV is written to.
        os.makedirs(DESTINATION, exist_ok=True)
        df.to_csv(os.path.join(DESTINATION,f'{city}.csv'),index=False)

In [8]:
# run the preprocessing for every city frame; encoders and CSV files are written to disk
preproccess(df_dict=df_dict)