In [1]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import xlrd

In [2]:
# Load 'cleaned_data.xlsx' into the working DataFrame `df` used by every cell below.
df = pd.read_excel(io="cleaned_data.xlsx")

## Part 1: Getting Characteristic Rules
### Step 1: Removing identifier and free-text columns that are not needed for the analysis

In [3]:
# Drop the name/id columns, the project URL and the free-text description,
# then show which columns remain.
df = df.drop(
    columns=[
        'Property_Name',
        'Property_id',
        'Project_URL',
        'builder_id',
        'Builder_name',
        'description',
    ]
)
df.columns

Index(['Property_type', 'Property_status', 'Price_per_unit_area', 'Posted_On',
       'Property_building_status', 'City_id', 'City_name', 'No_of_BHK',
       'Locality_ID', 'Locality_Name', 'Longitude', 'Latitude', 'Price',
       'Size', 'Sub_urban_ID', 'Sub_urban_name', 'is_furnished',
       'listing_domain_score', 'is_plot', 'is_RERA_registered', 'is_Apartment',
       'is_ready_to_move', 'is_commercial_Listing', 'is_PentaHouse',
       'is_studio', 'Listing_Category'],
      dtype='object')

### Step 2: Generalization
#### Generalizing Price_per_unit_area

In [4]:
# Generalize Price_per_unit_area into the labels 'Low' / 'Medium' / 'High'.
#
# The column is routed through str so that any thousands separators (commas)
# can be stripped before the values are parsed as floats.
unit_price = (
    df['Price_per_unit_area']
    .astype(str)
    .str.replace(',', '', regex=False)  # literal comma, not a regex
    .astype(float)
)

# Right-closed bins:  Low <= 4000 < Medium <= 8000 < High.  NaN stays NaN.
# pd.cut assigns every bucket in a single pass instead of overwriting the
# column with 0/1/2 sentinel codes that share the scale of the real values.
df['Price_per_unit_area'] = pd.cut(
    unit_price,
    bins=[-np.inf, 4000, 8000, np.inf],
    labels=['Low', 'Medium', 'High'],
).astype(object)  # plain string labels (object dtype) rather than a Categorical
df['Price_per_unit_area']

0         Medium
1         Medium
2         Medium
3            Low
4         Medium
           ...  
143703      High
143704      High
143705      High
143706      High
143707      High
Name: Price_per_unit_area, Length: 143708, dtype: object

#### Generalizing Price

In [5]:
# Generalize Price into the labels 'Low' / 'Medium' / 'High'.
#
# The column is routed through str so that any thousands separators (commas)
# can be stripped before the values are parsed as floats.
price = (
    df['Price']
    .astype(str)
    .str.replace(',', '', regex=False)  # literal comma, not a regex
    .astype(float)
)

# Right-closed bins:  Low <= 6,000,000 < Medium <= 15,000,000 < High.  NaN stays NaN.
# pd.cut assigns every bucket in a single pass instead of overwriting the
# column with 0/1/2 sentinel codes that share the scale of the real values.
df['Price'] = pd.cut(
    price,
    bins=[-np.inf, 6_000_000, 15_000_000, np.inf],
    labels=['Low', 'Medium', 'High'],
).astype(object)  # plain string labels (object dtype) rather than a Categorical
df['Price']

0         Medium
1           High
2         Medium
3            Low
4         Medium
           ...  
143703       Low
143704       Low
143705       Low
143706       Low
143707       Low
Name: Price, Length: 143708, dtype: object

#### Generalizing Size

In [6]:
# Generalize Size into the labels 'Low' / 'Medium' / 'High'.
#
# The column is routed through str so that thousands separators (commas) and
# the ' sq ft' unit suffix can be stripped before parsing the values as floats.
size = (
    df['Size']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace(' sq ft', '', regex=False)
    .astype(float)
)

# Right-closed bins:  Low <= 2000 < Medium <= 10000 < High.  NaN stays NaN.
# pd.cut assigns every bucket in a single pass instead of overwriting the
# column with 0/1/2 sentinel codes that share the scale of the real values.
df['Size'] = pd.cut(
    size,
    bins=[-np.inf, 2000, 10000, np.inf],
    labels=['Low', 'Medium', 'High'],
).astype(object)  # plain string labels (object dtype) rather than a Categorical


In [7]:
# Preview the first 100 generalized Size labels.
df['Size'].iloc[:100]

0        Low
1     Medium
2     Medium
3        Low
4        Low
       ...  
95       Low
96    Medium
97       Low
98       Low
99    Medium
Name: Size, Length: 100, dtype: object

In [8]:
# Concept hierarchy: locality -> sub-urban -> city.
# Collect the distinct values at each level, then print how many each level has
# (city, sub-urban, locality — in that order).
city_no, suburban_no, localities_no = (
    df[level].unique()
    for level in ('City_name', 'Sub_urban_name', 'Locality_Name')
)
print(*(len(values) for values in (city_no, suburban_no, localities_no)))

8 84 1244


In [9]:
# Generalization threshold: 10 distinct values.
#  - City_name has 8 distinct values, so it is kept as is.
#  - Sub-urban and locality both exceed the threshold and would be generalized
#    up to city level; because a city column already exists, their name and id
#    columns are simply dropped.
df = df.drop(
    columns=['Locality_Name', 'Locality_ID', 'Sub_urban_name', 'Sub_urban_ID']
)
df.columns

Index(['Property_type', 'Property_status', 'Price_per_unit_area', 'Posted_On',
       'Property_building_status', 'City_id', 'City_name', 'No_of_BHK',
       'Longitude', 'Latitude', 'Price', 'Size', 'is_furnished',
       'listing_domain_score', 'is_plot', 'is_RERA_registered', 'is_Apartment',
       'is_ready_to_move', 'is_commercial_Listing', 'is_PentaHouse',
       'is_studio', 'Listing_Category'],
      dtype='object')

In [10]:
# Price can be calculated from Price_per_unit_area and Size, so the column is
# redundant and is removed.
df = df.drop(columns='Price')
df.columns

Index(['Property_type', 'Property_status', 'Price_per_unit_area', 'Posted_On',
       'Property_building_status', 'City_id', 'City_name', 'No_of_BHK',
       'Longitude', 'Latitude', 'Size', 'is_furnished', 'listing_domain_score',
       'is_plot', 'is_RERA_registered', 'is_Apartment', 'is_ready_to_move',
       'is_commercial_Listing', 'is_PentaHouse', 'is_studio',
       'Listing_Category'],
      dtype='object')