# Aim: To Predict Monthly House Rent in Mumbai City with given Data. 

## Approach:
- Import Dataset and have a Quick look at it.
- Analyze and Clean Data. Look into each and every feature and make them clean for use.
- Remove Outliers and exception data by visualizing features.
- Visualize the features and look at how they relate to the monthly rent.
- Prepare hand engineered features and Scale Data for Machine learning model.
- Try Different ML models and calculate their accuracy scores.
- Fine tune the model using GridsearchCV and Select best model.

# In this notebook we will focus on cleaning and preparing data. 

[Machine Learning modeling notebook](https://github.com/G0rav/House_Rent_Predictor/blob/main/HouseRentModel.ipynb)



# 1.Importing Data and required packages.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings                                             # only to ignore python warnings
%matplotlib inline
warnings.simplefilter("ignore")

In [None]:
data = pd.read_csv('/content/drive/MyDrive/Datasets/Mumbai_house_rent_99acers/Mumbai_99acers.csv')
data.head(3)

# 2.Quick Analyzing Data. 
<a href = '#top'>Back on Top</a>

In [None]:
data.info()

In [None]:
df = data.copy()

###  Dropping useless features. (In my view, as per requirements) 

In [None]:
# Columns are selected by position, so this list is tied to the column order
# of the loaded CSV.
unused_column_positions = [0, 4, 5, 12, 13, 15, 16, 17, 18, 19, 20, 21]
df = df.drop(columns=df.columns[unused_column_positions])
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.dtypes

### Removing stray spaces from non-numeric columns, just in case they have any. 

In [None]:
# Strip surrounding whitespace from every column except the two numeric ones.
# Non-string cells (for example NaN for a missing value) are passed through
# unchanged instead of raising AttributeError on `.strip()`.
for column in df.columns.drop(['built-up area', 'monthly rent']):
    df[column] = df[column].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

In [None]:
df.head()

# 3.Feature Engineering

 <a href = '#top'>Back on Top </a>

## Handling categorical columns

<a href = '#top'>Back on Top </a>

### Merging societies having count less than 10.

In [None]:
society_count = pd.DataFrame(df['society'].value_counts())
society_count

In [None]:
# Shape of the sub-table of societies with fewer than 10 listings.  The count
# column is addressed by position because its name depends on the pandas
# version ('society' before 2.0, 'count' from 2.0 on).
society_count[society_count.iloc[:, 0] < 10].shape

In [None]:
# Fold every society with fewer than 10 listings into one 'other society'
# bucket.  Counts are taken straight from the column as a Series, so the code
# does not depend on how value_counts() names its column (which changed in
# pandas 2.0).
society_listing_counts = df['society'].value_counts()
society_count_less_than_10_index_list = society_listing_counts[society_listing_counts < 10].index

# Vectorised replacement: keep the value where it is NOT a rare society.
is_rare_society = df['society'].isin(society_count_less_than_10_index_list)
df['society'] = df['society'].where(~is_rare_society, 'other society')

In [None]:
df['society'].value_counts()

In [None]:
# Listings per society.  `.iloc[1:]` skips the first (most frequent) row of
# value_counts() -- presumably the merged 'other society' bucket; confirm.
sc = pd.DataFrame(df['society'].value_counts().iloc[1:,])

plt.figure(figsize=(8,28))
# The count column is passed by position: it is named 'society' on
# pandas < 2.0 but 'count' on pandas >= 2.0, so looking it up by the name
# 'society' fails on newer versions.
sns.barplot(x=sc.iloc[:, 0], y=sc.index)
plt.show()

### type column

In [None]:
df['type'].value_counts()

### Merging locations having count less than 10. only to simplify our model.

In [None]:
location_count = pd.DataFrame(df['location'].value_counts())
location_count

In [None]:
# Fold every location with fewer than 10 listings into one 'other location'
# bucket.  Counts are taken straight from the column as a Series, so the code
# does not depend on how value_counts() names its column (which changed in
# pandas 2.0).
location_listing_counts = df['location'].value_counts()
location_count_less_than_10_index = location_listing_counts[location_listing_counts < 10].index

# Vectorised replacement: keep the value where it is NOT a rare location.
is_rare_location = df['location'].isin(location_count_less_than_10_index)
df['location'] = df['location'].where(~is_rare_location, 'other location')

In [None]:
df['location'].value_counts()

In [None]:
# Listings per location.  `.iloc[1:]` skips the first (most frequent) row of
# value_counts() -- presumably the merged 'other location' bucket; confirm.
location_count = pd.DataFrame(df['location'].value_counts().iloc[1:,])

plt.figure(figsize=(8,28))
# The count column is passed by position: it is named 'location' on
# pandas < 2.0 but 'count' on pandas >= 2.0, so looking it up by the name
# 'location' fails on newer versions.
sns.barplot(x=location_count.iloc[:, 0], y=location_count.index)
plt.show()

### furnishing

In [None]:
df.furnishing.value_counts()

In [None]:
# Share of listings per furnishing category; the counts are computed once and
# reused for both the wedge sizes and the labels.
furnishing_counts = df.furnishing.value_counts()
plt.pie(furnishing_counts, labels=furnishing_counts.index)
plt.show()

### Age

In [None]:
df.age.value_counts()

## Handling numerical data

<a href = '#top'>Back on Top </a>

### Extracting only numerals from bedrooms column.

In [None]:
df['bedrooms'].value_counts()

In [None]:
# Keep the first run of digits found in each bedrooms label.  Taking only the
# first character would truncate multi-digit counts (a value such as '10 BHK'
# would become '1') and would raise on empty or non-string cells.
# Cells without any digit become NaN, so they are removed by the later
# dropna() step instead of failing at the integer conversion.
df['bedrooms'] = df['bedrooms'].str.extract(r'(\d+)', expand=False)
df['bedrooms'].value_counts()

### Making floor and total floors = 0 for Independent House/Villa

In [None]:
df[df['type'] == 'Independent House/Villa']

In [None]:
# Independent houses/villas get floor = 0 and total floors = 0.
is_independent_house = df['type'] == 'Independent House/Villa'
type_index_list = df[is_independent_house].index

# A single .loc assignment writes directly into df.  The chained form
# df['floor'][i] = 0 assigns through an intermediate Series, which pandas
# flags with SettingWithCopyWarning and which does not modify df at all when
# copy-on-write is enabled.
df.loc[is_independent_house, ['floor', 'total floors']] = 0

### Cleaning floor column

In [None]:
df['floor'].unique()

In [None]:
# Map the textual floor labels onto numbers.
df = df.replace({'floor': ['Ground', 'ried']}, 0)
df = df.replace({'floor': '40+'}, 41)
# np.nan is used instead of np.NaN: the upper-case alias was removed in
# NumPy 2.0.
df = df.replace({'floor': 'Not Mentioned'}, np.nan)
# Blank out any remaining letters in the floor values.
df = df.replace({'floor': '[A-Za-z]'}, ' ', regex=True)

# Drop incomplete rows BEFORE converting to str.  astype(str) turns a missing
# value into the literal string 'nan', which dropna() no longer recognises.
df = df.dropna()
df['floor'] = df['floor'].astype(str).str.strip()

In [None]:
df = df.drop(df[df['floor']=='nan'].index, axis=0)
df['floor'].unique()

In [None]:
df['total floors'].unique()

In [None]:
(df['total floors'] == 'Not Mentioned').sum()

In [None]:
df[df['total floors'] == 'Not Mentioned']

In [None]:
# Treat 'Not Mentioned' as missing and drop every incomplete row.
# np.nan is used instead of np.NaN: the upper-case alias was removed in
# NumPy 2.0.
df = df.replace({'total floors': 'Not Mentioned'}, np.nan)
df = df.dropna()

In [None]:
df['total floors'].unique()

# 4.EDA

<a href = '#top'>Back on Top </a>

## EDA of numerical datatypes

<a href = '#top'>Back on Top </a>

In [None]:
df.info()

In [None]:
# Finally, cast the cleaned numeric-looking columns to integers in one call.
df = df.astype({'bedrooms': int, 'floor': int, 'total floors': int})

In [None]:
df.head()

In [None]:
def plot_num(df, columns):
    """Show a histogram, a box plot and a cumulative curve for each column.

    For every label in ``columns`` one 16x4 figure with three side-by-side
    panels is drawn and displayed:

    1. histogram with a KDE overlay,
    2. box plot with the mean marked,
    3. cumulative share of observations, built from ``np.histogram`` with
       its default binning.

    Parameters
    ----------
    df : DataFrame that contains the columns to plot.
    columns : iterable of column labels present in ``df``.
    """
    for column in columns:
        values = df[column]
        _, (hist_ax, box_ax, cdf_ax) = plt.subplots(1, 3, figsize=(16, 4))

        sns.histplot(values, kde=True, ax=hist_ax)
        hist_ax.set_title(f'{column}  Distribution')

        sns.boxplot(y=values, showmeans=True, ax=box_ax)
        box_ax.set_title(f'{column}  Distribution')

        # Running total of the bin counts, normalised by the overall total so
        # the curve ends at 1; plotted against each bin's right edge.
        counts, bin_edges = np.histogram(values)
        cumulative = np.cumsum(counts)
        cdf_ax.plot(bin_edges[1:], cumulative / cumulative[-1])
        cdf_ax.set_yticks(np.arange(0, 1.05, .05))
        cdf_ax.set_title(f'{column}  cdf')
        plt.show()

In [None]:
int_columns = df.columns[df.dtypes!='object']
plot_num(df,int_columns)

In [None]:
# Upper-tail summary (75th percentile, 95th percentile, maximum) per column.
for column in int_columns:
    column_values = df[column]
    print(f'For {column}:')

    for label, level in (('3º Quartile:', 0.75), ('95 Quantile:', 0.95)):
        print(label, column_values.quantile(q=level))
    print('Max:', column_values.quantile(q=1.00), '\n')

In [None]:
# Cap the long right tails: anything above the limit is set to the limit
# (bedrooms at 5, total floors at 41).  clip() does this in one vectorised
# call instead of a Python lambda per row.
# NOTE: unlike the lambda, clip() leaves a missing value as NaN rather than
# replacing it with the cap.
df['bedrooms'] = df['bedrooms'].clip(upper=5)
df['total floors'] = df['total floors'].clip(upper=41)

In [None]:
df['built-up area'].quantile(q = 0.99)

In [None]:
# Remove listings whose built-up area exceeds 3500.
oversized_area_rows = df.loc[df['built-up area'] > 3500].index
df.drop(index=oversized_area_rows, inplace=True)

In [None]:
int_columns = df.columns[df.dtypes!='object']
plot_num(df, int_columns)

In [None]:
sns.histplot(df['monthly rent'], kde=True)
plt.ticklabel_format(style='plain')
plt.xticks(rotation=45)
plt.show()

In [None]:
print('95th', df['monthly rent'].quantile(q = 0.95))
print('97th', df['monthly rent'].quantile(q = 0.97))
print('99th', df['monthly rent'].quantile(q = 0.99))

In [None]:
# Remove listings whose monthly rent exceeds 200000.
high_rent_rows = df.loc[df['monthly rent'] > 200000].index
df.drop(index=high_rent_rows, inplace=True)

In [None]:
int_columns = df.columns[df.dtypes!='object']
plot_num(df, int_columns)

# 5.Exporting now
Finally Cleaned Pheww.. :) 


<a href = '#top'>Back on Top </a>

In [None]:
df

In [None]:
df.to_csv('/content/drive/MyDrive/Datasets/Mumbai_house_rent_99acers/Mumbai_99acers_cleaned.csv', index=False)

### Log transformation

In [None]:
# Work on a copy so the cleaned frame itself is not modified here.
df1 = df.copy()
int_columns = df1.dtypes[df1.dtypes != 'object'].index

# Natural log of every non-object column; zeros are left as they are because
# log(0) is undefined.
for column in int_columns:
    df1[column] = df1[column].map(
        lambda value: value if value == 0 else np.log(value)
    )

In [None]:
plot_num(df1,int_columns)

In [None]:
# Replace the rent by its natural logarithm.  The ufunc is called on the whole
# column with NumPy floating-point warnings silenced, which is how
# Series.apply handles a ufunc.
with np.errstate(all='ignore'):
    df['monthly rent'] = np.log(df['monthly rent'])

In [None]:
df.to_csv('/content/drive/MyDrive/Datasets/Mumbai_house_rent_99acers/Mumbai_99acers_logtransformed.csv', index=False)

# Created by: Gaurav Jain

<a href = 'https://www.linkedin.com/in/gaurav2022/'> Linkedin </a>

<a href = 'https://github.com/G0rav'> Github </a>

<a href = 'https://www.kaggle.com/gaurav2022'> Kaggle </a>
