In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib

In [None]:
# Load the Bengaluru house-price CSV from the Kaggle input directory
# and preview its first rows.
data=pd.read_csv('/kaggle/input/house-data-prediction/Bengaluru_House_Data.csv')
data.head()

### Data Preparation and Cleaning

In [None]:
# (number of rows, number of columns) of the raw data
data.shape

In [None]:
# Print the frequency table of every column to get a feel for its values.
for column_name in data.columns:
    column_counts = data[column_name].value_counts()
    print(column_counts)

In [None]:
# Number of missing values in each column of the raw data
data.isnull().sum()

In [None]:
# Column labels of the raw data
data.columns

In [None]:
# Columns removed before modelling, with the rationale for each:
#   1) society      - too many missing values
#   2) availability - assumed not to affect the price of the house
#   3) balcony      - many missing values; the balcony count is assumed not to affect the price
#   4) area_type    - describes the kind of area measured, not the actual square footage
columns_to_drop = ['area_type', 'availability', 'balcony', 'society']
data_new = data.drop(columns=columns_to_drop)
data_new.head()

In [None]:
# Count the missing values per column in the columns that remain
data_new.isna().sum()

In [None]:
# Rows with missing values are few compared with the overall data, so every row
# containing at least one missing value is dropped; the second line confirms
# that no missing values remain.
data_new = data_new.dropna(axis=0, how='any')
data_new.isnull().sum()

In [None]:
# Distinct raw values of the size column (inspect the text format before parsing it)
data_new['size'].unique()

In [None]:
# Keep only the leading token of the size text (the part before the first space).
# The value is still a string here; it is converted to a number in a later cell.
data_new['bhk'] = data_new['size'].map(lambda size_text: size_text.split(' ', 1)[0])
data_new.head()

In [None]:
# Inspect the distinct raw values of the total_sqft column
data_new.total_sqft.unique()

In [None]:
# Ranges in total_sqft (e.g. "1000 - 2000") are replaced by their average.
def convert_sqft(x):
    """Convert a raw total_sqft entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number ("1200"), a range written as "low - high",
        or an already-numeric value.

    Returns
    -------
    float or None
        The parsed number, the midpoint of the range, or None when the
        entry cannot be interpreted as either (for example text carrying
        a unit suffix, or a missing value passed as None).
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. "10-20Sq. Meter", "-5", "1e-3");
                # fall through and try to parse the whole text as one number.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        # Only parsing failures are treated as "no value"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

In [None]:
# Convert every total_sqft entry to a number using convert_sqft;
# entries it cannot convert become missing values.
data_new['total_sqft'] = data_new['total_sqft'].map(convert_sqft)

In [None]:
# Preview the frame after converting total_sqft
data_new.head()

Creating a price-per-square-foot column, which expresses the price of each house relative to its area

In [None]:
# Price per square foot: the price scaled by 1000000, divided by the area.
# NOTE(review): the 1000000 factor assumes a particular price unit; if `price`
# is quoted in lakhs (1 lakh = 100000) the factor should be 100000 - TODO confirm.
# This column is derived from `price`, so it carries the target's information.
price_scaled = data_new['price'] * 1000000
data_new['price_sqft'] = price_scaled / data_new['total_sqft']

In [None]:
# Preview the frame with the new price_sqft column
data_new.head()

In [None]:
# Number of distinct (categorically different) location values before location cleaning
data_new.location.nunique()

In [None]:
# Strip leading/trailing whitespace from the location names and store the result.
# The stripped values must be assigned back to the column; calling the strip
# without assignment only displays them and leaves the column unchanged.
data_new['location'] = data_new['location'].str.strip()
data_new['location']

In [None]:
# Number of rows per location, most frequent location first
location_counts = data_new.groupby("location")["location"].count()
location_stats = location_counts.sort_values(ascending=False)
print(location_stats)

In [None]:
# Count the locations that occur 10 times or fewer (the original author's run
# reported 1063 such locations out of 1304). To avoid creating a separate dummy
# variable for each of them, they are later grouped together as "other".
int((location_stats <= 10).sum())

In [None]:
# Group the rarely occurring locations (10 rows or fewer) under the label "other".
# The set of rare locations is built once, outside the per-row function, instead
# of re-filtering location_stats for every row.
rare_locations = set(location_stats[location_stats <= 10].index)
data_new['location'] = data_new['location'].apply(
    lambda loc: "other" if loc in rare_locations else loc
)
# Show the number of distinct locations left after grouping (a bare expression
# in the middle of a cell would be computed but never displayed).
print(len(data_new['location'].unique()))
data_new.head()

In [None]:
# The raw size text is no longer needed once bhk has been extracted from it.
data_new = data_new.drop(columns=['size'])
data_new.head()

In [None]:
# bhk was extracted as text; convert it to a floating-point number.
data_new['bhk'] = data_new['bhk'].astype('float64')

### Creating important UDFs for the continuous and categorical variables

In [None]:
# UDF that summarises one continuous variable
def continuous_var_summary( x ):
    """Build a statistical profile of a numeric Series.

    Parameters
    ----------
    x : pd.Series
        The continuous variable to summarise.

    Returns
    -------
    pd.Series
        Values labelled 'dtype', 'cardinality', 'n_tot', 'n', 'nmiss',
        'perc_miss', 'sum', 'mean', 'std', 'var', 'lc_iqr', 'uc_iqr',
        'min', 'p1' ... 'p99' and 'max', in that order.
    """
    # Row count and missing-value share
    total_rows = x.shape[0]
    missing_rows = x.isna().sum()

    # Outlier fences: 1.5 * IQR below the first / above the third quartile
    lower_quartile = x.quantile(0.25)
    upper_quartile = x.quantile(0.75)
    fence_width = 1.5 * (upper_quartile - lower_quartile)

    summary = {
        'dtype': x.dtype,
        'cardinality': x.nunique(),
        'n_tot': total_rows,
        'n': x.count(),
        'nmiss': missing_rows,
        'perc_miss': missing_rows * 100 / total_rows,
        'sum': x.sum(),
        'mean': x.mean(),
        'std': x.std(),
        'var': x.var(),
        'lc_iqr': lower_quartile - fence_width,
        'uc_iqr': upper_quartile + fence_width,
        'min': x.min(),
    }

    # Percentiles, listed between the minimum and the maximum
    percentile_levels = (
        ('p1', 0.01), ('p5', 0.05), ('p10', 0.10),
        ('p25', 0.25), ('p50', 0.5), ('p75', 0.75),
        ('p90', 0.90), ('p95', 0.95), ('p99', 0.99),
    )
    for label, level in percentile_levels:
        summary[label] = x.quantile(level)
    summary['max'] = x.max()

    return pd.Series(list(summary.values()), index=list(summary.keys()))

### Exploratory analysis

In [None]:
# Data type of each column after cleaning
data_new.dtypes

In [None]:
# Missing values per column after the conversions above
data_new.isna().sum()

In [None]:
# total_sqft and price_sqft contain missing values (total_sqft entries that
# could not be converted became missing). They could be filled with the mean,
# but here the affected rows are simply dropped.
data_new = data_new.dropna(how='any')

In [None]:
# Separate the predictor (x) variables from the target (y) variable, price.
# price_sqft is deliberately NOT used as a predictor: it is computed from the
# target itself (price * 1000000 / total_sqft), so together with total_sqft it
# would let a model reconstruct price exactly (target leakage) and inflate the scores.
cols=['total_sqft', 'bath', 'bhk']
xvar_continuous=data_new[cols]
xvar_categorical=data_new['location']
yvar=data_new['price']

Data Exploration For Continuous variables

In [None]:
# Profile every float64 continuous feature with continuous_var_summary
float_features = xvar_continuous.select_dtypes(include=['float64'])
float_features.apply(continuous_var_summary)

In [None]:
# Comparing p99 with the maximum of the continuous x variables shows a large
# jump, which points to outliers; each variable is therefore capped at its own
# 1st and 99th percentiles.
def _clip_to_percentiles(column):
    """Cap a column at its own 1st and 99th percentiles."""
    lower_bound = column.quantile(0.01)
    upper_bound = column.quantile(0.99)
    return column.clip(lower=lower_bound, upper=upper_bound)

xvar_continuous = xvar_continuous.apply(_clip_to_percentiles)

Data Exploration For Categorical Variable

In [None]:
# Summary (count, unique, top, freq) of the categorical location variable
xvar_categorical.describe()

In [None]:
# A utility function that replaces a categorical column with dummy variables
def create_dummies(df, colname):
    """Return a copy of ``df`` with ``colname`` replaced by its dummy columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the categorical column; it is not modified.
    colname : str
        Name of the column to encode. It is also used as the prefix of the
        generated dummy columns.

    Returns
    -------
    pd.DataFrame
        The original columns without ``colname``, followed by one dummy
        column per category except the first (``drop_first=True``).
    """
    indicator_columns = pd.get_dummies(df[colname], prefix=colname, drop_first=True)
    combined = pd.concat([df, indicator_columns], axis=1)
    return combined.drop(colname, axis=1)

In [None]:
# Convert the categorical location variable to numeric dummy variables.
# NOTE(review): every category gets its own column here (no drop_first), unlike
# the create_dummies helper - confirm that this is intended.
dummies = pd.get_dummies(data=xvar_categorical)
dummies.head()

Appending all the variables into a single dataframe

In [None]:
# Join the continuous features, the target and the location dummies column-wise.
frames_to_join = [xvar_continuous, yvar, dummies]
data_final = pd.concat(frames_to_join, axis='columns')
data_final.head()

#### Splitting the data into X features and Y Target variable


In [None]:
# Every column except price is a feature. Index.difference returns the
# remaining labels in sorted order, which fixes the feature column order.
feature_columns = data_final.columns.difference(['price'])
X_final = data_final[feature_columns]
Y_final = data_final['price']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
X_train,X_test,Y_train,Y_test=train_test_split(X_final,Y_final,train_size=0.7,random_state=1234)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Baseline: a linear regression fitted on the training split and scored
# (R^2) on the held-out test split.
lr_clf = LinearRegression().fit(X_train, Y_train)
lr_clf.score(X_test, Y_test)

#### Defining various models

In [None]:
# Candidate regressors, each with fixed (untuned) hyperparameters.

#1. Linear Regressor model
from sklearn.linear_model import LinearRegression
lr=LinearRegression()

#2. Decision Tree Regressor
# The split criterion is left at its default (squared error). The explicit
# criterion='mse' spelling was removed from scikit-learn (1.2), where passing it
# raises an error; omitting it selects the same criterion on old and new versions.
from sklearn.tree import DecisionTreeRegressor
dt=DecisionTreeRegressor(random_state=123,max_depth=10)

#3. Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor(random_state=123,n_estimators=100,max_depth=10)

#4. K-Nearest Neighbors Regressor
from sklearn.neighbors import KNeighborsRegressor
knn=KNeighborsRegressor(n_neighbors=10)

#5. XGBoost Regressor
from xgboost import XGBRegressor
xgb=XGBRegressor(random_state=123)



In [None]:
# Fit a model on the training split and report its training-set score.
def train_accuracy(model):
    """Fit ``model`` on X_train/Y_train and return its training score in percent.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``.

    Returns
    -------
    float
        ``model.score(X_train, Y_train)`` multiplied by 100 and rounded to
        two decimals. The score is measured on the same data the model was
        fitted on.
    """
    model.fit(X_train, Y_train)
    score_percent = model.score(X_train, Y_train) * 100
    return np.round(score_percent, 2)

In [None]:
# Summary table of the training-set score of every model.
# The table is stored under its own name: assigning it to `train_accuracy`
# would overwrite the train_accuracy() function and make this cell fail when re-run.
models_by_name = {
    'LinearReg': lr,
    'DecisionTree': dt,
    'RandomForest': rf,
    'KNN': knn,
    'XgBoost': xgb,
}
train_accuracy_table = pd.DataFrame(
    {'Train_accuracy(%)': [train_accuracy(model) for model in models_by_name.values()]},
    index=list(models_by_name.keys()),
)
sorted_train_accuracy = train_accuracy_table.sort_values(by = 'Train_accuracy(%)', ascending = False)

In [None]:
# Training-set scores of the regressors, best first
sorted_train_accuracy

##### Hence, high-accuracy models were built with basic data cleaning steps and machine learning algorithms. Note that the scores in the table above are measured on the training data.