In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso,Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pickle as pkl

In [2]:
# Read the raw Mumbai listings file and preview its first rows.
DATA_FILE = 'mumbai-house-prices.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0.1,Unnamed: 0,Price,Area,Location,No. of Bedrooms,New/Resale,Gymnasium,Lift Available,Car Parking,Maintenance Staff,24x7 Security,Children's Play Area,Clubhouse,Intercom,Landscaped Gardens,Indoor Games,Gas Connection,Jogging Track,Swimming Pool
0,0,4850000,720,Kharghar,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0
1,1,4500000,600,Kharghar,1,0,1,1,1,1,1,0,1,0,0,0,0,1,1
2,2,6700000,650,Kharghar,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1
3,3,4500000,650,Kharghar,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0
4,4,5000000,665,Kharghar,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0


In [3]:
# Keep only Price, Area, Location and No. of Bedrooms: the saved index column,
# New/Resale and the 0/1 amenity columns are not used by the models below.
unused_columns = [
    'Unnamed: 0', 'New/Resale', 'Gymnasium', 'Lift Available', 'Car Parking',
    'Maintenance Staff', '24x7 Security', "Children's Play Area", 'Clubhouse',
    'Intercom', 'Landscaped Gardens', 'Indoor Games', 'Gas Connection',
    'Jogging Track', 'Swimming Pool',
]
# errors='ignore' lets the cell be re-run after the columns are already gone
# (a plain drop would raise KeyError the second time).
df = df.drop(columns=unused_columns, errors='ignore')
df.shape

(6347, 4)

In [4]:
# Correlation of every numeric column with Price; Location is text, so it is left out.
df_temp = df.drop(columns=['Location'])
df_temp.corr().loc[:, 'Price']

Price              1.000000
Area               0.722336
No. of Bedrooms    0.594865
Name: Price, dtype: float64

In [5]:
# Summary statistics of the numeric columns (df_temp is df without Location).
df_temp.describe()

Unnamed: 0,Price,Area,No. of Bedrooms
count,6347.0,6347.0,6347.0
mean,15154010.0,1004.327084,1.910036
std,20159430.0,556.375703,0.863304
min,2000000.0,200.0,1.0
25%,5300000.0,650.0,1.0
50%,9500000.0,905.0,2.0
75%,17500000.0,1182.0,2.0
max,420000000.0,8511.0,7.0


In [6]:
# Using IQR
def remove_outlier(df, cols, factor=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    Columns are processed in order, and each column's quartiles are computed on
    the rows that survived the previous columns' filters.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str
        Column names to filter on, applied sequentially.
    factor : float, default 1.5
        Multiplier of the IQR used for the fences:
        ``[q1 - factor * iqr, q3 + factor * iqr]`` (both ends inclusive).

    Returns
    -------
    pandas.DataFrame
        The rows inside the fences for every column in ``cols``. The original
        index labels are kept. If ``cols`` is empty, ``df`` itself is returned.
    """
    for col in cols:
        q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
        iqr = q3 - q1
        low = q1 - factor * iqr
        high = q3 + factor * iqr
        # between() is inclusive on both ends, i.e. low <= value <= high.
        df = df[df[col].between(low, high)]
    return df

In [7]:
# Model log(Price) instead of raw Price; predictions are mapped back with np.exp further down.
# NOTE: this overwrites df['Price'] in place, so running the cell twice applies the log twice.
# Re-run from the data-loading cell before executing it again.
df['Price'] = np.log(df['Price'])

In [8]:
# remove_outlier returns a boolean-filtered slice of df. Take an explicit copy so the
# later in-place edits of df_outlier_removed do not raise SettingWithCopyWarning.
df_outlier_removed = remove_outlier(df, ['Area', 'No. of Bedrooms']).copy()
df_outlier_removed.describe()

Unnamed: 0,Price,Area,No. of Bedrooms
count,6008.0,6008.0,6008.0
mean,16.026443,914.106525,1.799434
std,0.774861,351.789133,0.726134
min,14.508658,200.0,1.0
25%,15.444751,640.0,1.0
50%,16.012735,880.0,2.0
75%,16.588099,1131.0,2.0
max,18.538464,1980.0,3.0


In [9]:
# Re-check the correlations with Price on the outlier-filtered rows (Location is text, so it is excluded).
df_temp = df_outlier_removed.drop(columns=['Location'])
df_temp.corr().loc[:, 'Price']

Price              1.000000
Area               0.652071
No. of Bedrooms    0.672782
Name: Price, dtype: float64

In [10]:
# Column order matters below: the encoders pick Location by position (index 1 once Price is dropped).
df_outlier_removed.columns

Index(['Price', 'Area', 'Location', 'No. of Bedrooms'], dtype='object')

In [11]:
# Locations with fewer listings than this are pooled into a single 'Other' category.
MIN_LOCATION_COUNT = 10

# Get the value counts of each location
location_counts = df_outlier_removed['Location'].value_counts()

# Identify locations with counts less than the threshold
locations_less_than_10 = location_counts[location_counts < MIN_LOCATION_COUNT].index

# Build a new frame instead of writing into the filtered slice in place: this avoids
# SettingWithCopyWarning and inplace=True, and leaves the cell safe to re-run.
is_rare_location = df_outlier_removed['Location'].isin(locations_less_than_10)
df_outlier_removed = (
    df_outlier_removed
    .assign(Location=df_outlier_removed['Location'].mask(is_rare_location, 'Other'))
    .reset_index(drop=True)
)

In [12]:
# Separate the log-price target from the feature columns, then display both.
y = df_outlier_removed['Price']
x = df_outlier_removed.drop(columns=['Price'])
x, y

(      Area       Location  No. of Bedrooms
 0      720       Kharghar                1
 1      600       Kharghar                1
 2      650       Kharghar                1
 3      650       Kharghar                1
 4      665       Kharghar                1
 ...    ...            ...              ...
 6003   700          Other                1
 6004   900     Thane West                2
 6005   900     Thane West                2
 6006  1380         Boisar                3
 6007   700  Badlapur East                1
 
 [6008 rows x 3 columns],
 0       15.394489
 1       15.319588
 2       15.717618
 3       15.319588
 4       15.424948
           ...    
 6003    14.725783
 6004    16.489659
 6005    16.489659
 6006    15.226498
 6007    14.827111
 Name: Price, Length: 6008, dtype: float64)

In [13]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=12)

In [14]:
# One-hot encode Location (column 1); Area and No. of Bedrooms pass through unchanged.
# The encoder keeps its default sparse output and the ColumnTransformer densifies the
# stacked result (sparse_threshold=0). This avoids OneHotEncoder's `sparse=` keyword,
# which was renamed to `sparse_output=` in scikit-learn 1.2 and removed in 1.4, so the
# cell runs on both old and current releases.
step1 = ColumnTransformer(
    transformers=[
        ('coln_transform', OneHotEncoder(drop='first'), [1]),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

step2 = RandomForestRegressor(n_estimators=25, random_state=12)

# Chain preprocessing and the model so the same encoding is applied at fit and predict time.
pipe = Pipeline([
    ('step1', step1),
    ('step2', step2),
])
pipe.fit(x_train, y_train)

# R² on the held-out rows (the target is log(Price)).
y_pred = pipe.predict(x_test)
r2_score(y_test, y_pred)



0.8511536955369221

In [15]:
# Look over the cleaned frame (log Price, rare locations grouped as 'Other') behind the train/test split.
df_outlier_removed

Unnamed: 0,Price,Area,Location,No. of Bedrooms
0,15.394489,720,Kharghar,1
1,15.319588,600,Kharghar,1
2,15.717618,650,Kharghar,1
3,15.319588,650,Kharghar,1
4,15.424948,665,Kharghar,1
...,...,...,...,...
6003,14.725783,700,Other,1
6004,16.489659,900,Thane West,2
6005,16.489659,900,Thane West,2
6006,15.226498,1380,Boisar,3


In [16]:
# Price estimate for one listing: Area 650, Location "Bhandup West", 2 bedrooms.
# The row is built as a DataFrame with the training columns so the pipeline receives
# the same column names and order it was fitted on, rather than an anonymous list.
sample_listing = pd.DataFrame([[650, "Bhandup West", 2]], columns=x_train.columns)

# The model predicts log(Price); np.exp converts it back to the original price scale.
np.exp(pipe.predict(sample_listing)[0])



14166813.323164614

In [20]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Uses x_train, y_train, x_test, y_test from the train/test split cell above.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling repeat
# between runs (same seed value as the random_state used for the split and the forest).
# Some GPU ops can still be nondeterministic.
tf.keras.utils.set_random_seed(12)

# One-hot encode Location (column 1) and standardise the numeric columns (0 and 2).
# The encoder keeps its default sparse output and sparse_threshold=0 makes the
# ColumnTransformer return a dense array, which Keras needs. This avoids OneHotEncoder's
# `sparse=` keyword (renamed to `sparse_output=` in scikit-learn 1.2, removed in 1.4).
column_transformer = ColumnTransformer(
    transformers=[
        ('coln_transform', OneHotEncoder(drop='first'), [1]),
        ('scaler', StandardScaler(), [0, 2]),  # Apply standardization to numerical features
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

# Fit the preprocessing on the training rows only, then reuse it for the test rows
x_train_transformed = column_transformer.fit_transform(x_train)
x_test_transformed = column_transformer.transform(x_test)

# Define the ANN model: two hidden ReLU layers and a single linear output for regression
ann_model = Sequential([
    Dense(64, activation='relu', input_dim=x_train_transformed.shape[1]),  # input width = number of encoded features
    Dense(64, activation='relu'),
    Dense(1)  # Output layer with 1 neuron for regression
])

# Compile the ANN model
ann_model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Fit the ANN model to the transformed data
ann_model.fit(x_train_transformed, y_train, epochs=50, batch_size=32, verbose=0)  # Adjust the number of epochs

# Make predictions with the ANN (log-price scale, shape (n_samples, 1))
y_pred = ann_model.predict(x_test_transformed)



In [None]:
# R² of the ANN on the held-out rows, using the predictions from the training cell.
ann_r2 = r2_score(y_test, y_pred)
print(f"ANN R-squared Score: {ann_r2:.2f}")

# The network was trained on the transformed features, so predict on those (the raw
# frames still contain the Location strings). The output has shape (n, 1); ravel() gives
# a flat vector. This is a regression model, so it is scored with R², not argmax/accuracy.
y_train_pred = ann_model.predict(x_train_transformed).ravel()
y_test_pred = ann_model.predict(x_test_transformed).ravel()

# Comparing training and test R² shows how much the network overfits.
train_score = r2_score(y_train, y_train_pred)
test_score = r2_score(y_test, y_test_pred)

print(f"Training Score: {train_score:.2f}")
print(f"Test Score: {test_score:.2f}")

In [41]:
# Layer-by-layer overview of the network with its parameter counts.
ann_model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_23 (Dense)            (None, 64)                5888      
                                                                 
 dense_24 (Dense)            (None, 64)                4160      
                                                                 
 dense_25 (Dense)            (None, 1)                 65        
                                                                 
Total params: 10,113
Trainable params: 10,113
Non-trainable params: 0
_________________________________________________________________


In [42]:
# Price estimate from the ANN for one listing: Area 650, Location "Mulund West", 2 bedrooms.
# The row is built as a DataFrame with the training columns so the fitted transformer
# receives the same column names and order it was fitted on.
input_data = pd.DataFrame([[650, "Mulund West", 2]], columns=x_train.columns)

# Keep the encoded row under its own name instead of overwriting the raw input.
input_transformed = column_transformer.transform(input_data)

# The network predicts log(Price); np.exp converts it back to the original price scale.
np.exp(ann_model.predict(input_transformed))





array([[19262454.]], dtype=float32)