In [None]:
#importing modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Mount Google Drive at /content/drive so files stored in Drive can be read
# by path in the cells below.
# NOTE(review): google.colab is presumably only available inside a Colab
# runtime - confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the housing CSV from the mounted Drive into a DataFrame.
# NOTE(review): the path is hard-coded to one particular Drive folder layout.
data=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Housing prices/housing.csv")

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# Column dtypes and non-null counts - used to spot columns with missing values.
data.info()

In [None]:
# Drop ocean_proximity so it is not part of the numeric pipeline below.
# errors="ignore" keeps the cell re-runnable: on a second execution the column
# is already gone and the drop is a no-op instead of raising KeyError.
data = data.drop(columns=["ocean_proximity"], errors="ignore")
data.info()

In [None]:
# Cleaning: instead of deleting the rows with null values, the gaps in
# total_bedrooms are filled by interpolation.
# The explicit cast pins the column to float64 (the author observed the dtype
# changing when interpolate() was used on its own).
data = data.assign(
    total_bedrooms=lambda frame: frame['total_bedrooms'].interpolate().astype('float64')
)
data.info()

In [None]:
# One histogram per column, to inspect the raw distributions.
data.hist(bins=60, figsize=(16, 9))
plt.show()

In [None]:
# Correlation matrix: shows how each feature relates to every other feature.
# One thing to remember: don't select/deselect features on the basis of the
# correlation matrix alone - feature knowledge and model performance have to
# be taken into account as well.
def corrMat(df,id=False):
    """Show the lower triangle of the correlation matrix of ``df`` as a heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric (and boolean) columns are correlated pairwise.
    id : bool, optional
        Unused; kept only so existing calls keep working.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Only numeric/bool columns can be correlated; selecting them explicitly
    # avoids the error that DataFrame.corr() raises on text columns in
    # pandas >= 2.0.
    corr_mat = df.select_dtypes(include=["number", "bool"]).corr().round(2)
    _, ax = plt.subplots(figsize=(6, 6))
    # Hide the diagonal and the upper triangle: they only repeat the
    # information already shown below the diagonal.
    mask = np.triu(np.ones(corr_mat.shape, dtype=bool))
    sns.heatmap(corr_mat, mask=mask, vmin=-1, vmax=1, center=0,
                cmap='plasma', square=False, lw=2, annot=True, cbar=False,
                ax=ax)
    plt.show()


corrMat(data)

In [None]:
# Remove outliers in median_house_value and housing_median_age: the rows that
# hold the maximum value of each column are left out of the new dataset.
max_val=data["median_house_value"].max()
new_data=data[data["median_house_value"]!=max_val]

max_val=new_data["housing_median_age"].max()
# .copy() makes new_data an independent frame, so the column assignments in
# later cells write to it directly instead of to a slice of `data` (which
# triggers pandas' SettingWithCopyWarning).
new_data=new_data[new_data["housing_median_age"]!=max_val].copy()
new_data.hist(bins=120,figsize=(16,9))
plt.show()

In [None]:
# Engineered features added to help the model:
#   diag_coordinate - longitude + latitude
#   bedperroom      - total_bedrooms / total_rooms
# assign() builds a new frame instead of writing columns into new_data in
# place, so no SettingWithCopyWarning is raised when new_data is a filtered
# slice of `data`.
new_data = new_data.assign(
    diag_coordinate=new_data["longitude"] + new_data["latitude"],
    bedperroom=new_data["total_bedrooms"] / new_data["total_rooms"],
)
new_data.hist(bins=120,figsize=(16,9))
plt.show()

In [None]:
# The histograms above show these features to be strongly skewed, so they are
# log-transformed. log1p(x) computes log(x + 1); the +1 keeps zero values
# valid, because log(0) is undefined.
# NOTE: the transformed values replace the originals in new_data, so running
# this cell twice applies the transform twice - re-run the cells that build
# new_data first.
LOG_COLUMNS = ["total_rooms", "total_bedrooms", "population", "households", "bedperroom"]
new_data = new_data.assign(
    **{column: np.log1p(new_data[column]) for column in LOG_COLUMNS}
)

new_data.hist(bins=120,figsize=(16,9))
plt.show()

In [None]:
#separating the data in input and output format
X,Y=new_data.drop(['median_house_value'],axis=1) , new_data['median_house_value']

In [None]:
#normalizing the input features of dataset
x_norm=(X-X.mean())/X.std()
x_norm.head()

In [None]:
# Preview the target values before they are normalised.
Y.head()

In [None]:
# Turn the target Series into a one-column DataFrame and z-score it the same
# way as the inputs. Because the models are trained on this standardised
# target, the errors reported later are in standardised units, not in the
# original units of median_house_value.
y_norm = Y.to_frame().pipe(lambda target: (target - target.mean()) / target.std())
y_norm.head()

In [None]:
#splitting the dataset into training and testing set
x_train,x_test,y_train,y_test=train_test_split(x_norm,y_norm,test_size=0.2,random_state=0)

In [None]:
# Polynomial regression: expand the inputs to all polynomial terms up to
# degree 4, then fit an ordinary least-squares model on the expanded features.
poly_model=PolynomialFeatures(degree=4)
poly_x_train=poly_model.fit_transform(x_train)
regression_model=LinearRegression()
regression_model.fit(poly_x_train,y_train)

# Evaluate on the held-out rows, reusing the transformer that was fitted on
# the training inputs (transform only, no refit).
poly_x_test=poly_model.transform(x_test)
y_pred=regression_model.predict(poly_x_test)
# RMSE on the standardised target. The square root is taken explicitly because
# the `squared=False` argument of mean_squared_error is deprecated and has
# been removed from recent scikit-learn releases.
np.sqrt(mean_squared_error(y_test,y_pred))

In [None]:
# Fit polynomial models of several degrees to see which one suits the data.
# Training RMSE alone cannot pick the degree: with least squares it never gets
# worse as terms are added. The RMSE on the held-out rows is therefore tracked
# as well, so over-fitting shows up as the two curves drifting apart.
total_degrees=[1,2,3,4]
plt_mean_squared_error=[]       # training RMSE, one entry per degree
plt_test_mean_squared_error=[]  # held-out RMSE, one entry per degree

for degri in total_degrees:
  poly_model=PolynomialFeatures(degree=degri)
  poly_x_train=poly_model.fit_transform(x_train)
  # transform only: the expansion fitted on the training inputs is reused
  poly_x_test=poly_model.transform(x_test)

  regression_model=LinearRegression()
  regression_model.fit(poly_x_train,y_train)

  # np.sqrt(MSE) instead of squared=False, which recent scikit-learn removed
  train_pred=regression_model.predict(poly_x_train)
  plt_mean_squared_error.append(np.sqrt(mean_squared_error(y_train,train_pred)))
  y_pred=regression_model.predict(poly_x_test)
  plt_test_mean_squared_error.append(np.sqrt(mean_squared_error(y_test,y_pred)))

plt.scatter(total_degrees,plt_mean_squared_error, color="green")
plt.plot(total_degrees,plt_mean_squared_error, color="red", label="train RMSE")
plt.plot(total_degrees,plt_test_mean_squared_error, color="blue", marker="o", label="test RMSE")
plt.xlabel("polynomial degree")
plt.ylabel("RMSE (standardised target)")
plt.legend()
plt.show()

In [None]:
# Test-set RMSE of the model left over from the degree loop (its last
# iteration, i.e. the highest degree in total_degrees).
# fit_transform is kept on purpose: PolynomialFeatures learns nothing from the
# data values (only the number of input columns), so refitting on x_test is
# harmless and makes this cell independent of what poly_model was last fitted
# on.
poly_x_test=poly_model.fit_transform(x_test)
y_pred=regression_model.predict(poly_x_test)
# np.sqrt(MSE) replaces squared=False, which is deprecated and has been
# removed from recent scikit-learn releases.
np.sqrt(mean_squared_error(y_test,y_pred))

In [None]:
#Fatest_poly=poly_reg.fit_transform(x_test)
#(pol_reg.predict(test_poly)-y_test).mean()

In [None]:
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras import regularizers

In [None]:
# Train/test split for the neural network: same inputs, test_size and
# random_state as the split used for the polynomial models.
train_data, test_data, train_labels, test_labels = train_test_split(
    x_norm, y_norm, test_size=0.2, random_state=0
)
# Min-max rescaling of the features. The scaler is fitted on the training rows
# only and then applied unchanged to both sets.
scaler = MinMaxScaler()
scaler.fit(train_data)
train_data = scaler.transform(train_data)
test_data = scaler.transform(test_data)


In [None]:
# Neural network architecture: four ReLU hidden layers (256, 256, 128, 64
# units), each with an L1 kernel penalty of 0.001, followed by a single linear
# output unit for the regression target.
def _relu_l1_dense(units, **extra):
    """Return a ReLU Dense layer with an L1(0.001) kernel regularizer."""
    return keras.layers.Dense(
        units,
        activation='relu',
        kernel_regularizer=regularizers.l1(0.001),
        **extra,
    )


model = keras.Sequential(
    # the first layer also declares the input width (number of feature columns)
    [_relu_l1_dense(256, input_shape=[train_data.shape[1]])]
    + [_relu_l1_dense(width) for width in (256, 128, 64)]
    + [keras.layers.Dense(1)]
)


In [None]:
# Compile: Adam optimizer, MSE as the training loss, MAE reported as an extra
# metric.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mean_absolute_error'],
)


In [None]:
# Train the model.
# early_stopping: stop once val_loss has not improved for 20 epochs.
# lr_scheduler:   halve the learning rate after 5 epochs without val_loss
#                 improvement, down to a floor of 1e-6.
# NOTE(review): the test split doubles as the validation set here, so both
# callbacks are steered by test data; a separate validation split would give
# an unbiased final test score.
early_stopping = EarlyStopping(monitor='val_loss', patience=20)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

history = model.fit(
    train_data,
    train_labels,
    batch_size=64,
    epochs=500,
    validation_data=(test_data, test_labels),
    # lr_scheduler was created but never passed in, so it had no effect
    callbacks=[early_stopping, lr_scheduler],
)