# ML Project House Price Prediction

In [12]:
# Step 1 Load Important Modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# ==============================
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
print('Done')

Done


In [13]:
# Step 2 Load dataset
try:
  from sklearn.datasets import fetch_california_housing
  raw_data = fetch_california_housing()
  print('Done')
except:
  try:
    raw_data = pd.read_csv('/content/house_data.csv')
  except:
    raw_data = pd.read_csv('house_data.csv')
  finally:
    print('Done')

Done


FileNotFoundError: [Errno 2] No such file or directory: 'house_data.csv'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
try:
  X = raw_data['data']
  y = raw_data['target']
  print('Done')
except:
  X = raw_data.iloc[:,:-1]
  y = raw_data.iloc[:,-1]
  print('Done')
# Equation of Line: y = M*X + C: Best Line Find

In [None]:
try:
  df = pd.DataFrame(X,columns = raw_data['feature_names'])
except:
  df = X

df.sample(3)

# Step 3: EDA

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.info()

In [None]:
# Machine Learning Each X data much be numerical

In [None]:
# checking null values

df.isna().sum()
# Since all values are zero hence no missing data

In [None]:
r,c = df.shape
print('Total rows',r)
print('Total Columns',c)

In [None]:
plt.figure(figsize = (30,20))

first = 1
for i in df:
  plt.subplot(2,4,first)
  plt.title(f'Analysis by: {i}')
  sns.histplot(data = df, x = i, color = 'r',
               kde = True)
  plt.xticks(rotation = 45)
  first += 1
plt.savefig('Numerical_Analysis.jpg', dpi  = 1000)
plt.show()

In [None]:
# to check correlation b/w X features
df.corr()

In [None]:
# to check correlation by visuals
sns.pairplot(data = df)
plt.show()

In [None]:
sns.heatmap(df.corr(), annot = True)
plt.show()

In [None]:
plt.hist(y, color = 'b', label = 'House Price')
plt.legend()
plt.show()

In [None]:
# print(raw_data['DESCR'])

In [None]:
print(y)

In [None]:
# House Price Ans: 2.5  = 2.5 * 100000
# 4.5 = 4.5*100000


In [None]:
final_X = df.iloc[:, :-2]

In [None]:
# Here we are using -2 Ignore last two columns

Step 4 : Convert data to standard Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# ML: Works on distance algos, it is
# important to keep every data at same scale

final_X = df.iloc[:, :-2]

scaler = StandardScaler()
scaled_X = scaler.fit_transform(final_X)
print('Done')

In [None]:
#

# Step 5: ML Model Creation

In [None]:
# train test split: divide data into two parts

In [None]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train, y_test = train_test_split(scaled_X, y)
print('Done')

In [None]:
all_models = [LinearRegression,
              SVR,
              DecisionTreeRegressor,
              RandomForestRegressor,
              KNeighborsRegressor]

all_models_name = ['Linear regression',
                   'Support Vector',
                   'Decision Tree',
                   'Random Forest',
                   'KNN']
print('Done')

In [None]:
model_dict = {'Model_Name':[],
              'ML_Model':[],
              'Score':[],
              'MAE':[],
              'MSE':[],
              'RMSE':[]}

counter = 0
for i in all_models:
  print('Training: ',all_models_name[counter])
  ml_model = i()

  # Training
  ml_model.fit(X_train,y_train)

  # prediction
  y_pred = ml_model.predict(X_test)

  # Model Score
  score = ml_model.score(X_test,y_test)

  # Errors Comparison
  mae = mean_absolute_error(y_test,y_pred)
  mse = mean_squared_error(y_test,y_pred)
  rmse = mse**0.5
  # RMSE: root mean squared Error

  # data append

  model_dict['Model_Name'].append(all_models_name[counter])
  counter += 1

  model_dict['ML_Model'].append(ml_model)
  model_dict['Score'].append(score)
  model_dict['MAE'].append(mae)
  model_dict['MSE'].append(mse)
  model_dict['RMSE'].append(rmse)

print('Done')

In [None]:
#

# Step 6: Model Comparison

In [None]:
compare_df = pd.DataFrame(model_dict)

In [None]:
compare_df

In [None]:
# Because RandomForest has highest score,
# we will use RF
# as final Model

In [None]:
plt.title('All ML Model Comparison')
chart = sns.barplot(data = compare_df,
            x= 'Model_Name',
            y = 'Score',
            hue = 'Model_Name',
            palette = sns.color_palette('rainbow'))

for i in chart.containers:
  plt.bar_label(i)
plt.xticks(rotation = 45)
plt.show()

# Step 7: Final Model Save

In [None]:
# Random Forest

final_model = compare_df.iloc[3,1]
print('Done')

# Step 8: Save necessary objects

In [None]:
# Model save
import pickle
with open('chatgpt.pkl', 'wb') as f:
  pickle.dump(final_model,f)
  print('Model Saved Successfully')


# Step 9 : Website Deployment using Streamlit

In [None]:
# pip install streamlit
print('Done')