In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# A Notebook to Predict the Final Sale Price of Each Home

## Import Dependencies

In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Load the Data

In [4]:
# Read the competition's train and test CSV files into two DataFrames.
train_csv_path = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
test_csv_path = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
df = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [5]:
# First look at the training frame: its dimensions, then the leading rows.
train_shape = df.shape
print("Shape is: ", train_shape)
df.head(n=5)

In [6]:
# Same first look for the test frame: dimensions, then the leading rows.
test_shape = test_data.shape
print("Shape is: ", test_shape)
test_data.head(n=5)

In [7]:
# Print a per-column summary of the training frame (dtype and non-null count),
# which shows which columns are categorical and which contain missing values.
df.info()

## Encode Categorical Data

In [8]:
# Frequency of each distinct value in one example categorical column.
zoning = df.loc[:, 'MSZoning']
zoning.value_counts()

In [9]:
# List the object-dtype (categorical) columns of the training frame.
object_frame = df.select_dtypes(include='O')
cat = object_frame.columns
cat

In [10]:
# Encode Categorical Variables
#
# One LabelEncoder per column is fitted on the union of the train and test
# values, so a given category maps to the same integer code in both frames.
# Fitting an independent encoder on each frame assigns codes by sorted position
# within that frame, so the codes disagree whenever the two frames do not
# contain exactly the same set of categories.
#
# Values are cast to str before encoding, as in the original cell; for
# object-dtype columns this turns NaN into the literal "nan", so missing values
# end up as their own category.
for column_name in df.select_dtypes(include='O').keys():
    print("Encoding Column: ", column_name)
    train_values = df[column_name].astype(str)
    le = LabelEncoder()
    if column_name in test_data.columns:
        test_values = test_data[column_name].astype(str)
        le.fit(pd.concat([train_values, test_values], ignore_index=True))
        test_data[column_name] = le.transform(test_values)
    else:
        # Column only exists in the training frame (nothing to keep in sync).
        le.fit(train_values)
    df[column_name] = le.transform(train_values)

# Any object column that is still un-encoded in the test frame (i.e. it was not
# an object column in the training frame) is encoded on its own.
for column_name in test_data.select_dtypes(include='O').keys():
    print("Encoding Column: ", column_name)
    le = LabelEncoder()
    test_data[column_name] = le.fit_transform(test_data[column_name].astype(str))

In [11]:
# Remove the row identifier from the training frame so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: `del df['Id']` raises KeyError
# the second time the cell is executed in the same kernel.
df = df.drop(columns=['Id'], errors='ignore')

In [12]:
# Leading rows of the training frame after encoding.
df.head(n=5)

In [13]:
# Leading rows of the test frame after encoding.
test_data.head(n=5)

## Handle Missing Values

In [14]:
# Number of missing values in each training column.
missing_mask = df.isna()
missing_mask.sum()

In [15]:
# Remove the same set of mostly-missing features from both frames so that the
# train and test columns stay aligned.
high_missing_columns = ['FireplaceQu', 'Fence', 'Alley', 'MiscFeature', 'PoolQC']
df = df.drop(columns=high_missing_columns)
test_data = test_data.drop(columns=high_missing_columns)

In [16]:
# Fill NaN values: categorical (object) features with the mode, all others with the mean.
#
# Note: columns that were label-encoded earlier are numeric by this point and
# therefore take the mean branch.

def _fill_missing(frame):
    """Fill NaNs in `frame` column by column and return the same frame.

    Object-dtype columns are filled with their most frequent value; every other
    column is filled with its mean. Each frame is imputed with its own
    statistics, as in the original cell.
    """
    # Computed once per frame instead of once per column.
    object_columns = set(frame.select_dtypes(include='O').columns)
    for column_name in frame.columns:
        print("Evaluating Column: ", column_name)
        column = frame[column_name]
        if column_name in object_columns:
            # mode() returns a Series (there may be ties). Passing that Series
            # to fillna aligns on the index and does not fill with the modal
            # value, so take the first mode as a scalar.
            modes = column.mode()
            if not modes.empty:  # an all-NaN column has no mode; leave it untouched
                frame[column_name] = column.fillna(modes.iloc[0])
        else:
            frame[column_name] = column.fillna(column.mean())
    return frame


df = _fill_missing(df)
test_data = _fill_missing(test_data)

In [17]:
# Re-count missing values per training column after imputation.
remaining_missing = df.isna()
remaining_missing.sum()

In [18]:
# Re-count missing values per test column after imputation.
remaining_missing_test = test_data.isna()
remaining_missing_test.sum()

In [19]:
# Dimensions and leading rows of the cleaned training frame.
train_shape = df.shape
print("Shape of Train Data: ", train_shape)
df.head(n=5)

In [20]:
# Dimensions and leading rows of the cleaned test frame.
test_shape = test_data.shape
print("Shape of Test Data: ", test_shape)
test_data.head(n=5)

## Train-Test Split

In [21]:
# Separate the target column from the feature columns.
predict = "SalePrice"
y = df[predict].to_numpy()            # 1-D array of target values
df_train = df.drop(predict, axis=1)   # every remaining column is a feature

In [22]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# (rows, feature columns) of the training portion of the split.
X_train.shape

In [24]:
# Length of the training target array; should match the row count of X_train.
y_train.shape

## Build the Model

In [26]:
from xgboost import XGBRegressor

# XGBoost with default hyper-parameters; report its score on the held-out split.
model_1 = XGBRegressor()
model_1 = model_1.fit(X_train, y_train)
xgb_score = model_1.score(X_test, y_test)
print(xgb_score)

In [27]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters. The forest is randomised
# (bootstrap samples), so it is seeded with the same value used for the
# train/validation split; otherwise the printed score and the final
# predictions change on every run.
model_2 = RandomForestRegressor(random_state=42)
model_2.fit(X_train, y_train)
# For regressors, score() is the R^2 coefficient of determination on the held-out split.
print(model_2.score(X_test, y_test))

In [28]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyper-parameters. random_state is fixed (same
# seed as the train/validation split) so repeated runs give the same model.
model_3 = GradientBoostingRegressor(random_state=42)
model_3.fit(X_train, y_train)
# For regressors, score() is the R^2 coefficient of determination on the held-out split.
print(model_3.score(X_test, y_test))

In [29]:
from lightgbm import LGBMRegressor

# LightGBM with default hyper-parameters; report its score on the held-out split.
model_4 = LGBMRegressor()
model_4 = model_4.fit(X_train, y_train)
lgbm_score = model_4.score(X_test, y_test)
print(lgbm_score)

In [30]:
from catboost import CatBoostRegressor

# CatBoost with default hyper-parameters; report its score on the held-out split.
model_5 = CatBoostRegressor()
model_5 = model_5.fit(X_train, y_train)
catboost_score = model_5.score(X_test, y_test)
print(catboost_score)

In [31]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline; report its score on the held-out split.
model_6 = LinearRegression()
model_6 = model_6.fit(X_train, y_train)
linear_score = model_6.score(X_test, y_test)
print(linear_score)

## Test The Model

### The final model is an ensemble of all 6 ML models: its prediction is the mean of the six models' outputs

In [32]:
# Ensemble prediction: the unweighted mean of the six models' outputs.
#
# Each model scores the whole test frame in a single call instead of one row at
# a time, and the results frame is built once instead of being grown row by
# row. This also removes the dependency on DataFrame.append, which no longer
# exists in pandas 2.0+.
ensemble_models = [model_1, model_2, model_3, model_4, model_5, model_6]

# "Id" is only an identifier; the models were fitted without it.
test_features = test_data.drop(columns=["Id"])

# One 1-D prediction array per model, cast to a common float dtype before averaging.
per_model_predictions = [
    np.asarray(model.predict(test_features), dtype=float)
    for model in ensemble_models
]
ensemble_prediction = np.mean(per_model_predictions, axis=0)

results_df = pd.DataFrame({
    "Id": test_data["Id"].to_numpy(),
    "SalePrice": ensemble_prediction,
})

In [33]:
# Dimensions and leading rows of the prediction table.
# (The label previously said "Train Data", but the frame shown is results_df.)
print("Shape of Results: ", results_df.shape)
results_df.head()

In [34]:
# Store the identifier column as integers rather than floats.
results_df = results_df.astype({"Id": int})

In [35]:
# Confirm the column dtypes of the results frame after the Id cast.
results_df.dtypes

In [36]:
# Save the Id / SalePrice table to the working directory, without the index column.
output_path = "Results.csv"
results_df.to_csv(output_path, encoding='utf-8', index=False)