# Importing the standard libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

In [None]:
# Load the Kaggle "House Prices" train/test CSVs.
# The data directory defaults to the original download location but can be
# overridden with the HOUSE_PRICES_DATA_DIR environment variable, so the
# notebook is not tied to a single machine's absolute path.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get(
    "HOUSE_PRICES_DATA_DIR",
    "C:\\Users\\aarus\\Downloads\\house-prices-advanced-regression-techniques",
))
train_data = pd.read_csv(DATA_DIR / "train.csv")
test_data = pd.read_csv(DATA_DIR / "test.csv")

In [None]:
train_data.shape, test_data.shape

In [None]:
# Wrap the loaded tables in the DataFrame objects used by the rest of the notebook.
df_train, df_test = (pd.DataFrame(loaded) for loaded in (train_data, test_data))

**The code below (currently commented out) lifts pandas' display limits so that every row and column is shown — the training data has 1460 rows and 81 columns**

In [None]:
#pd.set_option("display.max_column", None) 
#pd.set_option("display.max_row",None)

# Understanding the training and test data

In [None]:
df_train.head()

In [None]:
df_train.info()

In [None]:
df_train.describe()

In [None]:
df_train.isnull().sum() #Check for null values

In [None]:
# Append one extra row to the TEST data; only its Id (2920) is set, every other
# column of that row is NaN.
# NOTE(review): this row flows through the rest of the notebook like any other
# test row - confirm that it is really wanted.
extra_row = {'Id': [2920]}   # not named `dict`, which would shadow the builtin
dfadd = pd.DataFrame(extra_row)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_test = pd.concat([df_test, dfadd], ignore_index=True)
df_test

**Concatenate the two dataframes**

In [None]:
# Stack train and test so that cleaning and encoding are applied to both at once.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of df_train and df_test overlap, which makes label-based
# lookups ambiguous. Train rows come first, so positional slicing still
# separates the two parts.
df = pd.concat([df_train, df_test], ignore_index=True)
df

In [None]:
df.info()

In [None]:
#Total features with numeric values
# Column labels of every numeric feature, followed by how many there are.
numeric_features = df.select_dtypes(include=["number"]).columns
len(numeric_features)

In [None]:
#Total features with string or categorical value
# Column labels of every object-dtype (string / categorical) feature, followed by how many there are.
string_features = df.select_dtypes(include=["object"]).columns
len(string_features)

# Visualising the data

- dataprep is a library that helps in performing EDA on a dataset

In [None]:
from dataprep.eda import plot
from dataprep.eda import plot_missing
# Configure seaborn in ONE call: sns.set() resets every option it is not given
# back to its default, so a separate sns.set(font_scale=1) afterwards would
# silently undo style="whitegrid".
sns.set(style="whitegrid", color_codes=True, font_scale=1)
# Distribution overview of the LotFrontage column.
plot(df["LotFrontage"])

In [None]:
plot(df["OverallQual"])

In [None]:
plot(df["LotArea"])

**Plotting the correlation tells us**
- How strongly each feature is related to our target variable
- The higher the correlation value, the more strongly that feature is related to the target variable

In [None]:
#Plotting the correlation of all the features with SalePrice
from dataprep.eda import plot_correlation
plot_correlation(df, "SalePrice", value_range=[0, 1])

**Plotting the missing values of every feature in the dataset**

In [None]:
plot_missing(df)

In [None]:
# we can drop the id column because it's of no use as a feature
#df.drop(columns=['Id'])

In [None]:
# Percentage of missing values per column.
# The mean of the boolean null mask is the fraction of missing rows, so this
# stays correct for any number of rows instead of relying on a hard-coded 2920.
null_percentage = df.isnull().mean() * 100
null_percentage

In [None]:
# Features for which more than half of the values are missing.
mostly_missing = null_percentage.loc[null_percentage > 50]
print(mostly_missing)

**Making a copy of our dataset *df* so that our original data is not tampered with**

In [None]:
df_copy=df.copy()
df_copy.shape

# Dealing with null values

In [None]:
# Impute missing MSZoning values with the most frequent category.
# The result is assigned back to the column: replace(..., inplace=True) on
# df_copy["MSZoning"] acts on an intermediate Series and is not guaranteed to
# update df_copy (it does not under pandas Copy-on-Write).
mszoning_mode = df_copy["MSZoning"].mode()[0]
df_copy["MSZoning"] = df_copy["MSZoning"].fillna(mszoning_mode)

In [None]:
# Missing Alley entries are given the explicit category "NA".
# Assign back to the column; an inplace replace on df_copy["Alley"] is not
# guaranteed to modify df_copy (it does not under pandas Copy-on-Write).
alley_val = "NA"
df_copy["Alley"] = df_copy["Alley"].fillna(alley_val)

In [None]:
# Impute missing LotFrontage values with the column mean.
# Assign back to the column; an inplace replace on df_copy["LotFrontage"] is
# not guaranteed to modify df_copy (it does not under pandas Copy-on-Write).
lotfrontage_mean = df_copy["LotFrontage"].mean()
df_copy["LotFrontage"] = df_copy["LotFrontage"].fillna(lotfrontage_mean)
df_copy["LotFrontage"].isnull().sum()   # remaining nulls in the column

In [None]:
#Changing the utilities
# Impute missing Utilities values with the most frequent category.
# Assign back to the column; an inplace replace on df_copy["Utilities"] is not
# guaranteed to modify df_copy (it does not under pandas Copy-on-Write).
utility_mode = df_copy["Utilities"].mode()[0]
df_copy["Utilities"] = df_copy["Utilities"].fillna(utility_mode)

In [None]:
#Changing the Exterior1st & Exterior2nd
# Impute missing Exterior1st / Exterior2nd values with each column's most
# frequent category. The mode is taken from df_copy (the frame being cleaned)
# and the result is assigned back to the column, because an inplace replace on
# df_copy[col] is not guaranteed to modify df_copy (it does not under pandas
# Copy-on-Write).
exterior1st_mode = df_copy["Exterior1st"].mode()[0]
df_copy["Exterior1st"] = df_copy["Exterior1st"].fillna(exterior1st_mode)
exterior2nd_mode = df_copy["Exterior2nd"].mode()[0]
df_copy["Exterior2nd"] = df_copy["Exterior2nd"].fillna(exterior2nd_mode)
df_copy["Exterior1st"].isnull().sum()   # remaining nulls in Exterior1st

In [None]:
#Changing MasVnrtype & MasVnrArea
# Impute missing MasVnrType values with the most frequent category.
# The mode is taken from df_copy (the frame being cleaned) and the result is
# assigned back, because an inplace replace on df_copy["MasVnrType"] is not
# guaranteed to modify df_copy (it does not under pandas Copy-on-Write).
masvnrtype_mode = df_copy["MasVnrType"].mode()[0]
df_copy["MasVnrType"] = df_copy["MasVnrType"].fillna(masvnrtype_mode)
df_copy["MasVnrType"].isnull().sum()   # remaining nulls in the column

In [None]:
# Missing MasVnrArea values are set to 0.
# Assign back to the column; an inplace replace on df_copy["MasVnrArea"] is
# not guaranteed to modify df_copy (it does not under pandas Copy-on-Write).
df_copy["MasVnrArea"] = df_copy["MasVnrArea"].fillna(0)
df_copy["MasVnrArea"].isnull().sum()   # remaining nulls in the column

In [None]:
#Change the basement features
# Missing values in the numeric basement columns are set to 0.
# All columns are filled in one assignment: fillna(..., inplace=True) on
# df_copy[col] acts on an intermediate Series and is not guaranteed to update
# df_copy (it does not under pandas Copy-on-Write).
basement_numeric_cols = [
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "BsmtFullBath",
    "BsmtHalfBath",
]
df_copy[basement_numeric_cols] = df_copy[basement_numeric_cols].fillna(0)

In [None]:
# Impute missing Electrical values with the most frequent category.
# The mode is taken from df_copy (the frame being cleaned) and the result is
# assigned back, because an inplace replace on df_copy["Electrical"] is not
# guaranteed to modify df_copy (it does not under pandas Copy-on-Write).
Electrical_mode = df_copy["Electrical"].mode()[0]
df_copy["Electrical"] = df_copy["Electrical"].fillna(Electrical_mode)
df_copy["Electrical"].isnull().sum()   # remaining nulls in the column

In [None]:
# Impute missing KitchenQual values with the most frequent category.
# (This cell previously appeared twice verbatim; one copy is enough.)
# The mode is taken from df_copy (the frame being cleaned) and the result is
# assigned back, because an inplace replace on df_copy["KitchenQual"] is not
# guaranteed to modify df_copy (it does not under pandas Copy-on-Write).
KitchenQual_mode = df_copy["KitchenQual"].mode()[0]
df_copy["KitchenQual"] = df_copy["KitchenQual"].fillna(KitchenQual_mode)
df_copy["KitchenQual"].isnull().sum()   # remaining nulls in the column

In [None]:
# Impute missing Functional values with the most frequent category.
# The mode is taken from df_copy (the frame being cleaned) and the result is
# assigned back, because an inplace replace on df_copy["Functional"] is not
# guaranteed to modify df_copy (it does not under pandas Copy-on-Write).
Functional_mode = df_copy["Functional"].mode()[0]
df_copy["Functional"] = df_copy["Functional"].fillna(Functional_mode)
df_copy["Functional"].isnull().sum()   # remaining nulls in the column

In [None]:
# Impute missing SaleType values with the most frequent category.
# The mode is taken from df_copy (the frame being cleaned) and the result is
# assigned back, because an inplace replace on df_copy["SaleType"] is not
# guaranteed to modify df_copy (it does not under pandas Copy-on-Write).
SaleType_mode = df_copy["SaleType"].mode()[0]
df_copy["SaleType"] = df_copy["SaleType"].fillna(SaleType_mode)
df_copy["SaleType"].isnull().sum()   # remaining nulls in the column

In [None]:
# Fireplaces (a count) and PoolArea (an area) are numeric in this dataset.
# Filling them with the string "NA" would turn the whole column into object
# dtype, and the later one-hot encoding would then treat every distinct number
# as a category. Missing values are therefore set to 0.
numeric_cols = ["Fireplaces", "PoolArea"]
df_copy[numeric_cols] = df_copy[numeric_cols].fillna(0)

# For these categorical columns a missing value gets the explicit category "NA".
# Results are assigned back because fillna(..., inplace=True) on df_copy[col]
# is not guaranteed to modify df_copy (it does not under pandas Copy-on-Write).
categorical_cols = ["PoolQC", "Fence", "MiscFeature"]
df_copy[categorical_cols] = df_copy[categorical_cols].fillna("NA")

In [None]:
# Garage columns: missing categorical values get the explicit category "NA",
# missing numeric values are set to 0.
# Results are assigned back because fillna(..., inplace=True) on df_copy[col]
# is not guaranteed to modify df_copy (it does not under pandas Copy-on-Write).
garage_categorical = ["GarageType", "GarageFinish", "GarageQual", "GarageCond"]
garage_numeric = ["GarageYrBlt", "GarageCars", "GarageArea"]
df_copy[garage_categorical] = df_copy[garage_categorical].fillna("NA")
df_copy[garage_numeric] = df_copy[garage_numeric].fillna(0)

In [None]:
df_copy.head()

In [None]:
df_copy.isnull().sum()

# Preprocessing the data 

- Converting the categorical features to numerical values, because machine-learning models require numeric input

In [None]:
# NOTE(review): OneHotEncoder is imported here but the encoding further down
# is done with pd.get_dummies - confirm whether this import is still needed.
from sklearn.preprocessing import OneHotEncoder  
# Work on a copy so that df_copy keeps the cleaned but un-encoded data.
df_encode=df_copy.copy()
# Names of all object-dtype columns, i.e. the ones to be encoded.
object_features=df_encode.select_dtypes(include="object").columns.tolist()
df_encode.shape

In [None]:
df_encode.head()

In [None]:
df_encode[object_features].head(2) #Getting all the data with categorical values

In [None]:
# One-hot encode every object-dtype column; the first level of each feature is
# dropped and the new columns are prefixed with the source column name.
df_encode = pd.get_dummies(
    df_encode,
    columns=object_features,
    prefix=object_features,
    drop_first=True,
)

In [None]:
df_encode["MSZoning_FV"].value_counts()

In [None]:
df_encode.shape

In [None]:
df_encode.select_dtypes(include="object").columns.tolist()

**Filling the remaining null values with 0**

In [None]:
df_modified=df_encode.fillna(0)

In [None]:
len_train=len(df_train)
len_train

# Splitting the data into training and testing data

In [None]:
# Split the combined frame back into its train and test parts by position:
# the first len_train rows came from the training data.
# Besides the target, "Id" is removed from the features - it is only a row
# identifier and carries no predictive information. It remains available in
# df_modified for building the result table.
feature_frame = df_modified.drop(columns=["SalePrice", "Id"])
x_train = feature_frame.iloc[:len_train]
y_train = df_modified["SalePrice"].iloc[:len_train]
x_test = feature_frame.iloc[len_train:]

In [None]:
x_test[1:100]

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training features only, then apply
# the same transformation to both the training and the test features.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Importing RandomForestRegressor algorithm

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Create the regressor. random_state is fixed so that re-running the notebook
# builds the same forest and therefore produces the same predictions.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the regressor on the training features and target.
model.fit(x_train, y_train)

# Making Predictions

In [None]:
y_pred = model.predict(x_test)

In [None]:
y_pred

In [None]:
df_modified["Id"][len_train:] #since we also need ID

In [None]:
# Result table: one row per test Id (used as the index), predictions in column 0.
test_ids = df_modified["Id"].iloc[len_train:]
df_result = pd.DataFrame(data=y_pred, index=test_ids)

In [None]:
df_result

In [None]:
# Name the prediction column after the target. The dataset's target column is
# "SalePrice" (not "SalesPrice"), so the output matches the training data.
df_result = df_result.rename(columns={0: 'SalePrice'})
df_result

In [None]:
df_result.shape

# Visualising the result dataframe

In [None]:
# Histogram (20 bins, normalised to a density) of the predicted prices.
# A title and axis labels are set so that the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(df_result, 20,
        density=True,
        histtype='bar',
        facecolor='b',
        alpha=0.5)
ax.set(title="Distribution of predicted sale prices",
       xlabel="Predicted sale price",
       ylabel="Density")

plt.show()

In [None]:
# Write the predictions to Housingpred_vals.csv (path relative to the current
# working directory); the DataFrame index is written as the first column.
df_result.to_csv('Housingpred_vals.csv') #converting to csv file