<h1 style="text-align: center;" class="list-group-item list-group-item-action active">Basic Libraries & Data</h1>

# Import Basic Libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Import Data & Overview

In [2]:
df_train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [3]:
df_train.head()

In [4]:
df_train.info()

In [5]:
df_train.describe().round(2)

conclusion:
* <span style="color:Red">There are missing values.</span>
* There are columns we may not need.

<h1 style="text-align: center;" class="list-group-item list-group-item-action active">Exploratory Data Analysis</h1>

# Exploratory Data Analysis

In [6]:
# Check that train and test expose exactly the same feature columns, in the same
# order, once the response variable ('SalePrice') is removed from the train side.
# Index.equals compares every label and copes with differing lengths; an
# element-wise `==` followed by `.any()` is True as soon as one name matches.
df_train.columns.drop('SalePrice').equals(df_test.columns)

In [7]:
# Remove the 'Id' column from the training frame.
df_train.drop(columns="Id", inplace=True)
# Pull 'Id' out of the test frame and keep its values (id_test_list) for the submission.
id_test_list = df_test.pop("Id").tolist()

In [8]:
# Partition the training column names into numeric and categorical by dtype.
numeric_dtypes = ('int64', 'float64')
numerical_cols = [name for name in df_train.columns if df_train[name].dtype in numeric_dtypes]
categorical_cols = [name for name in df_train.columns if name not in numerical_cols]

In [9]:
# Keep numerical and categorical data in independent dataframes for train and test.

numerical_df_train = df_train.loc[:, numerical_cols]
categorical_df_train = df_train.loc[:, categorical_cols]

# The last entry of numerical_cols is left out when slicing the test frame.
numerical_df_test = df_test.loc[:, numerical_cols[:-1]]
categorical_df_test = df_test.loc[:, categorical_cols]

In [10]:
numerical_df_train.columns

In [11]:
categorical_df_test.columns

# EDA with numerical data

## 1. Investigate the distributions.

In [12]:
numerical_df_train.hist(figsize=(15,20), bins=30, color='blue', edgecolor='black');

In [13]:
# Some columns concentrate most of their values around a single value.
# Fit a variance filter on the numerical training frame to flag such low-variance columns.

from sklearn.feature_selection import VarianceThreshold

# NOTE(review): per the scikit-learn docs, `threshold` is compared against each column's
# variance, not against the share of identical values - so 0.15 is a variance cutoff on
# the raw (unscaled) columns. Confirm this is the intended criterion.
thresholder = VarianceThreshold(threshold=0.15)
# NOTE(review): this name receives whatever `fit` returns; presumably that is the fitted
# thresholder itself rather than a filtered frame - confirm before using it as data.
data_high_variance = thresholder.fit(numerical_df_train)

In [14]:
# Collect the numerical columns that the fitted thresholder does not support (keep).

high_variance_list = numerical_df_train.columns[~thresholder.get_support()].to_list()

high_variance_list

In [15]:
# Remove the columns collected in high_variance_list from both working frames.
for frame in (df_train, df_test):
    frame.drop(columns=high_variance_list, inplace=True)

## 2. Investigate the correlations.

In [16]:
# Correlation heatmap of the numerical training frame.
plt.figure(figsize=(20, 15))

corr_matrix = numerical_df_train.corr()
# Hide the upper triangle (diagonal included).
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
# Blank out every coefficient whose magnitude is below 0.3.
weak_cells = corr_matrix.abs() < 0.3
corr_matrix[weak_cells] = 0
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='Blues', linewidths=0.5, vmin=0, vmax=1)

In [17]:
# Variables whose correlation with 'SalePrice' lies strictly between -0.25 and 0.25.
price_corr = numerical_df_train.corr()['SalePrice']
condition = price_corr < 0.25
condition2 = price_corr > -0.25
low_corr_cols = price_corr[condition & condition2].index.to_list()
low_corr_cols

In [18]:
# Every numerical column that is not listed in low_corr_cols.
high_corr_cols = [name for name in numerical_df_train.columns if name not in low_corr_cols]
high_corr_cols

In [19]:
# Drop the variables that have a low correlation with 'SalePrice' from both frames.
# Membership is tested on the column *names* (not on integer positions), and only the
# names still present in each frame are dropped, since earlier cells may already have
# removed some of them.
for frame in (df_train, df_test):
    present_low_corr = [name for name in low_corr_cols if name in frame.columns]
    frame.drop(columns=present_low_corr, inplace=True)

In [20]:
# Joint regression plot of every feature in high_corr_cols against SalePrice.
target_values = numerical_df_train['SalePrice']
for feature in high_corr_cols:
    sns.jointplot(x=numerical_df_train[feature], y=target_values, kind='reg', color='blue')

conclusion:
* <span style="color:Red">There are outliers that must be dealt with, as they may lead to misleading conclusions (this is visible in the scatter plots).</span>
* Some features have a strong correlation with the response variable (SalePrice), while others have a weak correlation.

# EDA with categorical data

In [21]:
# Create the categorical dataframe and add the 'SalePrice' column.
# The column selection is built locally instead of appending to categorical_cols, so that
# list keeps holding only categorical names and re-running this cell cannot select
# 'SalePrice' twice.
categorical_with_target = [name for name in categorical_cols if name != 'SalePrice'] + ['SalePrice']
categorical_df_train = df_train[categorical_with_target]
categorical_df_train.columns

## 1. Investigate the distributions.

In [22]:
# One count plot per column, filling the 15x3 grid row by row.
fig, axes = plt.subplots(15, 3, figsize=(18, 50))
for position, col in enumerate(categorical_df_train.columns):
    row, column = divmod(position, 3)
    sns.countplot(x=categorical_df_train[col], data=categorical_df_train, ax=axes[row, column])

In [23]:
# Columns where the most frequent value accounts for more than 90% of the non-null entries.

high_dominated_features = [
    name
    for name in categorical_df_train.columns
    if (categorical_df_train[name].value_counts().max() / categorical_df_train[name].count()) > 0.9
]

high_dominated_features

In [24]:
# Remove the columns listed in high_dominated_features from both working frames.
for frame in (df_train, df_test):
    frame.drop(columns=high_dominated_features, inplace=True)

## 2. Describe 'SalePrice' with each categorical feature.

In [25]:
# One SalePrice box plot per column, filling the 15x3 grid row by row.
fig, axes = plt.subplots(15, 3, figsize=(18, 50))
for position, col in enumerate(categorical_df_train.columns):
    row, column = divmod(position, 3)
    sns.boxplot(x=col, y="SalePrice", data=categorical_df_train, ax=axes[row, column])

In [26]:
df_train.columns

In [27]:
df_test.columns

<h1 style="text-align: center;" class="list-group-item list-group-item-action active">Data Preprocessing</h1>

# Data Cleaning

## 1. deleting duplicate values

In [28]:
# Report the number of fully duplicated rows in each exploratory frame.
# Each label is paired with the frame it describes so the message cannot drift
# from the data being counted.
duplicate_reports = (
    ('numerical_df_train', numerical_df_train),
    ('numerical_df_test', numerical_df_test),
    ('categorical_df_train', categorical_df_train),
    ('categorical_df_test', categorical_df_test),
)
for label, frame in duplicate_reports:
    print('number of duplicate values in ' + label + ' dataframe: ', frame.duplicated().sum())

In [29]:
# Remove exact duplicate rows from the training data only.
df_train.drop_duplicates(inplace=True)
# The test frame is deliberately left untouched: id_test_list was captured row-for-row
# from it, and the submission pairs those ids with one prediction per test row, so
# removing test rows would make the two lengths disagree.

In [30]:
# Confirm the changes: count the duplicated rows left in each working frame.
for label, frame in (('df_train', df_train), ('df_test', df_test)):
    print('number of duplicate values in ' + label + ' dataframe: ', frame.duplicated().sum())

## 2. Missing values

In [31]:
# Draw the null mask of each exploratory frame on a 2x2 grid:
# numerical frames on the top row, categorical frames on the bottom row,
# train on the left and test on the right.
fig, axes = plt.subplots(2, 2, figsize=(15,20))
sns.heatmap(numerical_df_train.isnull(), ax=axes[0,0])
sns.heatmap(numerical_df_test.isnull(), ax=axes[0,1])
sns.heatmap(categorical_df_train.isnull(), ax=axes[1,0])
sns.heatmap(categorical_df_test.isnull(), ax=axes[1,1])

In [32]:
# drop columns with missing more than 30%
def drop_missing(df, threshold=0.3):
    """Drop, in place, every column of ``df`` whose share of missing values exceeds ``threshold``.

    The share is computed against the number of rows of ``df`` itself, so the
    function gives the intended result for frames of any length.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is modified in place.
    threshold : float, default 0.3
        A column is dropped when its fraction of null values is strictly greater
        than this value.

    Returns
    -------
    None
        One line is printed per dropped column, or a single notice when no
        column is dropped.
    """
    missing_share = df.isnull().mean()
    # Decide on the full list first so the frame is not modified while being scanned.
    to_drop = missing_share[missing_share > threshold].index.tolist()
    df.drop(columns=to_drop, inplace=True)
    for col in to_drop:
        print('column',col,'is dropped')
    if not to_drop:
        print('no column dropped')

In [33]:
drop_missing(df_train)

In [34]:
drop_missing(df_test)

In [35]:
def fill_null(df):
    """Fill the missing values of ``df`` in place, column by column.

    Numeric columns are filled with the column mean; every other column is filled
    with its most frequent value. Both statistics are computed from ``df`` itself.
    The column kind is read from the column's own dtype, so the function does not
    depend on any list defined elsewhere in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is modified in place.

    Returns
    -------
    None
        One line is printed per column that was filled.
    """
    for col in df.columns:
        if not df[col].isnull().any():
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            # Assign the result back: filling through `df[col]` with inplace=True acts on
            # an intermediate object and is not guaranteed to reach the frame.
            df[col] = df[col].fillna(df[col].mean())
            print('fillna numerical column: ',col)
        else:
            modes = df[col].mode()
            if modes.empty:
                # Column holds no value at all, so there is no mode to fill with.
                continue
            df[col] = df[col].fillna(modes.iloc[0])
            print('fillna categorical column: ',col)

In [36]:
fill_null(df_train)

In [37]:
fill_null(df_test)

In [38]:
# Confirm the changes: draw the null mask of the working train frame (left)
# and the working test frame (right) side by side.
fig, axes = plt.subplots(1, 2, figsize=(15,15))
sns.heatmap(df_train.isnull(), ax=axes[0])
sns.heatmap(df_test.isnull(), ax=axes[1])

## 3. Detect and remove outliers

In [39]:
# One box plot per numerical column, filling the 13x3 grid row by row.
fig, axes = plt.subplots(13, 3, figsize=(18, 50))
for position, col in enumerate(numerical_df_train.columns):
    row, column = divmod(position, 3)
    sns.boxplot(x=numerical_df_train[col], data=numerical_df_train, palette="Set2", ax=axes[row, column])

In [40]:
# Remove the rows whose 'SalePrice' lies on or beyond the 1.5*IQR fences.
sale_price = df_train['SalePrice']
Q1 = sale_price.quantile(0.25, interpolation='midpoint')
Q3 = sale_price.quantile(0.75, interpolation='midpoint')
IQR = Q3 - Q1
# Boolean masks aligned on the frame's own index.
above_upper_bound = sale_price >= (Q3 + 1.5 * IQR)
below_lower_bound = sale_price <= (Q1 - 1.5 * IQR)
# Drop by index *label*: the labels are taken from the masked series itself, so the
# right rows are removed even when the index is not a gap-free 0..n-1 range.
outlier_labels = sale_price[above_upper_bound | below_lower_bound].index
df_train.drop(index=outlier_labels, inplace=True)

In [41]:
# Disabled draft: the helper below lives inside a string literal, so nothing is defined
# or executed here (the cell only evaluates to the string itself).
# It applies the same 1.5*IQR filter to every column named in `columns_list`.
# NOTE(review): np.where yields row positions while DataFrame.drop is given them as
# index labels - confirm the frame has a default 0..n-1 index before enabling this.
'''
def remove_outlires(df, columns_list):
    for col in columns_list:
        Q1 = np.percentile(df[col], 25, interpolation = 'midpoint')
        Q3 = np.percentile(df[col], 75, interpolation = 'midpoint')
        IQR = Q3 - Q1
        # Upper bound
        upper = np.where(df[col] >= (Q3+1.5*IQR))
        # lower bound
        lower = np.where(df[col] <= (Q1-1.5*IQR))
        # drop outlires
        df.drop(upper[0], errors='ignore', inplace = True)
        df.drop(lower[0], errors='ignore', inplace = True)
'''

In [42]:
# Box plots of the columns of df_train that are listed in numerical_cols,
# filling the 13x3 grid row by row.
fig, axes = plt.subplots(13, 3, figsize=(18, 50))
remaining_numeric = [name for name in df_train if name in numerical_cols]
for position, col in enumerate(remaining_numeric):
    row, column = divmod(position, 3)
    sns.boxplot(x=df_train[col], data=df_train, palette="Set2", ax=axes[row, column])

# Converting categorical values to numerical

In [43]:
# Re-partition the columns still present in df_train into numeric and categorical names.
numeric_dtypes = ('int64', 'float64')
numerical_cols_new = [name for name in df_train.columns if df_train[name].dtype in numeric_dtypes]
categorical_cols_new = [name for name in df_train.columns if name not in numerical_cols_new]

In [44]:
# One-hot encode the categorical columns of each frame (first level dropped);
# train and test are encoded independently of each other.
train_dummies, test_dummies = (
    pd.get_dummies(frame[categorical_cols_new], drop_first=True)
    for frame in (df_train, df_test)
)

In [45]:
# Remove the original (un-encoded) categorical columns from both frames.
for frame in (df_train, df_test):
    frame.drop(columns=categorical_cols_new, inplace=True)

In [46]:
# Attach each frame's dummy columns to it (index-aligned join).
df_train, df_test = df_train.join(train_dummies), df_test.join(test_dummies)

In [47]:
# Keep only the feature columns present in both frames.
# The shared list is computed once, before either frame is changed, and both frames are
# re-selected from it so their feature columns come out in one identical order
# ('SalePrice' is kept on the train side, as its last column).
shared_features = [
    col for col in df_train.columns
    if col != 'SalePrice' and col in df_test.columns
]
df_train = df_train[shared_features + ['SalePrice']]
df_test = df_test[shared_features]

In [48]:
df_train.info()

In [49]:
df_test.info()

<h1 style="text-align: center;" class="list-group-item list-group-item-action active">Preparing Data & Modeling</h1>

# Build Linear Regression Model

## Splitting the data into training and test sets 

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
# Separate the explanatory variables from the response variable.

# explanatory variables: every column except 'SalePrice'
X = df_train.drop(columns='SalePrice')
# response variable
y = df_train.loc[:, 'SalePrice']

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## build model and fit data

In [53]:
from sklearn.linear_model import LinearRegression

In [54]:
lmodel = LinearRegression()
lmodel.fit(X_train,y_train)

## model prediction

In [55]:
y_pred = lmodel.predict(X_test)
y_pred

In [56]:
y_test.values

## model evaluation

In [57]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [58]:
mean_absolute_error(y_test,y_pred)

In [59]:
# RMSE
np.sqrt(mean_squared_error(y_test,y_pred))

In [60]:
# R^2
r2_score(y_test,y_pred)

## predict test dataframe and submission

In [61]:
subm_preds = lmodel.predict(df_test)
subm_preds

In [62]:
# Pair every saved test id with its predicted sale price.
submission_columns = dict(Id=id_test_list, SalePrice=subm_preds)
submission = pd.DataFrame(data=submission_columns)

In [63]:
submission.to_csv('submission.csv', index=False)