# Project Name: House Prices: Advanced Regression Techniques

The main aim of this project is to predict the sale price of a house based on its various features.

## Dataset to be downloaded from the link below

https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

# Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is displayed (None removes the column limit).
# The public `pd.set_option` entry point is used instead of reaching the same
# function through the undocumented `pd.pandas` attribute.
pd.set_option("display.max_columns", None)

In [None]:
# Read the training data into `ds`, then display its (rows, columns) shape.
ds = pd.read_csv(filepath_or_buffer="house_price_train.csv")
ds.shape

In [None]:
# Preview the first rows of the training data
ds.head()

## Missing Values

In [None]:
# Columns that contain at least one missing value. `.any()` is used rather than a
# count threshold so that a column with exactly one NaN is not silently skipped.
features_with_nan = [feature for feature in ds.columns if ds[feature].isnull().any()]

# Report the share of missing observations per column as a real percentage
# (fraction of nulls * 100) so that the number matches the "%" label.
for feature in features_with_nan:
    print(feature, np.round(ds[feature].isnull().mean() * 100, 2), " % missing or null values")

In [None]:
# Compare the median sale price of rows where a feature is missing against rows
# where it is present: one bar chart per feature that has missing values.
for column in features_with_nan:
    # 1 if the observation is null, else 0. Built as a standalone indicator so the
    # whole DataFrame does not have to be copied on every iteration; it keeps the
    # column's name, which is what labels the x-axis.
    is_missing = ds[column].isnull().astype(int)

    # Median SalePrice of the "present" (0) and "missing" (1) groups
    ds["SalePrice"].groupby(is_missing).median().plot.bar()
    plt.title(column)
    plt.show()

Here the relationship between the missing values and the dependent variable (SalePrice) is clearly visible, so we need to replace these NaN values with something meaningful.

In the above dataset, some of the features, such as Id, are not required.

## Numerical values

In [None]:
# Every column whose dtype is not object ('O') is treated as a numerical feature
features_with_num = [name for name, dtype in ds.dtypes.items() if dtype != 'O']

print("Number of features with numerical values: ", len(features_with_num))
ds[features_with_num].head()

In [None]:
# Temporal (year) features: the numerical feature list above contains columns that
# hold year values. Pick them out by name ('Year' or 'Yr' in the column name) and
# list the distinct values of each one.
year_markers = ('Year', 'Yr')
temporal_features = [
    name for name in features_with_num
    if any(marker in name for marker in year_markers)
]
print(temporal_features)

for feature in temporal_features:
    distinct_years = ds[feature].unique()
    print(feature, distinct_years)

In [None]:
# Relationship of YrSold with SalePrice: median sale price for each year of sale
median_price_by_year = ds.groupby('YrSold')['SalePrice'].median()
ax = median_price_by_year.plot()
ax.set_xlabel('Year sold')
ax.set_ylabel('Sale Price')
ax.set_title("YrSold vs SalePrice")
plt.show()

In [None]:
# Let's check how the other year features relate to the sale price, expressed as
# the number of years between the feature's year and the year of sale.
for feature in temporal_features:
    if feature == 'YrSold':
        # YrSold is the reference year itself, so there is no difference to plot
        continue

    # Difference between year sold and the year stored in the feature. Computed as
    # a standalone Series so the DataFrame is not copied for every feature.
    years_before_sale = ds['YrSold'] - ds[feature]

    plt.scatter(years_before_sale, ds['SalePrice'])
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.show()

In [None]:
# Numerical variables are of 2 types - continuous and discrete.
# Discrete features: numerical columns with fewer than 25 distinct values,
# leaving out the year columns and the Id column.
not_discrete = set(temporal_features) | {'Id'}
discrete_features = [
    name for name in features_with_num
    if name not in not_discrete and len(ds[name].unique()) < 25
]
print('number of discrete features are: ', len(discrete_features))

In [None]:
# Preview the first rows of the discrete features
ds[discrete_features].head()

In [None]:
# Let's compare each discrete feature with the sale price:
# one bar chart of the median SalePrice per distinct feature value.
for feature in discrete_features:
    # Read-only aggregation, so `ds` is grouped directly instead of being copied
    # in full on every iteration.
    median_price = ds.groupby(feature)['SalePrice'].median()

    median_price.plot.bar()
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(f'{feature} vs SalePrice')
    plt.show()

In [None]:
# Continuous features: the numerical columns that are neither year columns,
# discrete features, nor the Id column.
not_continuous = set(temporal_features) | set(discrete_features) | {'Id'}
continous_features = [name for name in features_with_num if name not in not_continuous]
print("Number of continous features are: ", len(continous_features))

In [None]:
# Preview the first rows of the continuous features
ds[continous_features].head()

In [None]:
# Let's analyse the continuous features with histograms to understand their distribution
for feature in continous_features:
    # Plot straight from `ds`: nothing is modified here, so copying the whole
    # DataFrame for every feature is unnecessary.
    ds[feature].hist(bins=25)
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.title(feature)
    plt.show()

In [None]:
# We use a logarithmic transformation to deal with skewed continuous features:
# plot the histogram of log(feature) for every strictly positive feature.
for feature in continous_features:
    values = ds[feature]

    # The logarithm is undefined for zero and negative values, so skip such features
    if (values <= 0).any():
        continue

    # Only the plotted column is transformed. Transforming 'SalePrice' as well is
    # not needed for a histogram and would apply the log twice whenever the
    # feature being plotted is 'SalePrice' itself.
    np.log(values).hist(bins=25)
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.title(feature)
    plt.show()

In [None]:
# Scatter log(feature) against log(SalePrice) for every strictly positive
# continuous feature.
# log(SalePrice) is the same for every feature, so it is computed once from the
# untouched `ds`; this also keeps the log from being applied twice when the
# feature being plotted is 'SalePrice' itself.
log_sale_price = np.log(ds['SalePrice'])

for feature in continous_features:
    values = ds[feature]

    # The logarithm is undefined for zero and negative values, so skip such features
    if (values <= 0).any():
        continue

    plt.scatter(np.log(values), log_sale_price)
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(feature)
    plt.show()

## Outliers

In [None]:
# Let's check outliers in the continuous features with a box plot of log(feature)
for feature in continous_features:
    values = ds[feature]

    # The logarithm is undefined for zero and negative values, so skip such features
    if (values <= 0).any():
        continue

    # Build a one-column frame holding only the transformed feature instead of
    # copying the whole DataFrame on every iteration.
    np.log(values).to_frame().boxplot(column=feature)
    plt.ylabel(feature)
    plt.title(feature)
    plt.show()

## Categorical values

In [None]:
# Categorical features are the columns stored with the object ('O') dtype
categorical_features = [name for name, dtype in ds.dtypes.items() if dtype == 'O']
print(f'The number of columns in dataset with categorical values are {len(categorical_features)}')
ds[categorical_features].head()

In [None]:
# Let's see how many distinct categories each categorical feature has

for feature, n_categories in ds[categorical_features].nunique().items():
    print(f'{feature} has number of categories = {n_categories}')

In [None]:
# Let's check how the different categories of each feature affect the sale price:
# one bar chart of the median SalePrice per category.

for feature in categorical_features:
    # Read-only aggregation, so `ds` is grouped directly instead of being copied
    # in full on every iteration.
    median_price = ds.groupby(feature)['SalePrice'].median()

    median_price.plot.bar()
    plt.xlabel(feature)
    plt.ylabel('SalePrice')
    plt.title(f'{feature} vs SalePrice')
    plt.show()

In [None]:
# To see the count of each category in a feature

# The plot style does not depend on the feature, so it is set once up front
# rather than again on every iteration.
sns.set_style('whitegrid')

for feature in categorical_features:
    # countplot only reads the data, so `ds` is passed directly instead of a copy
    sns.countplot(x=feature, data=ds)
    plt.title(feature)
    plt.show()