# Life cycle in a data science project
### 1 - Data Analysis
### 2 - Feature Engineering
### 3 - Feature selection
### 4 - Model building
### 5 - Model deployment

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the training CSV from the house-prices-advanced-regression-techniques
# input folder into a DataFrame.
dataset = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
# Last expression of the cell: displays (number of rows, number of columns).
dataset.shape

## Data Analysis
### 1- Missing values
### 2- Numerical values
### 3- Distribution of the numerical values
### 4- Categorical values
### 5- Cardinality of the categorical values
### 6- Outliers
### 7- Relationship between independent and dependent variables

### 1 - Missing Values

In [None]:
# Collect every column that has at least one missing value.
# The previous threshold (count > 1) silently dropped columns with exactly
# one missing entry.
features_with_nan = [feature for feature in dataset.columns if dataset[feature].isna().any()]

# isnull().mean() is the *fraction* of missing rows; scale it by 100 so the
# printed number really is the percentage that the label announces.
for feature in features_with_nan:
    print(feature, np.round(dataset[feature].isnull().mean() * 100, 2), '% missing values')

In [None]:
# Relationship between missing values and SalePrice
for feature in features_with_nan:
    data = dataset.copy()
    # Replace the column by an indicator: 1 if the observation is missing, else 0.
    data[feature] = np.where(data[feature].isnull(), 1, 0)
    # Median sale price of the "present" (0) and "missing" (1) groups.
    data.groupby(feature)['SalePrice'].median().plot.bar()
    plt.ylabel('SalePrice')
    # Title with the column name itself (was the literal string 'feature').
    plt.title(feature)
    plt.show()

### 2 - Numerical values

In [None]:
# Numerical features: every column whose dtype is not the object dtype 'O'.
numerical_features = [
    column
    for column, column_dtype in dataset.dtypes.items()
    if column_dtype != 'O'
]
# Preview the first five rows of those columns.
dataset[numerical_features].head(5)

### 2.1 - Temporal Variables - DateTime

In [None]:
# Temporal features: numerical columns whose name contains 'Yr' or 'Year'.
year_features = [
    name
    for name in numerical_features
    if any(marker in name for marker in ('Yr', 'Year'))
]
# Preview the first five rows of the year columns.
dataset[year_features].head(5)

In [None]:
# Scatter SalePrice against the number of years between each year column
# and the year of sale (YrSold - column). YrSold itself is not plotted.
for year_column in year_features:
    data = dataset.copy()
    if year_column == 'YrSold':
        continue
    # Overwrite the column in the working copy with the elapsed years.
    data[year_column] = data['YrSold'] - data[year_column]
    plt.scatter(data[year_column], data['SalePrice'])
    plt.xlabel(year_column)
    plt.ylabel('SalePrice')
    plt.show()

##### Numerical variables are of 2 types - discrete and continuous

### 2.2 - Discrete variables

In [None]:
# Discrete features: numerical columns with fewer than 25 distinct values
# (a NaN counts as one value), leaving out the year columns and 'Id'.
discrete_features = [
    column
    for column in numerical_features
    if column not in year_features + ['Id']
    and dataset[column].nunique(dropna=False) < 25
]
# Preview the first five rows of the discrete columns.
dataset[discrete_features].head(5)

##### Relationship between discrete variables and SalePrice

In [None]:
# Bar chart of the median SalePrice for each value of every discrete feature.
for discrete_column in discrete_features:
    data = dataset.copy()
    median_price = data.groupby(discrete_column)['SalePrice'].median()
    median_price.plot.bar()
    plt.title(discrete_column)
    plt.xlabel(discrete_column)
    plt.ylabel('SalePrice')
    plt.show()

### 2.3 - Continuous variables

In [None]:
# Continuous features: the numerical columns that are neither discrete,
# nor year columns, nor the 'Id' column.
continuous_features = [
    column
    for column in numerical_features
    if column != 'Id'
    and column not in discrete_features
    and column not in year_features
]
# Preview the first five rows of the continuous columns.
dataset[continuous_features].head(5)

In [None]:
# Histogram (25 bins) of every continuous feature
for continuous_column in continuous_features:
    data = dataset.copy()
    column_values = data[continuous_column]
    column_values.hist(bins=25)
    plt.title(continuous_column)
    plt.xlabel(continuous_column)
    plt.ylabel("Count")
    plt.show()

In [None]:
# Log-log scatter of each continuous feature against SalePrice
for continuous_column in continuous_features:
    # SalePrice is the y axis of every plot, so it is not plotted against itself.
    if continuous_column == 'SalePrice':
        continue
    data = dataset.copy()
    # log(0) is -inf, so any column that contains a zero is skipped.
    if (data[continuous_column] == 0).any():
        continue
    data[continuous_column] = np.log(data[continuous_column])
    data['SalePrice'] = np.log(data['SalePrice'])
    plt.scatter(data[continuous_column], data['SalePrice'])
    plt.title(continuous_column)
    plt.xlabel(continuous_column)
    plt.ylabel('SalePrice')
    plt.show()

### 3 - Outliers

In [None]:
# Box plot of the log of every continuous feature; columns that contain a
# zero are skipped because log(0) is -inf.
for continuous_column in continuous_features:
    data = dataset.copy()
    contains_zero = (data[continuous_column] == 0).any()
    if contains_zero:
        continue
    data[continuous_column] = np.log(data[continuous_column])
    data.boxplot(column=continuous_column)
    plt.xlabel(continuous_column)
    plt.ylabel(continuous_column)
    plt.show()

### 4 - Categorical Variables

In [None]:
# Categorical features: columns stored with the object dtype 'O'.
# The dtype is read from `dataset` itself; the previous version looked it up
# in `data`, a leftover working copy from an earlier plotting loop, so the
# cell only ran if one of those loops had been executed first.
categorical_features = [feature for feature in dataset.columns if dataset[feature].dtypes == 'O']
# Preview the first five rows of the categorical columns.
dataset[categorical_features].head(5)

In [None]:
# Bar chart of the median SalePrice per category, for every categorical feature
for categorical_column in categorical_features:
    data = dataset.copy()
    median_price = data.groupby(categorical_column)['SalePrice'].median()
    median_price.plot.bar()
    plt.title(categorical_column)
    plt.xlabel(categorical_column)
    plt.ylabel('SalePrice')
    plt.show()

#### HeatMap - correlation between attributes

In [None]:
# Pairwise correlation of the numeric columns only. Calling corr() on the
# whole frame raises on the string columns with pandas >= 2.0, so the
# non-numeric columns are dropped explicitly first.
correlation = dataset.select_dtypes(include=[np.number]).corr()

f, ax = plt.subplots(figsize=(14,12))
plt.title('Correlation of numerical attributes', size=16)
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(correlation, ax=ax)
plt.show()