In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import numpy as np
import scipy.stats as stats
import seaborn as sns

# Load the training set from the working directory and preview its first rows.
train = pd.read_csv(filepath_or_buffer='train.csv')
train.head(n=3)

In [None]:
# Report the dimensions of the train dataset: columns first, then rows
print("Number of columns: {}".format(train.shape[1]))
print("Number of rows: {}".format(train.shape[0]))

# Missing values
First, I want to identify which columns have missing values, based on the number of NaN values in each column. If a column has any missing values, I will drop that column.

In [None]:
# Count the NaN entries in every column, then list the columns from the
# most to the least incomplete.
missing_value = train.isnull().sum()
print(missing_value.sort_values(ascending=False))

As we can see, there are 19 columns with missing values. We will drop these columns.

In [None]:
# Keep only the columns that contain no NaN at all, then report what is left
train_drop = train.dropna(axis=1, how='any')
print("Number of columns: {}".format(train_drop.shape[1]))
print("Number of rows: {}".format(train_drop.shape[0]))
train_drop.head()

# Data cleanup
Among the 62 features, some correlate more strongly with the sale price and are therefore more important for downstream analyses. Here, we calculate the correlation of all variables with the sale price.

In [None]:
# Histogram of the raw sale prices. Title and axis labels are set so the
# figure can be read without the surrounding text.
plt.hist(train_drop.SalePrice, bins = 30)
plt.title('Distribution of SalePrice')
plt.xlabel('SalePrice')
plt.ylabel('Count')
plt.show()

The distribution looks roughly normal but is skewed towards lower-priced homes. We can apply a log transformation to make the distribution look more normal.

In [None]:
# Histogram of the natural log of the sale prices. Title and axis labels are
# set so the figure can be read without the surrounding text.
plt.hist(np.log(train_drop.SalePrice), bins = 30)
plt.title('Distribution of log(SalePrice)')
plt.xlabel('log(SalePrice)')
plt.ylabel('Count')
plt.show()

## Numerical variables

In [None]:
# Scatter of GrLivArea against the raw sale price, one dot per row.
plt.plot(train_drop.GrLivArea, train_drop.SalePrice,'.')
plt.title('SalePrice vs. GrLivArea')
plt.xlabel('GrLivArea')
plt.ylabel('SalePrice')
plt.show()

In [None]:
# Scatter of GrLivArea against the natural log of the sale price, one dot per row.
plt.plot(train_drop.GrLivArea, np.log(train_drop.SalePrice),'.')
plt.title('log(SalePrice) vs. GrLivArea')
plt.xlabel('GrLivArea')
plt.ylabel('log(SalePrice)')
plt.show()

# Data story

Ask the following questions and look for the answers using code and plots:

Can you count something interesting?
Can you find trends (e.g. high, low, increasing, decreasing, anomalies)?
Can you make a bar plot or a histogram?
Can you compare two related quantities?
Can you make a scatterplot?
Can you make a time-series plot?

Looking at the plots, what are some insights you can make? Do you see any correlations? Is there a hypothesis you’d like to investigate further? What other questions do the insights lead you to ask?

Now that you’ve asked questions, hopefully you’ve found some interesting insights. Is there a narrative or a way of presenting the insights using text and plots that tells a compelling story? What are some other trends/relationships you think will make the story more complete?

## Univariate analysis
Since the data sets have both numerical and categorical data, it makes sense to identify which is which.

In [None]:
def get_feature_groups(df=None):
    """Return the numerical and categorical feature names of a data frame.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame whose columns are classified. When omitted, the notebook-level
        ``train_drop`` frame is used, which keeps existing no-argument calls
        working unchanged.

    Returns
    -------
    tuple of (list, list)
        First the names of the int64/float64 columns, excluding ``Id`` and
        ``SalePrice``; then the names of the object-dtype columns. Both lists
        keep the frame's column order.
    """
    if df is None:
        df = train_drop

    # Numerical features: Id and SalePrice are removed from the list.
    # errors='ignore' lets the function also accept frames that lack one or
    # both of those columns.
    num_features = df.select_dtypes(include=['int64', 'float64']).columns
    num_features = num_features.drop(['Id', 'SalePrice'], errors='ignore')

    # Categorical features: every object-dtype column.
    cat_features = df.select_dtypes(include=['object']).columns
    return list(num_features), list(cat_features)

# Split the columns of train_drop into numerical and categorical name lists;
# the facet-grid cells below consume these two lists.
num_features, cat_features = get_feature_groups()

### Explore categorical features
When dealing with categorical variables, bar plots are usually chosen in place of histograms.

Exploring a categorical feature of interest, HouseStyle, I notice that 1-story is the most popular style in Ames, Iowa.

In [None]:
# One count plot per categorical feature, four facets per row.
# Melting to long form gives a 'variable' column (the feature name) to facet
# on and a 'value' column (the category) to count.
f = pd.melt(train_drop, value_vars=sorted(cat_features))
g = sns.FacetGrid(f, col='variable',col_wrap=4, sharex=False, sharey=False)
g = g.map(sns.countplot, 'value')

# Rotate the category labels on every facet. This has to happen after the
# plots are drawn; a rotation requested before mapping is overridden here.
for ax in g.axes.flat:
    plt.setp(ax.get_xticklabels(), rotation=45)

g.fig.tight_layout()
plt.show()

## Numerical features

In [None]:
# One distribution plot per numerical feature, four facets per row.
# The frame is melted to long form so the grid can facet on the feature name.
f = train_drop.melt(value_vars=sorted(num_features))
g = sns.FacetGrid(
    data=f,
    col='variable',
    col_wrap=4,
    sharex=False,
    sharey=False,
).map(sns.distplot, 'value')
plt.show()

In [None]:
# Report the range of construction years, then show how they are distributed.
print('The oldest home was built in:', train_drop.YearBuilt.min())
print('The newest home was built in:', train_drop.YearBuilt.max())

train_drop.YearBuilt.hist(bins=14, rwidth= 0.8)
plt.title('Year Built')
plt.xlabel('YearBuilt')
plt.ylabel('Count')
plt.show()

## Bivariate analysis

### Nominal features with Price
Since the goal is price prediction, I explore the correlation of price with some features. I choose YearBuilt and Neighborhood because these are logical choices.

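It is interesting to notice the pattern of the real estate cycle here. Right before each market crash or recession, house prices increased significantly. After each crash, prices continued to decline for at least two years and then stabilized until the run-up to the next crash. Note that the x-axis of the plot below is the year each house was built, not the year it was sold, so this reading is suggestive rather than conclusive.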

In [None]:
# Mean sale price per construction year; its sorted index fixes the
# left-to-right order of the categories in the point plot.
YearBuilt_meanSalePrice = \
    train_drop.groupby('YearBuilt')['SalePrice'].mean()

plt.figure(figsize=(20,5))
# pointplot draws the mean SalePrice for each YearBuilt value.
sns.pointplot(x = train_drop.YearBuilt.values, y = train_drop.SalePrice.values,
              order = YearBuilt_meanSalePrice.index)
plt.xticks(rotation=90)
# The year range in the title comes from the data rather than being hard-coded.
plt.title("Mean house price in Ames by year built, {} to {}".format(
    YearBuilt_meanSalePrice.index.min(), YearBuilt_meanSalePrice.index.max()))
# The x-axis is the construction year (YearBuilt), not the year of sale.
plt.xlabel("Year built")
plt.ylabel("Price (USD)")
plt.show()

Real estate investing is all about location. Among the neighborhoods within the Ames city limits, the one containing the most houses appears to be NAmes.

In [None]:
# Horizontal bar chart of the number of rows per neighborhood, smallest first.
(
    train_drop
    .groupby('Neighborhood')['Id']
    .count()
    .sort_values()
    .plot.barh()
)

plt.title('Neighborhood')
plt.show()

In [None]:
# Mean sale price per neighborhood, then the same series ordered from the
# cheapest to the most expensive neighborhood.
Neighborhood_meanSP = train_drop.groupby('Neighborhood').SalePrice.mean()
Neighborhood_meanSalePrice = Neighborhood_meanSP.sort_values(ascending=True)
print(Neighborhood_meanSalePrice)

In [None]:
# Point plot of sale price per neighborhood, with the neighborhoods ordered
# by ascending mean sale price.
sns.pointplot(
    x=train_drop['Neighborhood'].values,
    y=train_drop['SalePrice'].values,
    order=Neighborhood_meanSalePrice.index,
)
plt.xticks(rotation=45)
plt.show()

### Numerical features with Price