# Exploratory Data Analysis

In [None]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt

# the more advanced python visualization library
import seaborn as sns

# Use seaborn's white grid theme for every chart drawn below
sns.set_style('whitegrid')

# Render floats with two fixed decimals instead of scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Load the data and parse the `date` column into datetime values
df = pd.read_csv('data/house_pricing.csv')
df['date'] = pd.to_datetime(df['date'])

# Show the size and a short preview instead of echoing the whole frame,
# which bloats the saved notebook output.
print(df.shape)
df.head()

## Data Perspective
- One variable
    - Numeric variables
        - continuous: average income
        - discrete: population
    - Categorical variables
        - ordinal: grade
        - nominal: property type (house, apartment, townhouse, etc.)
- Multiple variables
    - Numeric x Numeric
    - Categorical x Numeric
    - Categorical x Categorical

## One Variable (numeric)

In [None]:
# Analyze the price distribution
price = df['price']

# Summary statistics, rounded to whole numbers
print(price.describe().round())

# Histogram with 100 bins
fig, ax = plt.subplots()
ax.hist(price, bins=100)
plt.show()

# Box plot (single box at x position 1, labelled 'Price')
fig, ax = plt.subplots()
ax.boxplot(price)
ax.set_xticks([1])
ax.set_xticklabels(['Price'], rotation='horizontal')
plt.show()

In [None]:
#TODO: Analyze all the numerical variables

## One Variable (categorical)

In [None]:
# Analyze the bedrooms distribution
# (value_counts is computed once and reused for both the printout and the chart)
bedroom_counts = df['bedrooms'].value_counts()
print(bedroom_counts)

# Bar Chart
# value_counts() orders by frequency; sort by the number of bedrooms so the
# x-axis reads in ascending order instead of showing shuffled numeric labels.
bedroom_counts = bedroom_counts.sort_index()
positions = range(len(bedroom_counts))

plt.figure(figsize=(12,6))
plt.title('# houses by bedroom')
plt.xlabel('bedrooms')
plt.ylabel('# houses')

plt.bar(positions, bedroom_counts.values, align='center', alpha=0.5)
plt.xticks(positions, bedroom_counts.index)
plt.show()

In [None]:
#TODO: Analyze all the categorical variables

## Multiple variables (Numeric x Numeric)

In [None]:
# Correlation matrix of the numeric columns.
# `date` is a datetime column (parsed when the data is loaded) and, since
# pandas 2.0, DataFrame.corr() no longer drops non-numeric columns silently,
# so restrict the frame to numeric dtypes before correlating.
df.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# Non-numeric columns (e.g. the datetime `date`) are excluded first because
# DataFrame.corr() does not drop them automatically on pandas >= 2.0.
corrmat = df.select_dtypes(include='number').corr()

# Draw on the axes created here (explicit ax=) rather than relying on
# matplotlib's implicit "current axes" state.
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)
plt.show()

In [None]:
# Visualization: relationship between Price and SQFT
# x-axis is the `price` column, y-axis is the `sqft_living` column.
fig, ax = plt.subplots(figsize=(10,10))
ax.plot(df['price'], df['sqft_living'], 'o', alpha=0.3)
ax.set_xlabel('Price')
ax.set_ylabel('SQFT')
plt.show()

In [None]:
# TODO: Create a scatterplot Price x Mean Income
# TODO: What is the correlation between Price x Mean Income


## Multiple variables (Categorical x Numeric)

In [None]:
# Box plots of price, one box per distinct bedrooms value (ascending order)
labels = sorted(df['bedrooms'].unique())
values = [df.loc[df['bedrooms'] == bdr, 'price'].tolist() for bdr in labels]

fig, ax = plt.subplots()
ax.boxplot(values)
# boxplot places the boxes at x = 1..N, so the tick positions start at 1
ax.set_xticks(range(1, len(labels) + 1))
ax.set_xticklabels(labels, rotation='horizontal')
plt.show()

## Multiple variables (Categorical x Categorical)

In [None]:
# Share of 2-, 3- and 4-bedroom houses per zipcode (100% stacked bar chart)

n_bedrooms = [2, 3, 4]
br_labels = ['2BR', '3BR', '4BR']
br_colors = ['g', 'orange', 'b']

# Count the houses for every (zipcode, bedrooms) pair.
# aggfunc='count' is essential: pivot_table defaults to the mean, which would
# average the `id` values instead of counting rows. Missing pairs become 0.
df_group = df.pivot_table(index='zipcode', columns='bedrooms', values='id',
                          aggfunc='count', fill_value=0)

# reindex (instead of plain column selection) so that a bedroom count absent
# from the data yields a zero column rather than a KeyError.
df_group = df_group.reindex(columns=n_bedrooms, fill_value=0)
df_group.columns = br_labels

# Zipcodes without any 2/3/4-bedroom house would divide by zero below.
df_group = df_group[df_group.sum(axis=1) > 0]
df_group = df_group.sort_values(br_labels)

# From raw counts to row-wise percentages
df_pct = df_group.div(df_group.sum(axis=1), axis=0) * 100

# plot
barWidth = 0.85
r = range(len(df_pct))
plt.figure(figsize=(12,6))

# Stack the segments: each one starts where the previous segments ended.
bottom = np.zeros(len(df_pct))
for label, color in zip(br_labels, br_colors):
    heights = df_pct[label].values
    plt.bar(r, heights, bottom=bottom, edgecolor='white', width=barWidth,
            alpha=0.7, color=color, label=label)
    bottom = bottom + heights

# Limit on y-axis
plt.ylim(0,100)

# Custom x axis: take the tick labels from the sorted frame's index so each
# label matches the bar drawn at that position.
plt.xticks(r, df_pct.index, rotation='vertical')
plt.ylabel("Percent")
plt.xlabel("zipcode")

plt.legend()

# Show graphic
plt.show()

## Business Perspective
"An approximate answer to the right question is worth a great deal more than a precise answer to the wrong question." — John Tukey

In [None]:
# What is the total number of sales per month?

In [None]:
# What is the total sales ($) per month?

In [None]:
# What is the total number of sales per day of week?

In [None]:
# What is the average price by zip code?

In [None]:
# Identify the zip codes with highest and lowest price per square foot

In [None]:
# Is the grade a good indicator of price?

In [None]:
# Is the walking score a good indicator of price?

In [None]:
# Apply linear regression to the price

from sklearn import linear_model

# Feature columns used to explain `price`
columns = ['bedrooms', 'sqft_living15', 'grade', 'condition']

# Create linear regression object and train it on the whole data set
model = linear_model.LinearRegression()
model.fit(df[columns], df['price'])

# Print the Coefficients
print('Coefficients', np.round(model.coef_,2))
print('Interception', round(model.intercept_,2))
print('')
# model.coef_ follows the order of `columns`, so pair them up directly
for col, coef in zip(columns, model.coef_):
    print(col, round(coef,1))

print('')
# Predict from a DataFrame that carries the same column names the model was
# fitted with: this keeps the value-to-feature mapping explicit and avoids the
# feature-name mismatch warning scikit-learn emits for a bare nested list.
sample = pd.DataFrame([[4, 3000, 10, 4]], columns=columns)
prediction = model.predict(sample)
print('Prediction', prediction.round(1)[0])

In [None]:
# Apply linear regression to the price considering the zipcode

from sklearn import linear_model

# Feature columns used to explain `price`
columns = ['bedrooms', 'sqft_living15', 'grade', 'condition']

# Restrict the training data to a single zipcode (filter evaluated once).
# NOTE(review): confirm that this zipcode actually occurs in the data set.
zipcode = 98000
df_zip = df[df['zipcode'] == zipcode]
if df_zip.empty:
    # Fail with an explicit message instead of an opaque error from fit()
    raise ValueError('No rows found for zipcode {}'.format(zipcode))

# Create linear regression object and train it on the zipcode subset
model = linear_model.LinearRegression()
model.fit(df_zip[columns], df_zip['price'])

# Print the Coefficients
print('Coefficients', np.round(model.coef_,2))
print('Interception', round(model.intercept_,2))
print('')
# model.coef_ follows the order of `columns`, so pair them up directly
for col, coef in zip(columns, model.coef_):
    print(col, round(coef,1))

print('')
# Predict from a DataFrame that carries the same column names the model was
# fitted with: this keeps the value-to-feature mapping explicit and avoids the
# feature-name mismatch warning scikit-learn emits for a bare nested list.
sample = pd.DataFrame([[4, 3000, 10, 4]], columns=columns)
prediction = model.predict(sample)
print('Prediction', prediction.round(1)[0])