In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [3]:
# Load the raw table; the path is resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer='data.csv')

# EDA

In [None]:
# Preview only the first rows instead of rendering the whole frame.
data.head()

In [None]:
# Summary statistics for the loaded dataset.
data.describe()

### ocean_proximity count

In [None]:
# Take the labels from the value_counts index so every bar is paired with its
# own category. A hand-typed label list is matched purely by position and
# silently mislabels bars whenever the frequency order differs from it.
proximity_counts = data['ocean_proximity'].value_counts()

plt.figure(figsize=(8,5))
sns.barplot(x=proximity_counts.index, y=proximity_counts.values)
plt.xlabel('ocean_proximity')
plt.ylabel('count');

In [None]:
# Labels come from the value_counts index so each wedge is named after the
# category it actually counts, instead of relying on a hand-typed list being
# in the same order as the frequencies.
proximity_counts = data['ocean_proximity'].value_counts()

plt.figure(figsize=(10,7))
plt.pie(x=proximity_counts, labels=proximity_counts.index, autopct='%1.1f%%');

### Where are the most populated areas?
#### population density recognition

In [None]:
# Hexagonal binning of the rows by longitude/latitude.
data.plot.hexbin(
    x='longitude',
    y='latitude',
    gridsize=40,
    figsize=(13, 8),
)

In [None]:
# Low alpha so that overlapping points build up into darker regions.
data.plot.scatter(
    x='longitude',
    y='latitude',
    c='blue',
    edgecolor='black',
    alpha=0.1,
    figsize=(10, 7),
)

In [None]:
# Marker size follows population (scaled down by 100); marker colour follows
# median_house_value on the 'jet' colormap.
data.plot.scatter(
    x='longitude',
    y='latitude',
    s=data['population'] / 100,
    c='median_house_value',
    cmap=plt.get_cmap('jet'),
    alpha=0.5,
    figsize=(13, 8),
)

### Correlations

In [None]:
# Correlate numeric columns only: at this point the frame still contains the
# text column ocean_proximity, and DataFrame.corr raises on non-numeric data
# in pandas >= 2.0 (older versions dropped such columns silently).
corr_matrix = data.select_dtypes(include='number').corr()

# Rank every numeric feature by its linear correlation with the target.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# Columns shown in the scatter-matrix: the target plus three candidate features.
attributes = [
    'median_house_value',
    'median_income',
    'total_rooms',
    'housing_median_age',
]

In [None]:
# Pairwise scatter plots for the selected columns.
pd.plotting.scatter_matrix(
    frame=data.loc[:, attributes],
    figsize=(12, 8),
)

In [None]:
# Very low alpha so dense regions stand out from sparse ones.
data.plot.scatter(
    x='median_income',
    y='median_house_value',
    c='blue',
    edgecolor='black',
    alpha=0.05,
)

### Distribution of features

In [None]:
# One 50-bin histogram per column, drawn on a single large figure.
data.hist(
    figsize=(20, 15),
    bins=50,
)

### Better intuition for outliers

In [None]:
# The four columns live on very different scales, so each one gets its own
# subplot in a 2x2 grid instead of sharing a single axis.
plt.figure(figsize=(14,10))

boxplot_columns = ['total_rooms', 'total_bedrooms', 'population', 'households']
for position, column in enumerate(boxplot_columns, start=1):
    plt.subplot(2, 2, position)
    data.boxplot(column=[column], grid=False)

### Preprocessing

In [None]:
# Preprocessing pipeline: cap outliers, derive ratio features, impute missing
# values, one-hot encode ocean_proximity, split, and scale the training set.
# NOTE: this cell rebinds `data` to the processed frame, so it must be run
# exactly once after the raw CSV has been loaded.

# --- Outlier handling ---
# Cap the long right tails at fixed upper bounds (one bound per column).
outlier_caps = {
    'population': 4700,
    'total_rooms': 8000,
    'total_bedrooms': 1700,
    'households': 2000,
}
for column, cap in outlier_caps.items():
    data[column] = data[column].clip(upper=cap)

# --- New features ---
data['rooms_per_household'] = data['total_rooms'] / data['households']
data['bedrooms_per_room'] = data['total_bedrooms'] / data['total_rooms']
data['population_per_household'] = data['population'] / data['households']

# --- Imputing ---
data_cat = data['ocean_proximity']
data_num = data.drop('ocean_proximity', axis=1)

# fit_transform returns a NEW array; it does not modify its input. The result
# must be kept, otherwise the missing values survive and the estimators
# trained further down fail on NaN input.
# NOTE(review): the imputer is fitted on all rows (including the future test
# split) and on the target column; consider fitting on the training split only.
imputer = SimpleImputer(strategy='median')
data_num = pd.DataFrame(
    imputer.fit_transform(data_num),
    columns=data_num.columns,
    index=data_num.index,
)

# --- One-hot encoding ---
encoder = LabelBinarizer()
encoded = encoder.fit_transform(data_cat)

# One indicator column per encoder output column, named op_0, op_1, ...
# Built on data_cat's index so the rows line up with data_num when joined.
op_columns = ['op_{}'.format(i) for i in range(encoded.shape[1])]
data_cat_encoded = pd.DataFrame(encoded, columns=op_columns, index=data_cat.index)

data = pd.concat([data_num, data_cat_encoded], axis=1)

# --- Splitting ---
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# --- Separating labels from features ---
y_train = train_set['median_house_value']  # labels
x_train = train_set.drop('median_house_value', axis=1)  # features

# --- Scaling ---
# The scaler is fitted on the training features only.
scaler = StandardScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns, index=x_train.index)

# Guard against missing values reaching the models.
assert not x_train.isna().any().any(), 'x_train still contains missing values'

### Check outliers again

In [None]:
# Same 2x2 box-plot grid as before preprocessing, drawn from the current `data`.
plt.figure(figsize=(14,10))

boxplot_columns = ['total_rooms', 'total_bedrooms', 'population', 'households']
for position, column in enumerate(boxplot_columns, start=1):
    plt.subplot(2, 2, position)
    data.boxplot(column=[column])

### Check distributions again

In [None]:
# Re-draw the 50-bin histograms from the current `data`.
data.hist(
    figsize=(20, 15),
    bins=50,
)

### New features & New correlations

In [None]:
# Recompute the correlations so the derived columns are included, then rank
# every column by its correlation with the target.
corr_matrix = data.corr()
target_correlations = corr_matrix['median_house_value']
target_correlations.sort_values(ascending=False)

### Imputer

In [None]:
# Column dtypes and non-null counts of the current `data` frame
# (presumably to check for remaining missing values; confirm intent).
data.info()

In [None]:
# Per-column fill values learned by the fitted imputer (created with strategy='median').
imputer.statistics_

In [None]:
# Column medians of data_num, for comparison with imputer.statistics_ above.
data_num.median().values

# Training

### Linear Regression

In [None]:
# Baseline model: ordinary linear regression on the scaled training features.
lin_reg = LinearRegression()
lin_reg.fit(X=x_train, y=y_train)

In [None]:
# First five training rows and their labels, for a quick sanity check.
some_data = x_train.head(5)
some_labels = y_train.head(5)

In [None]:
# Predictions for the sample rows, to compare against some_labels.
lin_reg.predict(X=some_data)

In [None]:
# Actual labels for the same sample rows.
some_labels

In [None]:
# Training-set error of the linear model. Despite the variable name, the
# square root makes this a *root* mean squared error.
lin_predictions = lin_reg.predict(x_train)
lin_mse = np.sqrt(mean_squared_error(y_train, lin_predictions))

### Decision Tree

In [None]:
# Fixed seed so the fitted tree, and every score derived from it, is
# reproducible across kernel restarts (same seed as the train/test split).
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(x_train, y_train)
tree_predictions = tree_reg.predict(x_train)

# Training-set error, with arguments in the documented (y_true, y_pred) order.
# Despite the variable name, the square root makes this an RMSE.
tree_mse = np.sqrt(mean_squared_error(y_train, tree_predictions))

#### Cross-Validation

#### The decision tree overfits the training data, so we evaluate it with 10-fold cross-validation: the training set is split into 10 distinct subsets (folds), and in each of 10 rounds the model is trained on 9 folds and evaluated on the remaining one.

In [None]:
# 10-fold cross-validation of the decision tree. The scorer returns negated
# MSE values, so flip the sign before taking the root to get per-fold RMSE.
scores = cross_val_score(
    tree_reg,
    x_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)
tree_rmse = np.sqrt(-scores)

In [None]:
# Per-fold RMSE of the decision tree from cross-validation.
tree_rmse

### RandomForest

In [None]:
# Fixed seed so the forest, and every score derived from it, is reproducible
# across kernel restarts (same seed as the train/test split).
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(x_train, y_train)
rf_predictions = forest_reg.predict(x_train)

# Training-set error, with arguments in the documented (y_true, y_pred) order.
# rf_mse stays a plain MSE; rf_rmse is added because lin_mse and tree_mse
# actually hold square-rooted values, so the root is what compares with them.
rf_mse = mean_squared_error(y_train, rf_predictions)
rf_rmse = np.sqrt(rf_mse)

In [None]:
# Cross-validate the random forest itself. `scores` still holds the decision
# tree's folds, so re-displaying it here would report the wrong model.
forest_scores = cross_val_score(
    forest_reg,
    x_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)

# The scorer returns negated MSE; flip the sign and take the root for RMSE.
forest_rmse = np.sqrt(-forest_scores)
forest_rmse