## 3.2 Data preparation

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'

In [4]:
!wget $data -O data.csv 

--2022-09-26 12:13:52--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘data.csv’


2022-09-26 12:13:52 (19.4 MB/s) - ‘data.csv’ saved [1423529/1423529]



In [6]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [60]:
df = pd.read_csv('data.csv')
columns = ["median_house_value",'latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','ocean_proximity']
df = df[columns]
df.fillna(0, inplace=True)
df["rooms_per_household"] = df["total_rooms"]/df["households"]
df["bedrooms_per_room"] = df["total_bedrooms"]/df["total_rooms"]
df["population_per_household"] = df["population"]/df["households"]

In [13]:
df["ocean_proximity"].mode()

0    <1H OCEAN
dtype: object

In [14]:
df.corr()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household
latitude,1.0,-0.924664,0.011173,-0.0361,-0.065318,-0.108785,-0.071035,-0.079809,-0.14416,0.106389,-0.104112,0.002366
longitude,-0.924664,1.0,-0.108197,0.044568,0.068082,0.099773,0.05531,-0.015176,-0.045967,-0.02754,0.084836,0.002476
housing_median_age,0.011173,-0.108197,1.0,-0.361262,-0.317063,-0.296244,-0.302916,-0.119034,0.105623,-0.153277,0.125396,0.013191
total_rooms,-0.0361,0.044568,-0.361262,1.0,0.920196,0.857126,0.918484,0.19805,0.134153,0.133798,-0.174583,-0.024581
total_bedrooms,-0.065318,0.068082,-0.317063,0.920196,1.0,0.866266,0.966507,-0.007295,0.049148,0.002717,0.122205,-0.028019
population,-0.108785,0.099773,-0.296244,0.857126,0.866266,1.0,0.907222,0.004834,-0.02465,-0.072213,0.031397,0.069863
households,-0.071035,0.05531,-0.302916,0.918484,0.966507,0.907222,1.0,0.013033,0.065843,-0.080598,0.059818,-0.027309
median_income,-0.079809,-0.015176,-0.119034,0.19805,-0.007295,0.004834,0.013033,1.0,0.688075,0.326895,-0.573836,0.018766
median_house_value,-0.14416,-0.045967,0.105623,0.134153,0.049148,-0.02465,0.065843,0.688075,1.0,0.151948,-0.238759,-0.023737
rooms_per_household,0.106389,-0.02754,-0.153277,0.133798,0.002717,-0.072213,-0.080598,0.326895,0.151948,1.0,-0.387465,-0.004852


In [61]:
mean_price = df["median_house_value"].mean()
df["above_average"] = df['median_house_value'].apply(lambda x: 1 if x >= mean_price else 0)

In [29]:
del df['median_house_value']

## 3.3 Setting up the validation framework

* Perform the train/validation/test split with Scikit-Learn

In [16]:
from sklearn.model_selection import train_test_split

In [30]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values
del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [21]:
df_full_train.above_average.value_counts(normalize=True)

0    0.591025
1    0.408975
Name: above_average, dtype: float64

In [22]:
df_full_train.above_average.mean()

0.4089752906976744

## Mutual information

In [23]:
from sklearn.metrics import mutual_info_score

In [27]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.above_average)

In [28]:
mi = df_full_train[columns].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)



median_house_value    0.676483
median_income         0.502303
total_rooms           0.224974
longitude             0.172332
latitude              0.149478
population            0.136895
ocean_proximity       0.103093
total_bedrooms        0.071424
households            0.066152
housing_median_age    0.013203
dtype: float64

## 3.8 One-hot encoding

* Use Scikit-Learn to encode categorical features

In [33]:
from sklearn.feature_extraction import DictVectorizer

In [36]:
dv = DictVectorizer(sparse=False)
train_dict = df_train[columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val[columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

## 3.10 Training logistic regression with Scikit-Learn

* Train a model with Scikit-Learn
* Apply it to the validation dataset
* Calculate the accuracy

In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
price_decision = (y_pred >= 0.5)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [42]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = price_decision.astype(int)
df_pred['actual'] = y_val

In [43]:
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [44]:
df_pred.correct.mean()

0.8226744186046512

## 3.12 Feature elimination

In [49]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [53]:
from sklearn.metrics import accuracy_score

In [54]:
accuracy_dif = []
for x in ["total_rooms","total_bedrooms","population","households"]:
  columns = ['latitude','longitude','housing_median_age','total_rooms','total_bedrooms',
           'population','households','median_income','ocean_proximity',
           "rooms_per_household","bedrooms_per_room","population_per_household"]
  columns.remove(x)
  df_train_without = df_train.drop([x], axis = 1)
  df_val_without = df_val.drop([x], axis = 1)
  train_dict = df_train_without[columns].to_dict(orient='records')
  X_train = dv.fit_transform(train_dict)
  val_dict = df_val_without[columns].to_dict(orient='records')
  X_val = dv.transform(val_dict)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_val)
  score =accuracy_score(y_val,y_pred)
  accuracy_dif.append(score)

In [56]:
accuracy_dif - df_pred.correct.mean()

array([ 0.00024225,  0.00121124, -0.02228682, -0.00678295])

### Alpha optimization

In [62]:
df = pd.read_csv('data.csv')
columns = ["median_house_value",'latitude','longitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income','ocean_proximity']
df = df[columns]
df.fillna(0, inplace=True)
df["rooms_per_household"] = df["total_rooms"]/df["households"]
df["bedrooms_per_room"] = df["total_bedrooms"]/df["total_rooms"]
df["population_per_household"] = df["population"]/df["households"]

In [64]:
df["median_house_value_log"] = np.log1p(df.median_house_value)

In [65]:
del df['median_house_value']

In [66]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.median_house_value_log.values
y_val = df_val.median_house_value_log.values
y_test = df_test.median_house_value_log.values
del df_train['median_house_value_log']
del df_val['median_house_value_log']
del df_test['median_house_value_log']

In [71]:
columns = ['latitude','longitude','housing_median_age','total_rooms','total_bedrooms',
           'population','households','median_income','ocean_proximity',
           "rooms_per_household","bedrooms_per_room","population_per_household"]
dv = DictVectorizer(sparse=False)
train_dict = df_train[columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val[columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [69]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [70]:
for a in [0,0.01,0.1,1,10]:
  model = Ridge(alpha=a, solver="sag", random_state=42)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_val)
  rmse = mean_squared_error(y_val, y_pred)
  print(rmse)

0.2820715148474216
0.28207151486843685
0.28207151506809097
0.28207151707514794
0.28207153711416366
