In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# URL of the raw housing CSV; the download cell below saves it locally as housing.csv.
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'

In [4]:
!wget $data -O housing.csv

--2022-09-25 13:41:34--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: 'housing.csv'

     0K .......... .......... .......... .......... ..........  3% 1.82M 1s
    50K .......... .......... .......... .......... ..........  7% 3.43M 1s
   100K .......... .......... .......... .......... .......... 10% 2.91M 0s
   150K .......... .......... .......... .......... .......... 14% 5.94M 0s
   200K .......... .......... .......... .......... .......... 17% 3.47M 0s
   250K .......... .......... .......... .......... .......... 21% 3.83M 0s
   300K .......... .......... .......... .......... .......... 25% 5.68M 0s
   350K .......... .......... ..

In [5]:
# Load the downloaded CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='housing.csv')

In [6]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


##### Data preparation

In [7]:
# Replace every missing value with 0.
df = df.fillna(value=0)

In [8]:
# Sanity check: count of missing values per column after the fill.
df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [9]:
# Derived ratio features built from the raw count columns.
df['rooms_per_household'] = df['total_rooms'].div(df['households'])
df['bedrooms_per_room'] = df['total_bedrooms'].div(df['total_rooms'])
df['population_per_household'] = df['population'].div(df['households'])

In [10]:
# Preview the frame again, now including the three ratio columns.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


##### Question 1

In [11]:
# Most frequent value of the categorical column.
df['ocean_proximity'].mode()

0    <1H OCEAN
Name: ocean_proximity, dtype: object

In [12]:
# Number of rows in each ocean_proximity category.
df.groupby('ocean_proximity')['ocean_proximity'].count()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
ISLAND           5
NEAR BAY      2290
NEAR OCEAN    2658
Name: ocean_proximity, dtype: int64

##### Splitting data

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% for test, then take 25% of the remainder as validation.
df_full_train, df_test = train_test_split(df, random_state=42, test_size=0.2)
df_train, df_val = train_test_split(df_full_train, random_state=42, test_size=0.25)

# Share of the full dataset that landed in each of train / val / test.
tuple(round(len(part) / len(df), 2) for part in (df_train, df_val, df_test))

(0.6, 0.2, 0.2)

In [15]:
# Pull the regression target out of each split as a NumPy array.
y_train, y_val, y_test = (
    part['median_house_value'].to_numpy()
    for part in (df_train, df_val, df_test)
)

In [16]:
# Remove the target column from every split so it cannot be used as a feature.
del (
    df_train['median_house_value'],
    df_val['median_house_value'],
    df_test['median_house_value'],
)

##### Question 2

In [18]:
# Pairwise correlations between the numeric training features.
# `ocean_proximity` holds strings, and pandas >= 2.0 raises on non-numeric
# columns in a bare `.corr()`, so the numeric columns are selected explicitly
# (this works on older pandas as well).
df_train.select_dtypes(include='number').corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
longitude,1.0,-0.925005,-0.099812,0.036449,0.06384,0.09167,0.049762,-0.016426,-0.034814,0.10232,0.011022
latitude,-0.925005,1.0,0.002477,-0.025914,-0.05973,-0.100272,-0.063529,-0.076805,0.119118,-0.124507,-0.002301
housing_median_age,-0.099812,0.002477,1.0,-0.363522,-0.324156,-0.292476,-0.306119,-0.119591,-0.181275,0.129456,0.012167
total_rooms,0.036449,-0.025914,-0.363522,1.0,0.931546,0.853219,0.921441,0.198951,0.168926,-0.194185,-0.029452
total_bedrooms,0.06384,-0.05973,-0.324156,0.931546,1.0,0.87734,0.979399,-0.009833,0.010381,0.078094,-0.034301
population,0.09167,-0.100272,-0.292476,0.853219,0.87734,1.0,0.906841,-0.000849,-0.07621,0.031592,0.064998
households,0.049762,-0.063529,-0.306119,0.921441,0.979399,0.906841,1.0,0.011925,-0.085832,0.058004,-0.032522
median_income,-0.016426,-0.076805,-0.119591,0.198951,-0.009833,-0.000849,0.011925,1.0,0.394154,-0.616617,-0.000454
rooms_per_household,-0.034814,0.119118,-0.181275,0.168926,0.010381,-0.07621,-0.085832,0.394154,1.0,-0.500589,0.001801
bedrooms_per_room,0.10232,-0.124507,0.129456,-0.194185,0.078094,0.031592,0.058004,-0.616617,-0.500589,1.0,-0.002851


In [17]:
# Correlation of each candidate feature pair on the training split, rounded to 3 decimals.
print('the correlation between total_bedrooms and households is: ',
      round(df_train['total_bedrooms'].corr(df_train['households']), 3))
print('the correlation between total_bedrooms and total_rooms is: ',
      round(df_train['total_bedrooms'].corr(df_train['total_rooms']), 3))
print('the correlation between population and households is: ',
      round(df_train['population'].corr(df_train['households']), 3))
print('the correlation between population_per_household and total_rooms is: ',
      round(df_train['population_per_household'].corr(df_train['total_rooms']), 3))

the correlation between total_bedrooms and households is:  0.979
the correlation between total_bedrooms and total_rooms is:  0.932
the correlation between population and households is:  0.907
the correlation between population_per_household and total_rooms is:  -0.029


##### Make median_house_value binary

In [19]:
# Threshold for the binary target: mean house value over the full frame `df`.
# NOTE(review): `df` includes the validation and test rows — confirm this is intended.
target_mean = df['median_house_value'].mean()
target_mean

206855.81690891474

In [20]:
# 1 where the training target exceeds the mean, 0 otherwise.
above_average = np.greater(y_train, target_mean).astype(int)
above_average

array([1, 1, 0, ..., 1, 0, 0])

In [21]:
from sklearn.metrics import mutual_info_score

In [22]:
# Mutual information between ocean_proximity and the binary target, to 2 decimals.
np.round(mutual_info_score(df_train['ocean_proximity'], above_average), 2)

0.1

##### Question 4

In [23]:
# Binary targets for classification: 1 when the house value is above the mean.
y_bin_train = np.greater(y_train, target_mean).astype(int)
y_bin_val = np.greater(y_val, target_mean).astype(int)

In [24]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [25]:
# One dict per row, the input format DictVectorizer expects.
train_dicts, val_dicts = (
    part.to_dict(orient='records') for part in (df_train, df_val)
)

In [26]:
# Vectorizer that turns the row dicts into a dense (non-sparse) feature matrix.
dv = DictVectorizer(sparse=False)

In [27]:
# Learn the feature mapping from the training rows only, then encode both splits with it.
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)
X_val = dv.transform(val_dicts)

In [28]:
# Baseline classifier trained on all features.
model = LogisticRegression(
    C=1.0,
    solver="liblinear",
    random_state=42,
    max_iter=1000,
)
model.fit(X_train, y_bin_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [29]:
# Keep column 1 of the predicted probabilities for the validation rows
# (presumably the probability of label 1, "above average" — labels are 0/1 ints).
p_pred = model.predict_proba(X_val)[:, 1]

In [30]:
# Hard decision at the 0.5 probability threshold.
y_pred = np.greater_equal(p_pred, 0.5)

In [31]:
# Validation accuracy: fraction of predictions that match the binary target.
acc = np.mean(y_pred == y_bin_val)
np.round(acc, 2)

0.84

##### Question 5

In [32]:
cols_to_drop = ['total_rooms', 'total_bedrooms', 'population', 'households']


def accuracy_without(column):
    """Retrain the classifier with `column` removed and return its validation accuracy.

    A fresh DictVectorizer and LogisticRegression are created on every call.
    The original loop refitted the notebook-level `dv` and `model`, which left
    them fitted on a reduced feature set that no longer matched `X_train` /
    `X_val`; building local estimators keeps the baseline objects intact.

    Parameters
    ----------
    column : str
        Name of the feature column to leave out of both df_train and df_val.

    Returns
    -------
    numpy.float64
        Fraction of validation rows whose predicted class equals y_bin_val.
    """
    vectorizer = DictVectorizer(sparse=False)
    classifier = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

    train_records = df_train.drop(columns=[column]).to_dict(orient='records')
    val_records = df_val.drop(columns=[column]).to_dict(orient='records')

    # Fit the encoding on the training rows only; reuse it for validation.
    classifier.fit(vectorizer.fit_transform(train_records), y_bin_train)
    predictions = classifier.predict(vectorizer.transform(val_records))

    return (predictions == y_bin_val).mean()


for c in cols_to_drop:

    score = accuracy_without(c).round(7)
    # Absolute change relative to the all-features baseline accuracy `acc`.
    score_diff = abs(acc - score).round(7)

    print('dropped {}, score was {}, now it is {}, changed by {}'.format(c, round(acc, 5), score, score_diff))


dropped total_rooms, score was 0.836, now it is 0.8362403, changed by 0.0002422
dropped total_bedrooms, score was 0.836, now it is 0.8372093, changed by 0.0012112
dropped population, score was 0.836, now it is 0.8263081, changed by 0.00969
dropped households, score was 0.836, now it is 0.8340601, changed by 0.001938


##### Question 6

In [33]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [34]:
# Regression target on the log(1 + x) scale.
y_train_log, y_val_log = np.log1p(y_train), np.log1p(y_val)

In [35]:
# Regularisation strengths to compare.
alpha = [0, 0.01, 0.1, 1, 10]

for a in alpha:

    model = Ridge(alpha=a, solver="sag", random_state=42)

    model.fit(X_train, y_train_log)
    y_predict = model.predict(X_val)

    # RMSE computed as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas this form works on every version.
    rmse = round(float(np.sqrt(mean_squared_error(y_val_log, y_predict))), 3)

    # NOTE(review): the recorded output shows the same RMSE for every alpha;
    # possibly `sag` is not converging on the unscaled features — confirm.
    print('alpha = {}, RMSE = {}'.format(a, rmse))

alpha = 0, RMSE = 0.524
alpha = 0.01, RMSE = 0.524
alpha = 0.1, RMSE = 0.524
alpha = 1, RMSE = 0.524
alpha = 10, RMSE = 0.524
