In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [2]:
# !python -m wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [3]:
df = pd.read_csv('housing.csv')

In [4]:
used_cols = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income',
'median_house_value',
'ocean_proximity']

In [5]:
df = df[used_cols]

In [6]:
df.isna().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [7]:
df.fillna(0, inplace=True)
df.isna().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [8]:
df.dtypes

latitude              float64
longitude             float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

In [9]:
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population_per_household'] = df['population'] / df['households']

## Question 1

What is the mode for ocean_proximity?

In [10]:
df['ocean_proximity'].mode()

0    <1H OCEAN
dtype: object

Answer: <1H OCEAN

## Question 2

In [11]:
numerical = [
 'latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
        'rooms_per_household',
       'bedrooms_per_room', 'population_per_household'   
]

In [12]:
for i in numerical:
    print(i)
    print(df[numerical].corrwith(df[i]).sort_values(ascending=False))

latitude
latitude                    1.000000
rooms_per_household         0.106389
housing_median_age          0.011173
population_per_household    0.002366
total_rooms                -0.036100
total_bedrooms             -0.065318
households                 -0.071035
median_income              -0.079809
bedrooms_per_room          -0.104112
population                 -0.108785
longitude                  -0.924664
dtype: float64
longitude
longitude                   1.000000
population                  0.099773
bedrooms_per_room           0.084836
total_bedrooms              0.068082
households                  0.055310
total_rooms                 0.044568
population_per_household    0.002476
median_income              -0.015176
rooms_per_household        -0.027540
housing_median_age         -0.108197
latitude                   -0.924664
dtype: float64
housing_median_age
housing_median_age          1.000000
bedrooms_per_room           0.125396
population_per_household    0.013191
latitud

The winner: total bedrooms vs. housholds 0.980255 <br>
total bedrooms vs. total rooms 0.930489 <br>
population vs. households 0.907452 <br>
population_per_household vs. total_rooms -0.024991 <br>

In [13]:
mean_value = df['median_house_value'].mean()
df['above_average'] = df['median_house_value'].apply(lambda x: 1 if x > mean_value else 0)

In [14]:
df.head(1)

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household,above_average
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556,1


## Data splitting

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [17]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [18]:
df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [19]:
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

In [20]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

## Question 3

In [21]:
from sklearn.metrics import mutual_info_score

In [22]:
score = mutual_info_score(df['above_average'], df['ocean_proximity'])
round(score, 2)

0.1

## Question 4

In [23]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [24]:
from sklearn.feature_extraction import DictVectorizer

In [25]:
train_dicts = df_train[['ocean_proximity'] + numerical ].to_dict(orient='records')

In [26]:
dv = DictVectorizer(sparse=False)

In [27]:
X_train = dv.fit_transform(train_dicts)

In [28]:
val_dicts = df_val[['ocean_proximity'] + numerical].to_dict(orient='records')

In [29]:
X_val = dv.transform(val_dicts)

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [32]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [33]:
y_pred = model.predict_proba(X_val)[:, 1]

In [34]:
above_decision = (y_pred >= 0.5)

In [35]:
round((y_val == above_decision).mean(),2)

0.84

In [36]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = above_decision.astype(int)
df_pred['actual'] = y_val

In [37]:
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [38]:
og_acc = df_pred.correct.mean()

## Question 5

In [39]:
numerical1 = ['latitude', 'longitude', 'housing_median_age',
       'total_bedrooms', 'population', 'households', 'median_income',
        'rooms_per_household',
       'bedrooms_per_room', 'population_per_household']
numerical2 = [ 'latitude', 'longitude', 'housing_median_age', 'total_rooms',
        'population', 'households', 'median_income',
        'rooms_per_household',
       'bedrooms_per_room', 'population_per_household'   ]
numerical3 = [ 'latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'households', 'median_income',
        'rooms_per_household',
       'bedrooms_per_room', 'population_per_household'   ]
numerical4 = [ 'latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'median_income',
        'rooms_per_household',
       'bedrooms_per_room', 'population_per_household'   ]

In [40]:
test_list = [numerical1, numerical2, numerical3, numerical4]
acc_list = []
for i in test_list:
    train_dicts = df_train[['ocean_proximity'] +  i].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    val_dicts = df_val[['ocean_proximity'] + i].to_dict(orient='records')
    X_val = dv.transform(val_dicts)
    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    above_decision = (y_pred >= 0.5)
    acc_list.append((y_val == above_decision).mean())

In [42]:
acc_list - og_acc

array([ 0.0004845 , -0.0004845 , -0.01041667, -0.00290698])

In [43]:
min(acc_list - og_acc)

-0.01041666666666674

This means population has the smallest difference between original accuracy and accuracy without the feature

## Question 6

In [45]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

df_train.reset_index(drop=True, inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [49]:
def rmse(y, y_pred):
    error = y-y_pred
    se = error ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [56]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

In [61]:
alpha_list = [0, 0.01, 0.1, 1, 10]
for a in alpha_list:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    
    train_dicts = df_train[['ocean_proximity'] + numerical ].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    val_dicts = df_val[['ocean_proximity'] + numerical].to_dict(orient='records')
    X_val = dv.transform(val_dicts)
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print(a, round(mean_absolute_error(y_val, y_pred), 3))

0 83133.522
0.01 83133.522
0.1 83133.522
1 83133.522
10 83133.525


The best and smallest RMSE alpha is 0 