In [1]:
import pandas as pd
import numpy as np

In [2]:
base = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
    "ocean_proximity",
]

In [74]:
df = pd.read_csv("data-hw.csv")

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [73]:
df = df[base]

df.isnull().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [5]:
df["total_bedrooms"] = df["total_bedrooms"].fillna(0)

In [6]:
df.isnull().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [7]:
df["rooms_per_household"] = df["total_rooms"] / df["households"]
df["bedrooms_per_room"] = df["total_bedrooms"] / df["total_rooms"]
df["population_per_household"] = df["population"] / df["households"]

#### Question 1

In [8]:
df["ocean_proximity"].value_counts() # <1H OCEAN

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

#### Question 2

In [9]:
numerical = list(df.columns[df.dtypes == "float"])
categorical = ["ocean_proximity"]

In [10]:
median_house_value_mean = df["median_house_value"].mean()
df["median_house_value"] = (df["median_house_value"] > median_house_value_mean).astype(int)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_test), len(df_val)

(12384, 4128, 4128)

In [13]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [14]:
y_full_train = df_full_train["median_house_value"].values
y_train = df_train["median_house_value"].values
y_val = df_val["median_house_value"].values
y_test = df_test["median_house_value"].values

In [15]:
df_full_train = df_full_train.reset_index(drop=True)

In [23]:
correlations = df_train[numerical].corrwith(df_train["median_house_value"])
correlations.sort_values(ascending=False)

median_house_value          1.000000
median_income               0.541110
rooms_per_household         0.135955
total_rooms                 0.123332
housing_median_age          0.082397
households                  0.066458
total_bedrooms              0.050593
population                 -0.022491
population_per_household   -0.027705
longitude                  -0.074043
latitude                   -0.097513
bedrooms_per_room          -0.224398
dtype: float64

In [24]:
del df_train["median_house_value"]
del df_val["median_house_value"]
del df_test["median_house_value"]

#### Question 3

In [25]:
from sklearn.metrics import mutual_info_score

In [99]:
score_original = mutual_info_score(df_train["ocean_proximity"], y_train)
np.round(score_original, 2)

0.1

#### Question 4

In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [101]:
dv = DictVectorizer(sparse=False)

In [102]:
train_dict = df_train.to_dict(orient="records")
X_train = dv.fit_transform(train_dict)

In [103]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [104]:
val_dict = df_val.to_dict(orient="records")
X_val = dv.transform(val_dict)

In [105]:
y_pred = model.predict_proba(X_val)[:, 1]

In [106]:
y_pred = (y_pred > 0.5)

In [107]:
(y_pred == y_val).mean().round(2)

0.84

#### Question 5

In [109]:
base = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "ocean_proximity"
]

In [118]:
predictions = pd.DataFrame()
for feature in base:
    base_without_feature = base.copy()
    base_without_feature.remove(feature)
    small_dict = df_train[base_without_feature].to_dict(orient="records")
    X_small = dv.transform(small_dict)

    y_pred = model.predict_proba(X_small)[:, 1]
    score = y_pred > 0.5
    score = (score == y_train).mean()
    print(feature, score_original - score)

latitude -0.5385547728547141
longitude -0.306319630735851
housing_median_age -0.6834191139399853
total_rooms -0.7228247986945073
total_bedrooms -0.6834191139399853
population -0.5714197599348174
households -0.62552182711828
median_income -0.4911549020536805
ocean_proximity -0.7046561940433446


#### Question 6

In [119]:
y_train_log = np.log1p(y_train)

In [120]:
from sklearn.linear_model import Ridge

In [121]:
def rmse(y_actual, y_pred):
    se = (y_actual - y_pred) ** 2
    mse = se.mean()
    return np.round(np.sqrt(mse), 3)

In [127]:
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train_log)

    val_dict = df_val.to_dict(orient="records")
    X_val = dv.fit_transform(val_dict)

    y_pred = np.expm1(model.predict(X_val))

    score = rmse(y_val, y_pred)

    print(score, a)

0.46 0
0.46 0.01
0.46 0.1
0.46 1
0.46 10
