# Data Preparation

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv('/content/drive/MyDrive/ML Zoomcamp/Datasets/housing.csv')
df.sample(5).T

Unnamed: 0,2899,18963,847,17165,17435
longitude,-119.01,-122.04,-122.07,-122.22,-120.46
latitude,35.37,38.27,37.59,37.36,34.64
housing_median_age,35.0,16.0,15.0,34.0,37.0
total_rooms,120.0,8517.0,3475.0,1559.0,1697.0
total_bedrooms,35.0,1910.0,686.0,243.0,334.0
population,477.0,4508.0,2568.0,600.0,740.0
households,41.0,1837.0,653.0,242.0,272.0
median_income,1.9125,3.1853,4.6211,8.7382,2.3804
median_house_value,47500.0,129600.0,151400.0,500001.0,148000.0
ocean_proximity,INLAND,INLAND,NEAR BAY,NEAR OCEAN,NEAR OCEAN


In [5]:
cat_cols = list(df.dtypes[df.dtypes == 'object'].index)
for c in cat_cols:
  df[c] = df[c].str.lower().str.replace(' ','_')

In [6]:
df.head().T

Unnamed: 0,0,1,2,3,4
longitude,-122.23,-122.22,-122.24,-122.25,-122.25
latitude,37.88,37.86,37.85,37.85,37.85
housing_median_age,41.0,21.0,52.0,52.0,52.0
total_rooms,880.0,7099.0,1467.0,1274.0,1627.0
total_bedrooms,129.0,1106.0,190.0,235.0,280.0
population,322.0,2401.0,496.0,558.0,565.0
households,126.0,1138.0,177.0,219.0,259.0
median_income,8.3252,8.3014,7.2574,5.6431,3.8462
median_house_value,452600.0,358500.0,352100.0,341300.0,342200.0
ocean_proximity,near_bay,near_bay,near_bay,near_bay,near_bay


In [7]:
df.dtypes

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object

**Handling Missing Values**

In [8]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [9]:
#fiiling missing values using median

median = df['total_bedrooms'].median()
df['total_bedrooms'].fillna(median,inplace=True)

In [10]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

**Creating New Columns**

In [11]:
df['rooms_per_household'] = df['total_rooms']/df['households']
df['bedrooms_per_room'] = df['total_bedrooms']/df['total_rooms']
df['population_per_household'] = df['population']/df['households']

In [12]:
df.head().T

Unnamed: 0,0,1,2,3,4
longitude,-122.23,-122.22,-122.24,-122.25,-122.25
latitude,37.88,37.86,37.85,37.85,37.85
housing_median_age,41.0,21.0,52.0,52.0,52.0
total_rooms,880.0,7099.0,1467.0,1274.0,1627.0
total_bedrooms,129.0,1106.0,190.0,235.0,280.0
population,322.0,2401.0,496.0,558.0,565.0
households,126.0,1138.0,177.0,219.0,259.0
median_income,8.3252,8.3014,7.2574,5.6431,3.8462
median_house_value,452600.0,358500.0,352100.0,341300.0,342200.0
ocean_proximity,near_bay,near_bay,near_bay,near_bay,near_bay


In [13]:
df['ocean_proximity'].mode()

0    <1h_ocean
Name: ocean_proximity, dtype: object

**Correlation Matrix**

In [14]:
df_numeric=df.copy
df_numeric = df.drop(['ocean_proximity','median_house_value'],axis=1)

In [15]:
df_numeric.corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
longitude,1.0,-0.924664,-0.108197,0.044568,0.06912,0.099773,0.05531,-0.015176,-0.02754,0.081205,0.002476
latitude,-0.924664,1.0,0.011173,-0.0361,-0.066484,-0.108785,-0.071035,-0.079809,0.106389,-0.098619,0.002366
housing_median_age,-0.108197,0.011173,1.0,-0.361262,-0.319026,-0.296244,-0.302916,-0.119034,-0.153277,0.135622,0.013191
total_rooms,0.044568,-0.0361,-0.361262,1.0,0.927058,0.857126,0.918484,0.19805,0.133798,-0.187381,-0.024581
total_bedrooms,0.06912,-0.066484,-0.319026,0.927058,1.0,0.873535,0.974366,-0.007617,0.001765,0.071649,-0.028325
population,0.099773,-0.108785,-0.296244,0.857126,0.873535,1.0,0.907222,0.004834,-0.072213,0.010035,0.069863
households,0.05531,-0.071035,-0.302916,0.918484,0.974366,0.907222,1.0,0.013033,-0.080598,0.034498,-0.027309
median_income,-0.015176,-0.079809,-0.119034,0.19805,-0.007617,0.004834,0.013033,1.0,0.326895,-0.545298,0.018766
rooms_per_household,-0.02754,0.106389,-0.153277,0.133798,0.001765,-0.072213,-0.080598,0.326895,1.0,-0.370308,-0.004852
bedrooms_per_room,0.081205,-0.098619,0.135622,-0.187381,0.071649,0.010035,0.034498,-0.545298,-0.370308,1.0,0.002601


In [16]:
df_numeric.corr().unstack().sort_values(ascending=False).abs()[:15]

longitude                 longitude                   1.000000
latitude                  latitude                    1.000000
bedrooms_per_room         bedrooms_per_room           1.000000
rooms_per_household       rooms_per_household         1.000000
median_income             median_income               1.000000
households                households                  1.000000
total_bedrooms            total_bedrooms              1.000000
total_rooms               total_rooms                 1.000000
housing_median_age        housing_median_age          1.000000
population                population                  1.000000
population_per_household  population_per_household    1.000000
total_bedrooms            households                  0.974366
households                total_bedrooms              0.974366
total_bedrooms            total_rooms                 0.927058
total_rooms               total_bedrooms              0.927058
dtype: float64

**Make Media House Value Binary**

In [17]:
df_class = df.copy()

In [18]:
mean = df_class['median_house_value'].mean()
df_class['above_average'] = np.where(df_class['median_house_value']>=mean,1,0)

In [19]:
df_class.sample(5).T

Unnamed: 0,789,10116,12560,555,17904
longitude,-122.1,-117.93,-121.46,-122.24,-121.97
latitude,37.63,33.94,38.54,37.77,37.36
housing_median_age,29.0,28.0,48.0,43.0,34.0
total_rooms,2172.0,3664.0,1001.0,955.0,884.0
total_bedrooms,435.0,719.0,205.0,284.0,153.0
population,1377.0,1820.0,605.0,585.0,534.0
households,408.0,657.0,175.0,266.0,154.0
median_income,3.7895,4.225,1.8333,2.3882,6.0116
median_house_value,180900.0,224700.0,58200.0,162500.0,271200.0
ocean_proximity,near_bay,<1h_ocean,inland,near_bay,<1h_ocean


# Splitting the Dataset

In [20]:
df_class = df_class.drop('median_house_value',axis=1)

In [21]:
from sklearn.model_selection import train_test_split

df_full_train,df_test = train_test_split(df_class,test_size=0.2,random_state=42)
df_train,df_val = train_test_split(df_full_train,test_size=0.25,random_state=42)

In [22]:
print(df_full_train.shape)
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(16512, 13)
(12384, 13)
(4128, 13)
(4128, 13)


In [23]:
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [24]:
y_full_train=df_full_train.above_average.values
y_train=df_train.above_average.values
y_val=df_val.above_average.values
y_test=df_test.above_average.values

**Mutual Information**

In [25]:
from sklearn.metrics import mutual_info_score
cat_col = ['ocean_proximity']
def calculate_mi(series):
  return mutual_info_score(series,df_train.above_average)
mi = df_train[cat_col].apply(calculate_mi)
mi

ocean_proximity    0.101384
dtype: float64

In [26]:
df_train = df_train.drop('above_average',axis=1)
df_val = df_val.drop('above_average', axis=1)
df_test = df_test.drop('above_average', axis=1)

# Training the Logistic Regression Model

**One HOt Encoding**

In [27]:
from sklearn.feature_extraction import DictVectorizer
num_col = ["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income", "rooms_per_household",
       "bedrooms_per_room", "population_per_household"]

train_dicts = df_train[cat_col+num_col].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

**Logistic Regression**

In [28]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train,y_train)

In [29]:
val_dicts = df_val[cat_col+num_col].to_dict(orient='records')
X_val = dv.transform(val_dicts)

y_pred = model.predict(X_val)

**Calculating the Accuracy**

In [30]:
df_pred = pd.DataFrame()
df_pred['prediction'] = y_pred
df_pred['actual'] = y_val
df_pred['correct'] = df_pred.prediction == df_pred.actual
df_pred

Unnamed: 0,prediction,actual,correct
0,0,0,True
1,0,0,True
2,1,1,True
3,0,1,False
4,1,1,True
...,...,...,...
4123,0,0,True
4124,1,1,True
4125,1,1,True
4126,1,1,True


In [31]:
from sklearn.metrics import accuracy_score

accurracy = np.round(accuracy_score(y_val,y_pred),2)
accurracy

0.84

# Feature ELimination & Model Training

In [32]:
features = cat_col + num_col

In [34]:
original_score = accurracy

for c in features:
  subset = features.copy()
  subset.remove(c)

  train_dicts = df_train[subset].to_dict(orient='records')
  dv = DictVectorizer(sparse=False)
  dv.fit(train_dicts)
  X_train = dv.transform(train_dicts)

  model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
  model.fit(X_train,y_train)

  val_dicts = df_val[subset].to_dict(orient='records')

  X_val = dv.transform(val_dicts)
  y_pred = model.predict(X_val)
  score = accuracy_score(y_val, y_pred)
  print(c, original_score - score, score)




ocean_proximity 0.0204748062015504 0.8195251937984496
longitude 0.008846899224806193 0.8311531007751938
latitude 0.00618217054263559 0.8338178294573644
housing_median_age 0.008604651162790633 0.8313953488372093
total_rooms 0.0030329457364340895 0.8369670542635659
total_bedrooms 0.0042441860465115555 0.8357558139534884
population 0.013691860465116279 0.8263081395348837
households 0.006908914728682158 0.8330910852713178
median_income 0.053662790697674434 0.7863372093023255
rooms_per_household 0.004486434108527115 0.8355135658914729
bedrooms_per_room 0.005213178294573573 0.8347868217054264
population_per_household 0.0032751937984495383 0.8367248062015504
