In [1]:
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
# Load the housing dataset; the path is relative to the notebook's working directory.
housing_data = pd.read_csv('datasets/housing.csv')

# Preview five random rows. random_state pins the draw so the displayed rows are
# the same after every Restart & Run All.
housing_data.sample(5, random_state=42)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
19376,-120.79,37.76,14.0,3531.0,508.0,1505.0,497.0,5.5228,275300.0,INLAND
12993,-121.25,38.69,17.0,3050.0,481.0,1490.0,489.0,4.5562,134500.0,INLAND
1884,-119.99,38.93,23.0,1882.0,414.0,673.0,277.0,2.9091,141900.0,INLAND
18519,-122.04,36.97,30.0,2695.0,424.0,1098.0,420.0,5.3972,362300.0,NEAR OCEAN
2036,-119.69,36.75,6.0,1926.0,303.0,965.0,316.0,4.7463,93100.0,INLAND


In [3]:
# Discard every row that has a missing value in any column.
housing_data = housing_data.dropna(axis=0, how='any')

In [4]:
# (rows, columns) remaining after the rows with missing values were dropped.
housing_data.shape

(20433, 10)

In [5]:
# Per-column count of the rows whose median_house_value is exactly 500001.
# NOTE(review): 500001 is presumably a capped/censored value - confirm against the dataset description.
housing_data[housing_data['median_house_value'].eq(500001)].count()

longitude             958
latitude              958
housing_median_age    958
total_rooms           958
total_bedrooms        958
population            958
households            958
median_income         958
median_house_value    958
ocean_proximity       958
dtype: int64

In [6]:
# Keep only the rows whose median_house_value differs from 500001
# (same rows as dropping the matching index labels, since the index is unique).
housing_data = housing_data.loc[housing_data['median_house_value'].ne(500001)]

In [7]:
# (rows, columns) after removing the rows whose median_house_value equals 500001.
housing_data.shape

(19475, 10)

In [8]:
# Show the first five rows of the filtered frame.
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [9]:
# List the distinct labels in the ocean_proximity column (the one non-numeric column shown above).
housing_data['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [10]:
# One-hot encode ocean_proximity: the column is replaced by one indicator
# column per category, named ocean_proximity_<label>.
housing_data = pd.get_dummies(data=housing_data, columns=['ocean_proximity'])

In [11]:
# (rows, columns) after one-hot encoding ocean_proximity.
housing_data.shape

(19475, 14)

In [12]:
# Preview five random rows of the encoded frame. random_state pins the draw so
# the displayed rows are the same after every Restart & Run All.
housing_data.sample(5, random_state=42)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
886,-121.98,37.54,17.0,5133.0,1375.0,3386.0,1339.0,3.1326,220800.0,1,0,0,0,0
20316,-119.16,34.17,17.0,7982.0,1603.0,6437.0,1596.0,4.1279,223900.0,0,0,0,0,1
584,-122.08,37.71,35.0,2211.0,350.0,1004.0,365.0,5.4639,238600.0,0,0,0,1,0
17285,-119.73,34.41,29.0,1769.0,297.0,703.0,269.0,4.4375,350000.0,0,0,0,0,1
19228,-122.66,38.48,16.0,2697.0,490.0,1462.0,515.0,4.2051,190300.0,1,0,0,0,0


In [13]:
# Median of the target column; used below as the threshold for the binary label.
median = housing_data.loc[:, 'median_house_value'].median()

median

173800.0

In [14]:
#add new column in dataset
housing_data['above_median'] = (housing_data['median_house_value'] - median) > 0

In [15]:
# Preview five random rows including the new above_median label. random_state
# pins the draw so the displayed rows are the same after every Restart & Run All.
housing_data.sample(5, random_state=42)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
9831,-121.93,36.63,28.0,3983.0,852.0,1582.0,778.0,3.5147,313900.0,0,0,0,0,1,True
19243,-122.79,38.5,18.0,4839.0,918.0,2755.0,841.0,3.75,248300.0,1,0,0,0,0,True
12424,-116.11,33.64,20.0,1273.0,354.0,1548.0,355.0,2.0871,84700.0,0,1,0,0,0,False
16517,-121.21,37.8,31.0,699.0,186.0,460.0,170.0,2.7443,94200.0,0,1,0,0,0,False
19657,-120.84,37.48,10.0,2874.0,612.0,1960.0,596.0,2.7381,104600.0,0,1,0,0,0,False


In [16]:
#Setting up features/labels of classification
X = housing_data.drop(['median_house_value', 'above_median'], axis = 1)
Y = housing_data['above_median']

In [17]:
# List the feature columns to confirm median_house_value and above_median were removed.
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and every score computed from it, is identical after Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [19]:
# Shapes of the training and test feature matrices produced by the split.
x_train.shape, x_test.shape

((15580, 13), (3895, 13))

In [20]:
# Shapes of the training and test label vectors; row counts should match the feature matrices.
y_train.shape, y_test.shape

((15580,), (3895,))

In [21]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
# The liblinear solver uses a random number generator internally, so random_state
# is fixed to make the fitted coefficients reproducible between runs.
logistic_model = LogisticRegression(solver='liblinear', random_state=42).fit(x_train, y_train)

In [22]:
# Mean accuracy of the fitted model on the data it was trained on.
print("Training_score: ", logistic_model.score(X=x_train, y=y_train))

Training_score:  0.8211168164313222


In [24]:
# Predicted above_median labels for the held-out test rows.
y_pred = logistic_model.predict(X=x_test)

In [25]:
# Put each test-set prediction next to its true label for a quick visual check.
df_pred_actual = pd.DataFrame(data={'predicted': y_pred, 'actual': y_test})
df_pred_actual.head(n=10)

Unnamed: 0,predicted,actual
18800,False,False
9103,False,False
2352,False,False
11339,False,True
1643,False,False
15536,False,False
10895,False,False
14974,False,False
18426,True,True
2665,False,False


In [26]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label.
print("Testing_score: ", accuracy_score(y_true=y_test, y_pred=y_pred))

Testing_score:  0.8223363286264441
