In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the housing data from a path relative to the current working directory
housing = pd.read_csv('datasets/housing.csv')
# Preview 5 randomly chosen rows (no seed is passed, so the rows differ on every run)
housing.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
15973,-122.4,37.72,41.0,1975.0,440.0,1528.0,424.0,3.8625,218300.0,NEAR BAY
12623,-121.53,38.48,5.0,27870.0,5027.0,11935.0,4855.0,4.8811,212200.0,INLAND
9800,-121.92,36.56,40.0,2124.0,449.0,643.0,341.0,5.5164,369100.0,NEAR OCEAN
12407,-116.29,33.72,5.0,3584.0,760.0,1097.0,470.0,3.1771,167400.0,INLAND
16348,-121.35,38.03,16.0,3158.0,515.0,1596.0,528.0,4.1739,131300.0,INLAND


In [4]:
# Drop every row that has a missing value in any column, then check the remaining size
housing = housing.dropna()
housing.shape

(20433, 10)

In [5]:
# Rows whose median_house_value equals this number are excluded from the analysis.
# NOTE(review): presumably 500001 is the value at which the dataset top-codes
# prices, which would make those rows unreliable as targets; confirm with the source.
CAPPED_HOUSE_VALUE = 500001

# Filter with a boolean mask rather than looking up the matching index labels and
# handing them to drop(); .copy() keeps the result an independent frame so later
# column assignments never act on a slice of the original.
housing = housing.loc[housing['median_house_value'] != CAPPED_HOUSE_VALUE].copy()

In [6]:
# Confirm how many rows are left after removing the rows valued at 500001
housing.shape

(19475, 10)

In [7]:
# List the distinct categories of the text column so we know what needs encoding
housing['ocean_proximity'].unique()

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [8]:
# One-hot encode the categorical column: it is replaced by one indicator
# column per category, named 'ocean_proximity_<category>'.
housing = pd.get_dummies(
    housing,
    columns=['ocean_proximity'],
)

In [9]:
# The one categorical column was replaced by 5 indicator columns: 10 - 1 + 5 = 14
housing.shape

(19475, 14)

In [10]:
# Spot-check a few random rows to see the new indicator columns
housing.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
12115,-117.25,33.95,5.0,13096.0,2208.0,6780.0,2180.0,4.2775,138700.0,0,1,0,0,0
4164,-118.19,34.1,42.0,1577.0,379.0,1317.0,378.0,3.2121,153900.0,1,0,0,0,0
9816,-121.93,36.62,39.0,869.0,173.0,406.0,165.0,4.0313,253800.0,0,0,0,0,1
13877,-117.31,34.43,16.0,5130.0,1172.0,3126.0,1046.0,1.6784,71900.0,0,1,0,0,0
4994,-118.29,33.99,46.0,2608.0,636.0,1766.0,596.0,1.5846,114800.0,1,0,0,0,0


### Let's add a new column that says whether the house price is above the median price (True) or not (False)

In [15]:
# The median house value is the cut-off for the binary label
median = housing['median_house_value'].median()
print(median)
# True when a row's value is strictly greater than the median, False otherwise
housing['above_median'] = housing['median_house_value'] > median

173800.0


In [17]:
# Spot-check random rows to compare above_median against median_house_value
housing.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,above_median
11205,-117.91,33.83,37.0,1039.0,260.0,719.0,243.0,3.0288,161400.0,1,0,0,0,0,False
19173,-122.68,38.43,18.0,2723.0,529.0,1150.0,520.0,3.5885,191900.0,1,0,0,0,0,True
1969,-120.63,38.73,11.0,4577.0,836.0,1944.0,700.0,4.0675,140200.0,0,1,0,0,0,False
386,-122.29,37.9,52.0,1604.0,263.0,594.0,286.0,5.338,270900.0,0,0,0,1,0,True
11983,-117.48,34.01,23.0,2000.0,376.0,1361.0,388.0,4.369,121100.0,0,1,0,0,0,False


### Now separate the attributes (X) and target values (Y)

In [20]:
# Features: every column except the raw price and the label derived from it,
# so the model cannot read the answer straight from its inputs.
X = housing.drop(columns=['median_house_value', 'above_median'])
# Target: the boolean above-median label
Y = housing['above_median']

In [21]:
# Check that neither median_house_value nor above_median is left in the feature set
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

### Now split the data for training and testing

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The shuffle seed is fixed so the same
# rows land in each set on every run; without it, each execution produces a
# different split and therefore different training/testing scores.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [25]:
# (rows, columns) of the training and test feature matrices
(x_train.shape, x_test.shape)

((15580, 13), (3895, 13))

In [26]:
# Label vectors should have the same row counts as their feature matrices
(y_train.shape, y_test.shape)

((15580,), (3895,))

### Now perform logistic regression with the training dataset


In [30]:
from sklearn.linear_model import LogisticRegression

# Build the classifier first, then train it on the training split.
# fit() hands back the fitted estimator, so rebinding the name keeps this
# cell free of displayed output.
logistic_model = LogisticRegression(solver='liblinear')
logistic_model = logistic_model.fit(x_train, y_train)

In [31]:
# score() reports the fraction of samples whose predicted label matches the true label
training_accuracy = logistic_model.score(x_train, y_train)
print("Training Score : ", training_accuracy)

Training Score :  0.8202182284980745


### Now that the logistic regression model is built, let's see how it performs on the test data


In [32]:
# Predict the above-median label for every row of the held-out test set
y_pred = logistic_model.predict(x_test)

In [33]:
# Put the predictions next to the true labels, keyed by the test rows' original index
df_pred_actual = pd.DataFrame(
    {'Predicted': y_pred, 'Actual': y_test.values},
    index=y_test.index,
)
df_pred_actual.head(20)

Unnamed: 0,Predicted,Actual
16866,True,True
14927,True,True
9564,False,False
11891,False,False
6303,True,False
10933,True,True
9982,False,False
16420,False,False
14000,False,True
9574,False,False


### How well did the model perform? Get the accuracy score

In [34]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label equals the true label
test_accuracy = accuracy_score(y_test, y_pred)
print("Testing Score : ", test_accuracy)

Testing Score :  0.820795892169448
