# Dataset

In [1]:
# Generic lib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Data set
from sklearn.datasets import fetch_california_housing

In [2]:
data = fetch_california_housing()

In [3]:
print(data.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [4]:
# Independent data i.e not related to each other 
df = pd.DataFrame(data = data.data,columns = data.feature_names )
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [5]:
#dependent data
df['Target'] = data.target
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


# Exploratory Data Analysis (EDA)

In [6]:
#!pip install sweetviz

1. Sweetviz is an open-source Python library that generates beautiful, high-density visualizations to kickstart EDA (Exploratory Data Analysis) with just two lines of code

In [7]:
import sweetviz as sv

In [8]:
report = sv.analyze(df)

report.show_html("./report.html")

                                             |          | [  0%]   00:00 -> (? left)

Report ./report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Data Pre-Processing

## Feature Engineering

In [9]:
#!pip install geopy

In [10]:
from geopy.geocoders import Nominatim

In [11]:
geolocator = Nominatim(user_agent = 'geoapiExercises')

In [12]:
#aryan = geolocator.reverse("37.88"+" , "+"-122.23").raw['address']



In [13]:
# using google maps we are finding the area from longitude and latitude given to us

def locations(cord):
    latitude = str(cord[0])
    longitude = str(cord[1])
    
    location = geolocator.reverse(latitude +","+ longitude).raw['address'] #raw return a dictionary
    
    #if the values are missing replace by a empty string
    
    if location.get('road') is None:
        location['road'] = None
        
    if location.get('county') is None:
        location['county'] = None
    
    
    loc_update['County'].append(location['county'])
    loc_update['Road'].append(location['road'])
    

In [14]:
import pickle

In [15]:

"""
loc_update = {"County":[],
             "Road":[]}


for i,cord in enumerate(df.iloc[:,6:-1].values):
    
    locations(cord)
    pickle.dump(loc_update,open('loc_update.pickle','wb'))
    
    if i%100 == 0:
        print(i)
        
"""

    

'\nloc_update = {"County":[],\n             "Road":[]}\n\n\nfor i,cord in enumerate(df.iloc[:,6:-1].values):\n    \n    locations(cord)\n    pickle.dump(loc_update,open(\'loc_update.pickle\',\'wb\'))\n    \n    if i%100 == 0:\n        print(i)\n        \n'

In [16]:
# to load the pickle model
loc_update = pickle.load(open(r"C:\Users\mehna\Machine Learning Project\loc_update.pickle","rb"))

### Deleting latitude and longitude and modigy the DF with new values (County, Road)

In [17]:
loc = pd.DataFrame(loc_update)
loc.head()

Unnamed: 0,County,Road
0,Alameda County,Centennial Drive
1,Alameda County,Gwin Canyon Trail
2,Alameda County,Grove Shafter Freeway
3,Alameda County,Florio Street
4,Alameda County,Florio Street


In [18]:
loc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   County  20065 non-null  object
 1   Road    19578 non-null  object
dtypes: object(2)
memory usage: 322.6+ KB


In [19]:
# Add new feauters to dataframe

for i in loc_update.keys():
    df[i] = loc_update[i]

#df = df.sample(axis = 0, frac =1)

In [20]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target,County,Road
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526,Alameda County,Centennial Drive
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585,Alameda County,Gwin Canyon Trail
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521,Alameda County,Grove Shafter Freeway
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413,Alameda County,Florio Street
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422,Alameda County,Florio Street


In [21]:
# drop latitude and longitude

df.drop(labels = ["Latitude","Longitude"], axis =1, inplace = True)

df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,County,Road
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,4.526,Alameda County,Centennial Drive
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,3.585,Alameda County,Gwin Canyon Trail
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,3.521,Alameda County,Grove Shafter Freeway
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,3.413,Alameda County,Florio Street
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,3.422,Alameda County,Florio Street


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Target      20640 non-null  float64
 7   County      20065 non-null  object 
 8   Road        19578 non-null  object 
dtypes: float64(7), object(2)
memory usage: 1.4+ MB


#### Removing Null Values
##### Using Classification algo to fill the categorical values



#### 1. Predicting Road Null values

In [23]:
df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Target', 'County', 'Road'],
      dtype='object')

In [24]:
# Applying Logistic Regression to find missing values for road column

missing_index = []

for i in range(df.shape[0]):
    if df["Road"][i] is None:
        missing_index.append(i)
        
#independent Variable
missing_road_x_train = np.array([ [df["MedInc"][i],df["AveRooms"][i], df["AveBedrms"][i] ] for i in range(df.shape[0]) if i not in missing_index])

#Dependent Variable
missing_road_y_train = np.array([df["Road"][i] for i in range(df.shape[0]) if i not in missing_index])

missing_road_x_test = np.array([ [df["MedInc"][i],df["AveRooms"][i], df["AveBedrms"][i] ] for i in range(df.shape[0]) if i in missing_index])        

In [25]:
missing_road_x_train

array([[8.3252    , 6.98412698, 1.02380952],
       [8.3014    , 6.23813708, 0.97188049],
       [7.2574    , 8.28813559, 1.07344633],
       ...,
       [1.7       , 5.20554273, 1.12009238],
       [1.8672    , 5.32951289, 1.17191977],
       [2.3886    , 5.25471698, 1.16226415]])

In [26]:
from sklearn.linear_model import SGDClassifier as sgdc

In [27]:
#model Initialization

model_1 = sgdc()

model_1.fit(missing_road_x_train,missing_road_y_train)

missing_road_y_pred = model_1.predict(missing_road_x_test)

In [28]:
np.unique(missing_road_y_pred)

array(['5th Street', 'Beach Boulevard', 'Bradford Street',
       'Broadway Avenue', 'El Rancho Drive', 'Euclid Avenue',
       'Foothill Expressway', 'Hiawatha Way', 'Horton Avenue',
       'Keswick Street', 'Lochridge Drive', 'Olympia Avenue',
       'Paramount Boulevard', 'Rheem Avenue', 'Saint Clair Avenue',
       'Valley Club Road', 'West 17th Street', 'West 66th Street',
       'West Beverly Boulevard', 'Westminster Boulevard'], dtype='<U77')

In [29]:
# Add the model back to dataframe

for n,i in enumerate(missing_index):
    df["Road"][i] = missing_road_y_pred[n]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Road"][i] = missing_road_y_pred[n]


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Target      20640 non-null  float64
 7   County      20065 non-null  object 
 8   Road        20640 non-null  object 
dtypes: float64(7), object(2)
memory usage: 1.4+ MB


In [31]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,County,Road
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,4.526,Alameda County,Centennial Drive
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,3.585,Alameda County,Gwin Canyon Trail
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,3.521,Alameda County,Grove Shafter Freeway
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,3.413,Alameda County,Florio Street
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,3.422,Alameda County,Florio Street


In [32]:
# label encode 

from sklearn.preprocessing import LabelEncoder

What is a label encoder?

Label Encoding is a popular encoding technique for handling categorical variables. In this technique, each label is assigned a unique integer based on alphabetical ordering.

In [33]:
le = LabelEncoder()

df["Road"] = le.fit_transform(df["Road"])

In [34]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,County,Road
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,4.526,Alameda County,1474
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,3.585,Alameda County,3491
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,3.521,Alameda County,3472
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,3.413,Alameda County,3095
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,3.422,Alameda County,3095


##### 2. Predicting County null values 

In [35]:
# Applying Logistic Regression to find missing values for County column

missing_index = []

for i in range(df.shape[0]):
    if df["County"][i] is None:
        missing_index.append(i)
        
#independent Variable
missing_county_x_train = np.array([ [df["MedInc"][i],df["AveRooms"][i], df["AveBedrms"][i] ] for i in range(df.shape[0]) if i not in missing_index])

#Dependent Variable
missing_county_y_train = np.array([df["County"][i] for i in range(df.shape[0]) if i not in missing_index])

missing_county_x_test = np.array([ [df["MedInc"][i],df["AveRooms"][i], df["AveBedrms"][i] ] for i in range(df.shape[0]) if i in missing_index])        

In [37]:
#model Initialization

model_2 = sgdc()

model_2.fit(missing_county_x_train,missing_county_y_train)

missing_county_y_pred = model_2.predict(missing_county_x_test)

In [38]:
# Add the model back to dataframe

for n,i in enumerate(missing_index):
    df["County"][i] = missing_county_y_pred[n]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["County"][i] = missing_county_y_pred[n]


In [39]:
# no null values of county and road
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Target      20640 non-null  float64
 7   County      20640 non-null  object 
 8   Road        20640 non-null  int32  
dtypes: float64(7), int32(1), object(1)
memory usage: 1.3+ MB


In [41]:
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,County,Road
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,4.526,Alameda County,1474
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,3.585,Alameda County,3491
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,3.521,Alameda County,3472
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,3.413,Alameda County,3095
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,3.422,Alameda County,3095


In [42]:
df["County"] = le.fit_transform(df["County"])

In [44]:
df = df.sample(axis = 0, frac =1)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Target,County,Road
16301,5.3224,17.0,6.103634,1.025572,1562.0,2.102288,2.258,40,5589
17704,5.5145,15.0,5.168385,1.010309,1267.0,4.353952,2.194,44,7700
13979,0.536,4.0,14.0,3.333333,9.0,3.0,0.425,38,725
7443,2.6354,40.0,3.35274,1.013699,934.0,3.19863,1.518,20,6852
926,6.0878,26.0,5.647059,0.901961,1105.0,3.095238,2.469,0,8933


# Understanding which Model to use

Target variable is a continous value therefor the best model to use is Regression Model

model = random forest

In [64]:
y = df.iloc[:,-3].values

df.drop(labels = ['Target'], axis =1,inplace = True)

x = df.iloc[:,:].values

In [80]:
from sklearn.model_selection import train_test_split as tts

In [81]:
X_train, X_test, Y_train, Y_test = tts(x,y, test_size = 0.20, random_state =42)

In [82]:
from sklearn.ensemble import RandomForestRegressor as RFR

In [83]:
model = RFR()

model.fit(X_train,Y_train)


RandomForestRegressor()

In [84]:
# Model Prediction

y_pred = model.predict(X_test)

In [85]:
# model Accuracy

from sklearn.metrics import r2_score

In [86]:
r2_score(Y_test,y_pred)*100

75.7299399477611

# ADD our own data

In [87]:
inp = np.array([5.3224,17.0,6.103634,1.025572,1562.0,2.102288,40,5589])

In [90]:
inp.shape

inp = inp.reshape(1,-1)

In [91]:
model.predict(inp)

array([2.42832])