In [1]:
import pandas as pd
import warnings
# Suppress every warning for the rest of the session: the 'ignore' action is
# installed with no category/module restriction.
# NOTE(review): this also hides library deprecation warnings — consider
# narrowing the filter to the specific category that is noisy.
warnings.filterwarnings('ignore')

In [2]:
# Read the housing dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="housing.csv")
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# Separate the prediction target from the predictors, then split the predictors
# by position: every predictor column except the last is numeric, and the last
# one (ocean_proximity) is categorical.
y = df['median_house_value']
X = df.drop(columns="median_house_value")
X_cat = X.iloc[:, -1:]
X_num = X.iloc[:, :-1]

In [4]:
# Inspect dtypes and non-null counts. In the output below, total_bedrooms is the
# only column with missing values (20433 of 20640 non-null), and ocean_proximity
# is the only non-numeric (object) column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Positional indices of the columns (within the numeric feature array) that the
# ratio features below are built from.
room_idx, bedrooms_idx, population_idx, households_idx = 3,4,5,6

class AddCombineAttributes(BaseEstimator,TransformerMixin):
    """Stateless transformer that appends three ratio features to a 2-D array.

    The appended columns are, in order:
      * rooms_per_household      = X[:, room_idx]       / X[:, households_idx]
      * population_per_household = X[:, population_idx] / X[:, households_idx]
      * bedrooms_per_room        = X[:, bedrooms_idx]   / X[:, room_idx]

    Column positions come from the module-level ``*_idx`` constants, so the
    input must be laid out accordingly.
    """

    def __init__(self):
        # No hyperparameters to store.
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the three ratio columns appended on the right.

        Parameters
        ----------
        X : array-like, 2-D
            Numeric data containing the columns referenced by the ``*_idx``
            constants. DataFrames and nested lists are accepted as well as
            ndarrays.
        y : ignored
            Present only for signature compatibility.

        Returns
        -------
        numpy.ndarray
            The input columns followed by the three ratio columns.
        """
        # Positional slicing (X[:, i]) only works on ndarrays; coerce first so a
        # DataFrame or list input does not fail. ndarray inputs are not copied.
        X = np.asarray(X)

        households = X[:, households_idx]
        rooms = X[:, room_idx]

        rooms_per_household = rooms / households
        population_per_household = X[:, population_idx] / households
        bedrooms_per_room = X[:, bedrooms_idx] / rooms

        return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]

In [6]:
from sklearn.impute import SimpleImputer

# Fit the median imputer on the numeric *features* only (X_num).
# Slicing `df` with iloc[:, :-1] would drop just the trailing ocean_proximity
# column and keep median_house_value, leaking the target into X_impute and
# everything derived from it.
imputer = SimpleImputer(strategy='median')
X_impute = imputer.fit_transform(X_num)

In [7]:
# Run the custom transformer on its own, outside any pipeline: X_extra is the
# imputed array with the three ratio columns appended on the right.
add_attrbs = AddCombineAttributes()
X_extra = add_attrbs.transform(X_impute)

In [8]:
# Column names handed to the numeric and categorical branches of the
# preprocessing ColumnTransformer.
num_features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]
cat_features = ['ocean_proximity']

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [10]:
# Numerical pipeline: fill missing values with each column's median, append the
# engineered ratio features, then standardize every resulting column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attributes_adder', AddCombineAttributes()),
    ('scaler', StandardScaler()),
])

In [11]:
# Full preprocessing: numeric columns go through num_pipeline, and the
# categorical column is integer-coded.
# NOTE(review): OrdinalEncoder implies an ordering among the ocean_proximity
# categories — confirm that is intended; one-hot encoding is the usual choice
# for a nominal feature.
full_pipeline = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_features),
    ('cat', OrdinalEncoder(), cat_features),
])

In [12]:
# Fit every preprocessing step on the predictors and transform them in one pass.
# NOTE(review): this fits on the full dataset; no train/test split is visible in
# this part of the notebook — confirm one happens before model evaluation.
X_prepared = full_pipeline.fit_transform(X)

In [13]:
# Show the prepared matrix's shape and a preview. The 12 columns reported below
# are the 8 scaled numeric features, the 3 engineered ratios, and 1
# ordinal-encoded category column (last).
X_prepared.shape, X_prepared

((20640, 12),
 array([[-1.32783522,  1.05254828,  0.98214266, ..., -0.04959654,
         -1.02998783,  3.        ],
        [-1.32284391,  1.04318455, -0.60701891, ..., -0.09251223,
         -0.8888972 ,  3.        ],
        [-1.33282653,  1.03850269,  1.85618152, ..., -0.02584253,
         -1.29168566,  3.        ],
        ...,
        [-0.8237132 ,  1.77823747, -0.92485123, ..., -0.0717345 ,
          0.02113407,  1.        ],
        [-0.87362627,  1.77823747, -0.84539315, ..., -0.09122515,
          0.09346655,  1.        ],
        [-0.83369581,  1.75014627, -1.00430931, ..., -0.04368215,
          0.11327519,  1.        ]]))