In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler,LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor


from sklearn.metrics import mean_absolute_error,r2_score

import category_encoders as ce

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Data directories, relative to the notebook's working directory:
# interim holds the input table, processed receives the exported sample.
interim_path, processed_path = (
    Path(folder) for folder in ('../data/interim', '../data/processed')
)

In [3]:
# Read the interim properties table into a DataFrame.
properties_csv = interim_path / 'properties.csv'
df = pd.read_csv(properties_csv)

In [4]:
# Separate the target column from the predictors.
y = df['price']
X = df.drop(columns='price')

# Model log(1 + price) instead of the raw price.
y_transformed = np.log1p(y)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Feature groups; each group is routed to its own transformer below.
columns_to_encode = ['property_type', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category',
                                 'floor_category']
numerical_columns = ['bedRoom', 'bathroom', 'built_up_area', 'pooja room', 'store room']

preprocessor = ColumnTransformer(
                transformers=[
                    # Reuse the list defined above instead of repeating the column names.
                    ('num', StandardScaler(), numerical_columns),
                    ('sector', OneHotEncoder(drop='first',handle_unknown='ignore'), ['sector']),
                    # Map categories unseen during fit to -1 rather than raising at
                    # predict time, matching the tolerant handling of 'sector'.
                    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
                     columns_to_encode)

                ],
                # Columns not listed above are passed through unchanged.
                remainder='passthrough'
            )

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Seed the forest so the fitted model and the score are repeatable across runs.
    ('regressor', RandomForestRegressor(random_state=42))
])

pipeline.fit(X_train,y_train)
predicted = pipeline.predict(X_test)
# The target was log1p-transformed before the split, so this R^2 is on the log scale.
score = r2_score(y_test, predicted)

In [8]:
# Draw a reproducible sample of at most 300 rows; capping at len(df) avoids the
# ValueError that sampling without replacement raises on a smaller frame.
df2 = df.sample(n=min(300, len(df)), random_state=42)

# Make sure the output directory exists before writing the sample.
processed_path.mkdir(parents=True, exist_ok=True)
# The row index is written as the first CSV column (pandas default).
df2.to_csv(processed_path / 'sample_data.csv')

In [9]:
# Sanity check: show the (rows, columns) of the exported sample.
df2.shape

(300, 13)