### Imports

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Load Data and Specify Sample Modelling Data

In [2]:
# Read the training set from the CSV at the relative path Data/train.csv.
train = pd.read_csv(filepath_or_buffer="Data/train.csv")

In [3]:
# Raw target column, plus its natural log (the series the regression is fit on).
dependent = train["SalePrice"]
l_dependent = np.log(dependent)

In [4]:
# Columns routed through the categorical pipe (impute, then one-hot encode).
categoricals = [
    "MSSubClass",
    "MSZoning",
    "Alley",
]

In [5]:
# Columns routed through the numerical pipe (impute, then scale).
numericals = [
    "LotFrontage",
    "LotArea",
]

### Initialise Transformers for Pipes

In [6]:
# Missing categorical values are replaced with the literal string "None".
cat_imputer = SimpleImputer(fill_value="None", strategy="constant")

In [7]:
# Missing numeric values are replaced using the "median" strategy.
num_imputer = SimpleImputer(
    strategy="median",
)

In [8]:
# Scaler instance used as the 'scale' step of the numerical pipe.
scaler = StandardScaler()

In [9]:
# Dense one-hot encoder; handle_unknown="ignore" means categories not seen
# during fit do not raise at transform time.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the new keyword first and fall back to the
# old one for releases that predate the rename.
try:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

### Initialise Pipes

In [10]:
# Categorical branch: fill missing values first, then one-hot encode.
cat_pipe = Pipeline(
    steps=[
        ("impute", cat_imputer),
        ("encode", encoder),
    ]
)

In [11]:
# Numerical branch: fill missing values first, then scale.
num_pipe = Pipeline(
    steps=[
        ("impute", num_imputer),
        ("scale", scaler),
    ]
)

### Compose Pipes Using Column Transformer

In [12]:
# Send each column group through its own pipe: categoricals via cat_pipe,
# numericals via num_pipe.
data_preprocessor = ColumnTransformer(
    [
        ("categorical", cat_pipe, categoricals),
        ("numerical", num_pipe, numericals),
    ]
)

In [14]:
# Outer pipeline holding only the preprocessing step.
data_pipe = Pipeline(steps=[("preprocessor", data_preprocessor)])

In [24]:
# Fit the preprocessing pipe on the selected columns (numericals first, then
# categoricals) and keep the transformed feature matrix.
X = data_pipe.fit_transform(train[[*numericals, *categoricals]])

In [25]:
# Linear regression model; it is fit against the log-transformed target below.
lr = LinearRegression()

In [32]:
# Fit on the preprocessed features against log(SalePrice).
lr.fit(X=X, y=l_dependent)

In [33]:
# In-sample score: evaluated on the same X and target the model was fit on.
lr.score(X=X, y=l_dependent)

0.4452265968760091

In [37]:
# Predictions are on the log scale, since the model was fit to l_dependent.
preds = lr.predict(X=X)

In [41]:
# Root-mean-squared error on the log scale, computed on the training data.
# Arguments are passed in the documented (y_true, y_pred) order; the value is
# the same as before because squared error is symmetric in its two inputs.
np.sqrt(mean_squared_error(l_dependent, preds))

0.2974222746822299

In [42]:
# Back-transform the log-scale predictions to the original SalePrice scale.
# (The previous text `pd.DataFranenp.exp(preds)` was a garbled edit that raises
# AttributeError; the recorded ndarray output corresponds to np.exp(preds).)
np.exp(preds)

array([210744.63298229, 172028.04910818, 216929.54233691, ...,
       170322.71582706, 165045.22070156, 169403.46404748])

In [43]:
# Display the untransformed SalePrice target.
dependent

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64