In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
df = pd.read_csv('nyc-rolling-sales.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,...,5,0,5,1633,6440,1900,2,C2,6625000,2017-07-19 00:00:00
1,5,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,...,28,3,31,4616,18690,1900,2,C7,-,2016-12-14 00:00:00
2,6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,...,16,1,17,2212,7803,1900,2,C7,-,2016-12-09 00:00:00
3,7,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,...,10,0,10,2272,6794,1913,2,C4,3936272,2016-09-23 00:00:00
4,8,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,...,6,0,6,2369,4615,1900,2,C2,8000000,2016-11-17 00:00:00


In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,BOROUGH,BLOCK,LOT,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,YEAR BUILT,TAX CLASS AT TIME OF SALE
count,84548.0,84548.0,84548.0,84548.0,84548.0,84548.0,84548.0,84548.0,84548.0,84548.0
mean,10344.359878,2.998758,4237.218976,376.224015,10731.991614,2.025264,0.193559,2.249184,1789.322976,1.657485
std,7151.779436,1.28979,3568.263407,658.136814,1290.879147,16.721037,8.713183,18.972584,537.344993,0.819341
min,4.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,4231.0,2.0,1322.75,22.0,10305.0,0.0,0.0,1.0,1920.0,1.0
50%,8942.0,3.0,3311.0,50.0,11209.0,1.0,0.0,1.0,1940.0,2.0
75%,15987.25,4.0,6281.0,1001.0,11357.0,2.0,0.0,2.0,1965.0,2.0
max,26739.0,5.0,16322.0,9106.0,11694.0,1844.0,2261.0,2261.0,2017.0,4.0


In [5]:
df.shape

(84548, 22)

In [6]:
df.columns

Index(['Unnamed: 0', 'BOROUGH', 'NEIGHBORHOOD', 'BUILDING CLASS CATEGORY',
       'TAX CLASS AT PRESENT', 'BLOCK', 'LOT', 'EASE-MENT',
       'BUILDING CLASS AT PRESENT', 'ADDRESS', 'APARTMENT NUMBER', 'ZIP CODE',
       'RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 'TOTAL UNITS',
       'LAND SQUARE FEET', 'GROSS SQUARE FEET', 'YEAR BUILT',
       'TAX CLASS AT TIME OF SALE', 'BUILDING CLASS AT TIME OF SALE',
       'SALE PRICE', 'SALE DATE'],
      dtype='object')

In [7]:
df.isnull().sum()

Unnamed: 0                        0
BOROUGH                           0
NEIGHBORHOOD                      0
BUILDING CLASS CATEGORY           0
TAX CLASS AT PRESENT              0
BLOCK                             0
LOT                               0
EASE-MENT                         0
BUILDING CLASS AT PRESENT         0
ADDRESS                           0
APARTMENT NUMBER                  0
ZIP CODE                          0
RESIDENTIAL UNITS                 0
COMMERCIAL UNITS                  0
TOTAL UNITS                       0
LAND SQUARE FEET                  0
GROSS SQUARE FEET                 0
YEAR BUILT                        0
TAX CLASS AT TIME OF SALE         0
BUILDING CLASS AT TIME OF SALE    0
SALE PRICE                        0
SALE DATE                         0
dtype: int64

In [8]:
df[df['SALE PRICE'].str.strip()=='-']

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
1,5,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,...,28,3,31,4616,18690,1900,2,C7,-,2016-12-14 00:00:00
2,6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,...,16,1,17,2212,7803,1900,2,C7,-,2016-12-09 00:00:00
5,9,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,405,16,,C4,516 EAST 12TH STREET,...,20,0,20,2581,9730,1900,2,C4,-,2017-07-20 00:00:00
7,11,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,407,18,,C7,520 EAST 14TH STREET,...,44,2,46,5163,21007,1900,2,C7,-,2017-07-20 00:00:00
8,12,1,ALPHABET CITY,08 RENTALS - ELEVATOR APARTMENTS,2,379,34,,D5,141 AVENUE D,...,15,0,15,1534,9198,1920,2,D5,-,2017-06-20 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84524,8390,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7091,28,,B9,51 CLAY PIT ROAD,...,2,0,2,2986,1820,1999,1,B9,-,2017-07-06 00:00:00
84525,8391,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7091,188,,B9,1576 WOODROW ROAD,...,2,0,2,2490,1530,1998,1,B9,-,2016-10-14 00:00:00
84538,8404,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7316,61,,B2,178 DARNELL LANE,...,2,0,2,3215,1300,1995,1,B2,-,2017-06-30 00:00:00
84539,8405,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7316,85,,B2,137 DARNELL LANE,...,2,0,2,3016,1300,1995,1,B2,-,2016-12-30 00:00:00


In [9]:
df.drop(['Unnamed: 0'],
             axis=1,
             inplace=True)

In [10]:
df.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,,...,5,0,5,1633,6440,1900,2,C2,6625000,2017-07-19 00:00:00
1,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,,...,28,3,31,4616,18690,1900,2,C7,-,2016-12-14 00:00:00
2,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,,...,16,1,17,2212,7803,1900,2,C7,-,2016-12-09 00:00:00
3,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,,...,10,0,10,2272,6794,1913,2,C4,3936272,2016-09-23 00:00:00
4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,,...,6,0,6,2369,4615,1900,2,C2,8000000,2016-11-17 00:00:00


In [11]:
df.dtypes

BOROUGH                            int64
NEIGHBORHOOD                      object
BUILDING CLASS CATEGORY           object
TAX CLASS AT PRESENT              object
BLOCK                              int64
LOT                                int64
EASE-MENT                         object
BUILDING CLASS AT PRESENT         object
ADDRESS                           object
APARTMENT NUMBER                  object
ZIP CODE                           int64
RESIDENTIAL UNITS                  int64
COMMERCIAL UNITS                   int64
TOTAL UNITS                        int64
LAND SQUARE FEET                  object
GROSS SQUARE FEET                 object
YEAR BUILT                         int64
TAX CLASS AT TIME OF SALE          int64
BUILDING CLASS AT TIME OF SALE    object
SALE PRICE                        object
SALE DATE                         object
dtype: object

In [12]:
empty_fields = df[ (df['GROSS SQUARE FEET'].str.strip()=='-') |  (df['LAND SQUARE FEET'].str.strip()=='-')| (df['SALE PRICE'].str.strip()=='-') | (df['SALE PRICE'].str.strip()=='0')].index
df.drop(empty_fields , inplace=True)

In [13]:
total_units_zero = df[ (df['TOTAL UNITS']==0)].index
df.drop(total_units_zero , inplace=True)

In [14]:
df['SALE PRICE'] = df['SALE PRICE'].astype('float')
df['LAND SQUARE FEET'] = df['LAND SQUARE FEET'].astype('float')
df['GROSS SQUARE FEET'] = df['GROSS SQUARE FEET'].astype('float')

In [15]:
df['SALE YEAR'] = pd.DatetimeIndex(df['SALE DATE']).year

In [16]:
df['AGE'] = df['SALE YEAR']-df['YEAR BUILT']

In [17]:
df.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,SALE YEAR,AGE
0,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,,...,5,1633.0,6440.0,1900,2,C2,6625000.0,2017-07-19 00:00:00,2017,117
3,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,,...,10,2272.0,6794.0,1913,2,C4,3936272.0,2016-09-23 00:00:00,2016,103
4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,,...,6,2369.0,4615.0,1900,2,C2,8000000.0,2016-11-17 00:00:00,2016,116
6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,406,32,,C4,210 AVENUE B,,...,8,1750.0,4226.0,1920,2,C4,3192840.0,2016-09-23 00:00:00,2016,96
9,1,ALPHABET CITY,08 RENTALS - ELEVATOR APARTMENTS,2,387,153,,D9,629 EAST 5TH STREET,,...,24,4489.0,18523.0,1920,2,D9,16232000.0,2016-11-07 00:00:00,2016,96


In [18]:
df.dtypes

BOROUGH                             int64
NEIGHBORHOOD                       object
BUILDING CLASS CATEGORY            object
TAX CLASS AT PRESENT               object
BLOCK                               int64
LOT                                 int64
EASE-MENT                          object
BUILDING CLASS AT PRESENT          object
ADDRESS                            object
APARTMENT NUMBER                   object
ZIP CODE                            int64
RESIDENTIAL UNITS                   int64
COMMERCIAL UNITS                    int64
TOTAL UNITS                         int64
LAND SQUARE FEET                  float64
GROSS SQUARE FEET                 float64
YEAR BUILT                          int64
TAX CLASS AT TIME OF SALE           int64
BUILDING CLASS AT TIME OF SALE     object
SALE PRICE                        float64
SALE DATE                          object
SALE YEAR                           int64
AGE                                 int64
dtype: object

In [19]:
df.nunique()

BOROUGH                               5
NEIGHBORHOOD                        250
BUILDING CLASS CATEGORY              43
TAX CLASS AT PRESENT                  9
BLOCK                             10085
LOT                                1572
EASE-MENT                             1
BUILDING CLASS AT PRESENT           142
ADDRESS                           30324
APARTMENT NUMBER                   1258
ZIP CODE                            180
RESIDENTIAL UNITS                   145
COMMERCIAL UNITS                     41
TOTAL UNITS                         152
LAND SQUARE FEET                   4600
GROSS SQUARE FEET                  4498
YEAR BUILT                          143
TAX CLASS AT TIME OF SALE             3
BUILDING CLASS AT TIME OF SALE      143
SALE PRICE                         5737
SALE DATE                           340
SALE YEAR                             2
AGE                                 147
dtype: int64

In [20]:
import datetime as dt
df['SALE DATE'] = pd.to_datetime(df['SALE DATE'])
df['SALE DATE']=df['SALE DATE'].map(dt.datetime.toordinal)

In [21]:
df.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,SALE YEAR,AGE
0,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,,...,5,1633.0,6440.0,1900,2,C2,6625000.0,736529,2017,117
3,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,,...,10,2272.0,6794.0,1913,2,C4,3936272.0,736230,2016,103
4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,,...,6,2369.0,4615.0,1900,2,C2,8000000.0,736285,2016,116
6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,406,32,,C4,210 AVENUE B,,...,8,1750.0,4226.0,1920,2,C4,3192840.0,736230,2016,96
9,1,ALPHABET CITY,08 RENTALS - ELEVATOR APARTMENTS,2,387,153,,D9,629 EAST 5TH STREET,,...,24,4489.0,18523.0,1920,2,D9,16232000.0,736275,2016,96


In [22]:
df.dtypes

BOROUGH                             int64
NEIGHBORHOOD                       object
BUILDING CLASS CATEGORY            object
TAX CLASS AT PRESENT               object
BLOCK                               int64
LOT                                 int64
EASE-MENT                          object
BUILDING CLASS AT PRESENT          object
ADDRESS                            object
APARTMENT NUMBER                   object
ZIP CODE                            int64
RESIDENTIAL UNITS                   int64
COMMERCIAL UNITS                    int64
TOTAL UNITS                         int64
LAND SQUARE FEET                  float64
GROSS SQUARE FEET                 float64
YEAR BUILT                          int64
TAX CLASS AT TIME OF SALE           int64
BUILDING CLASS AT TIME OF SALE     object
SALE PRICE                        float64
SALE DATE                           int64
SALE YEAR                           int64
AGE                                 int64
dtype: object

In [23]:
df.nunique()

BOROUGH                               5
NEIGHBORHOOD                        250
BUILDING CLASS CATEGORY              43
TAX CLASS AT PRESENT                  9
BLOCK                             10085
LOT                                1572
EASE-MENT                             1
BUILDING CLASS AT PRESENT           142
ADDRESS                           30324
APARTMENT NUMBER                   1258
ZIP CODE                            180
RESIDENTIAL UNITS                   145
COMMERCIAL UNITS                     41
TOTAL UNITS                         152
LAND SQUARE FEET                   4600
GROSS SQUARE FEET                  4498
YEAR BUILT                          143
TAX CLASS AT TIME OF SALE             3
BUILDING CLASS AT TIME OF SALE      143
SALE PRICE                         5737
SALE DATE                           340
SALE YEAR                             2
AGE                                 147
dtype: int64

In [24]:
df.drop('EASE-MENT', axis=1, inplace=True)
df.drop('ZIP CODE', axis=1, inplace=True)
df.drop('ADDRESS', axis=1, inplace=True)
df.drop('APARTMENT NUMBER', axis=1, inplace=True)
df.drop('NEIGHBORHOOD', axis=1, inplace=True)
df.drop('BUILDING CLASS AT PRESENT', axis=1, inplace=True)
df.drop('LOT', axis=1, inplace=True)
df.drop('TAX CLASS AT PRESENT', axis=1, inplace=True)
df.drop('BUILDING CLASS AT TIME OF SALE', axis=1, inplace=True)
df.head()

Unnamed: 0,BOROUGH,BUILDING CLASS CATEGORY,BLOCK,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,SALE YEAR,AGE
0,1,07 RENTALS - WALKUP APARTMENTS,392,5,0,5,1633.0,6440.0,1900,2,6625000.0,736529,2017,117
3,1,07 RENTALS - WALKUP APARTMENTS,402,10,0,10,2272.0,6794.0,1913,2,3936272.0,736230,2016,103
4,1,07 RENTALS - WALKUP APARTMENTS,404,6,0,6,2369.0,4615.0,1900,2,8000000.0,736285,2016,116
6,1,07 RENTALS - WALKUP APARTMENTS,406,8,0,8,1750.0,4226.0,1920,2,3192840.0,736230,2016,96
9,1,08 RENTALS - ELEVATOR APARTMENTS,387,24,0,24,4489.0,18523.0,1920,2,16232000.0,736275,2016,96


In [25]:
#df.drop('SALE DATE', axis=1, inplace=True)

In [None]:
sns.pairplot(df)

In [None]:
fig, ax = plt.subplots(figsize=(38,38))    
sns.heatmap(df.corr(), annot = True)
plt.show()

In [26]:
df['BOROUGH']=df['BOROUGH'].astype(object)

In [27]:
#categorical_cols = ['BOROUGH', 'BUILDING CLASS CATEGORY']
#one_hot_encoded_features = pd.get_dummies(df[categorical_cols])

In [28]:
#df.drop(categorical_cols, axis=1, inplace=True)
#df = pd.concat([df,one_hot_encoded_features], axis=1)

In [29]:
#df.head()

In [31]:
# split data 
# mse/price compare error across high or low prices, map, sklearn pipeline - three column transformation ML course
# cross validation 5 or 10 folds
# check ridge or lasso, cgbooster or random forest(better)  three base models, residential
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

scaler = StandardScaler()
X = df.drop('SALE PRICE', axis=1)
y = df['SALE PRICE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

#one_hot_encoded_features = pd.get_dummies(df[categorical_cols])
#df.drop(categorical_cols, axis=1, inplace=True)
#df = pd.concat([df,one_hot_encoded_features], axis=1)

categorical_features = ['BOROUGH', 'BUILDING CLASS CATEGORY']

numerical_features = ['RESIDENTIAL UNITS',
       'COMMERCIAL UNITS', 'TOTAL UNITS', 'LAND SQUARE FEET',
       'GROSS SQUARE FEET', 'AGE']

numeric_transformer =StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocessor = ColumnTransformer(transformers = [('numerical', numeric_transformer, numerical_features),('cat',categorical_transformer, categorical_features)])


X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

X_train_preprocessed_df = pd.DataFrame(data = X_train_preprocessed)
X_train_preprocessed_df.head() 
X_test_preprocessed_df = pd.DataFrame(data = X_test_preprocessed)
X_test_preprocessed_df.head() 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
0,-0.08966,-0.020301,-0.082827,0.002458,-0.063061,-0.127133,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.014182,-0.020301,-0.001699,-0.044439,-0.021624,-0.256547,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.037739,-0.020301,-0.042263,0.005451,-0.060259,-0.211534,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.037739,-0.020301,-0.042263,-0.030984,-0.049404,-0.115879,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.08966,-0.020301,-0.082827,-0.09865,-0.117337,-0.315628,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
X_train_preprocessed.shape

(23702, 54)

In [46]:
X_test_preprocessed.shape

(10158, 54)

In [45]:
X_test_preprocessed_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
0,-0.08966,-0.020301,-0.082827,0.002458,-0.063061,-0.127133,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.014182,-0.020301,-0.001699,-0.044439,-0.021624,-0.256547,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.037739,-0.020301,-0.042263,0.005451,-0.060259,-0.211534,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.037739,-0.020301,-0.042263,-0.030984,-0.049404,-0.115879,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.08966,-0.020301,-0.082827,-0.09865,-0.117337,-0.315628,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
#df.corr().abs().unstack().sort_values(ascending =False )["SALE PRICE"]

In [35]:

linear_regression = LinearRegression()
linear_regression.fit(X_train_preprocessed, y_train)
predictions = linear_regression.predict(X_test_preprocessed)
r2_score(y_test, predictions)

0.7291092529749056

In [42]:
from sklearn.model_selection import cross_val_score
scores_lr = cross_val_score(LinearRegression(), X_train_preprocessed, y_train, cv=5, scoring='r2')#'neg_mean_squared_error')
print("Linear regression cross val score:", scores_lr.mean())

Linear regression cross val score: -2.5568057065578094


In [43]:
from sklearn.linear_model import Lasso, Ridge
scores_lr = cross_val_score(Ridge(random_state=4), X_train_preprocessed, y_train, cv=10, scoring='r2')#'neg_mean_squared_error')
print("Ridge cross val score:", scores_lr.mean())

Ridge cross val score: -3.439607976228314


In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
rf_scaled = RandomForestRegressor(n_estimators=100, random_state=0)
rf_scaled.fit(X_train_scaled, y_train)
rf_scaled.score(X_test_scaled, y_test)