## Random Forest Classifier/Regressor, Cross Validation, VarianceThreshold, PCA

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [2]:
data = pd.read_csv('immo_cat.csv')
data = data.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis=1)
cols = ['newlyConst', 'balcony', 'hasKitchen','lift', 'garden','cellar']
data[cols] = data[cols].replace({True:1,False:0})
data = data.drop(['serviceCharge','telekomHybridUploadSpeed','pricetrend','totalRent','scoutId','thermalChar','lastRefurbish','Euro/m2','livingSpace','picturecount','telekomUploadSpeed','yearConstructed','noParkSpaces','baseRent','geo_plz','heatingCosts','electricityBasePrice','electricityKwhPrice','regio1','telekomTvOffer','heatingType','typeOfFlat','houseNumber','geo_bln','geo_krs','street','regio2','regio3','date','description','facilities','streetPlain','firingTypes','condition','interiorQual','petsAllowed','noRoomsRange','energyEfficiencyClass'], axis=1)
data.head(1)

Unnamed: 0,newlyConst,balcony,hasKitchen,cellar,yearConstructedRange,lift,baseRentRange,noRooms,floor,numberOfFloors,garden,livingSpaceRange,price_class,typeOfFlat_cat,heatingType_cat,firingTypes_cat,regio1_cat,condition_cat,interiorQual_cat,petsAllowed_cat,noRoomsRange_cat,energyEfficiencyClass_cat
0,0,0,0,1,2.0,0,4,4.0,1.0,3.0,1,4,4,0.4,0.55,0.45,0.51,0.57,0.3,0.95,4.0,0.61


In [3]:
y = data['price_class']
X = data.drop(['price_class'], axis = 1)

numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(np.object)

# we OneHotEncode the categoricals so we can use the same dataset to perform a regression later (in the lab).
# it is not needed for a DecisionTree or RandomForest model-->no one hot encoding needed, but do for later
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numericalX, encoded_categorical], axis = 1)

# Note: we need to do train/test split before downsampling, and then only downsample the training set - Why?
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [4]:
trainset=pd.concat([y_train, X_train ], axis=1)
trainset.shape

(214248, 22)

In [5]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

### RandomForestClassifier

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.8,
                             random_state = 42)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.3220846869048953
0.3217019528770397


3     10269
4      7536
2      6356
5      5802
8      5326
6      4743
7      4487
9      3116
11     1780
10     1740
12      860
1       832
13      382
14      264
15       69
Name: price_class, dtype: int64

array([[   0,  279,  531,   15,    1,    0,    0,    6,    0,    0,    0,
           0,    0,    0,    0],
       [   0,  987, 5109,  208,   26,    2,    0,   24,    0,    0,    0,
           0,    0,    0,    0],
       [   0,  190, 9104,  743,  119,    5,    0,  108,    0,    0,    0,
           0,    0,    0,    0],
       [   0,    0, 5285, 1453,  436,   23,    0,  339,    0,    0,    0,
           0,    0,    0,    0],
       [   0,    1, 2463, 1424,  897,   57,    0,  960,    0,    0,    0,
           0,    0,    0,    0],
       [   0,    0, 1030,  975,  905,   79,    0, 1754,    0,    0,    0,
           0,    0,    0,    0],
       [   0,    0,  485,  552,  622,   70,    0, 2756,    1,    0,    1,
           0,    0,    0,    0],
       [   0,    0,  316,  348,  312,   28,    0, 4305,    5,    0,   12,
           0,    0,    0,    0],
       [   0,    0,  133,  103,   62,    5,    0, 2723,   21,    0,   69,
           0,    0,    0,    0],
       [   0,    0,   51,   44,   24,

In [7]:
# For cross validation
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))

0.316044885156986


In [8]:
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm
X_added_constant = sm.add_constant(X_train)
X_added_constant
model = sm.OLS(y_train,X_added_constant).fit()
model.summary()

0,1,2,3
Dep. Variable:,price_class,R-squared:,0.86
Model:,OLS,Adj. R-squared:,0.86
Method:,Least Squares,F-statistic:,62810.0
Date:,"Wed, 19 Apr 2023",Prob (F-statistic):,0.0
Time:,03:24:17,Log-Likelihood:,-317200.0
No. Observations:,214248,AIC:,634400.0
Df Residuals:,214226,BIC:,634700.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.5086,0.071,21.104,0.000,1.369,1.649
newlyConst,0.0213,0.012,1.839,0.066,-0.001,0.044
balcony,-0.1455,0.005,-28.282,0.000,-0.156,-0.135
hasKitchen,0.3145,0.005,58.667,0.000,0.304,0.325
cellar,-0.1353,0.005,-27.364,0.000,-0.145,-0.126
yearConstructedRange,0.0160,0.001,11.604,0.000,0.013,0.019
lift,0.1945,0.007,28.880,0.000,0.181,0.208
baseRentRange,1.4016,0.002,662.806,0.000,1.397,1.406
noRooms,-0.0707,0.012,-6.114,0.000,-0.093,-0.048

0,1,2,3
Omnibus:,48644.332,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,185552.064
Skew:,1.098,Prob(JB):,0.0
Kurtosis:,6.995,Cond. No.,368.0


In [9]:
rfc = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2,
                             random_state = 42)
rfc.fit(X_train, y_train)
print(rfc.score(X_train,y_train))
print(rfc.score(X_test,y_test))
# Make predictions on the test data
y_pred = rfc.predict(X_test)

0.32279414510287147
0.32241141107501586


In [10]:
vt = VarianceThreshold(threshold=0.05)
X_vt = vt.fit_transform(X_train)
scores_vt = cross_val_score(rfc, X_vt, y_train, cv=5, scoring='accuracy')
scores_vt.mean()

0.3304628573166747

In [11]:
pca = PCA(n_components=6)
X_pca = pca.fit_transform(X_train)
scores_pca = cross_val_score(rfc, X_pca, y_train, cv=5, scoring='accuracy')
scores_pca.mean()

0.3620337196155951

In [12]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=10,
                             criterion = 'squared_error',
                             min_samples_split=10,
                             min_samples_leaf = 10)
regr.fit(X_train, y_train)
print(regr.score(X_train, y_train))
print(regr.score(X_test, y_test))

y_pred = regr.predict(X_test)

0.9161553271534614
0.9119782947813627
