## Random Forest - Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["figure.figsize"] = (9,5)
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
# Load the car listings dataset from the current working directory.
# Author note (translated): "gives better results than get_dummies" --
# presumably this refers to the ordinal encoding applied later; TODO confirm.
df=pd.read_csv("car_dataset.csv")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'car_dataset.csv'

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isnull().any()

In [None]:
df.describe().T

In [None]:
# Correlate the numeric columns only: the frame still contains string columns
# at this point, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)
plt.show()

In [None]:
df.head()

In [None]:
# Derive the vehicle's age in years, using 2021 as the fixed reference year.
df["vehicle_age"] = 2021 - df["Year"]

In [None]:
# Author note (translated): features whose values are mostly unique can be dropped.
df.Car_Name.value_counts()

In [None]:
# Year is superseded by the vehicle_age column derived from it above.
df = df.drop(columns=["Year"])

In [None]:
df.head()

## Train test split

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Separate the regression target (Selling_Price) from the predictor columns.
y = df["Selling_Price"]
X = df.drop(columns="Selling_Price")

In [None]:
# Names of the string-typed predictor columns that need encoding.
cat = X.select_dtypes(include="object").columns
cat

In [None]:
X[cat].head()

In [None]:
# Replace every string column listed in `cat` with ordinal codes, in place on X.
# NOTE(review): the encoder is fitted on the full X, before the train/test split
# performed further down, so rows that end up in the test set also shape the
# encoding -- confirm this is acceptable for the evaluation.
enc = OrdinalEncoder()
X[cat] = enc.fit_transform(X[cat])
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5
)

# Report the dimensions of each partition.
for label, part in (
    ("Train features shape : ", X_train),
    ("Train target shape   : ", y_train),
    ("Test features shape  : ", X_test),
    ("Test target shape    : ", y_test),
):
    print(label, part.shape)


## Modeling for Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [None]:
DT_model = DecisionTreeRegressor(random_state=101)

In [None]:
DT_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
def train_val(model, X_train, y_train, X_test, y_test):
    """Score an already-fitted regressor on the training and test data.

    Parameters
    ----------
    model : object with a ``predict`` method
    X_train, y_train : training features and targets
    X_test, y_test : test features and targets

    Returns
    -------
    pandas.DataFrame
        Columns "train" and "test"; rows "R2", "mae", "mse" and "rmse".
    """

    def _metrics(actual, predicted):
        # The same four metrics for either partition, in display order.
        return {
            "R2": r2_score(actual, predicted),
            "mae": mean_absolute_error(actual, predicted),
            "mse": mean_squared_error(actual, predicted),
            "rmse": np.sqrt(mean_squared_error(actual, predicted)),
        }

    test_pred = model.predict(X_test)
    train_pred = model.predict(X_train)

    return pd.DataFrame({
        "train": _metrics(y_train, train_pred),
        "test": _metrics(y_test, test_pred),
    })

In [None]:
# Author note (translated): the scores indicate overfitting. A perfect score is
# not a problem in classification, but in regression a perfect training score
# can be taken directly as a sign of overfitting.
train_val(DT_model, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.model_selection import cross_validate, cross_val_score

# 10-fold cross-validation of a fresh decision tree on the training data.
cv_metrics = [
    "r2",
    "neg_mean_absolute_error",
    "neg_mean_squared_error",
    "neg_root_mean_squared_error",
]
model = DecisionTreeRegressor(random_state=101)
scores = cross_validate(model, X_train, y_train, scoring=cv_metrics, cv=10)
df_scores = pd.DataFrame(scores)
# Average over the folds and skip the first two entries (the timing columns
# that cross_validate reports ahead of the metric scores).
df_scores.mean().iloc[2:]

## Feature Importances

In [None]:
DT_model.feature_importances_

In [None]:
# Tabulate the decision tree's feature importances, smallest first.
# Author note (translated): we can see that it balances the scores somewhat.
df_f_i = pd.DataFrame(
    data=DT_model.feature_importances_,
    index=X.columns,
    columns=["Feature Importance"],
).sort_values(by="Feature Importance")
df_f_i

In [None]:
# Bar chart of the importances; feature names are rotated so they do not overlap.
sns.barplot(data=df_f_i, x=df_f_i.index, y="Feature Importance")
plt.xticks(rotation=90)
plt.tight_layout()

## Modeling for Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Author note (translated): a single fit is not enough to claim the forest has
# dealt with overfitting; cross-validation is required.
rf_model = RandomForestRegressor(random_state=101)
rf_model.fit(X_train,y_train)
train_val(rf_model, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.model_selection import cross_validate, cross_val_score

# 10-fold cross-validation of a default random forest on the training data.
# Author note (translated): this shows the forest did not get rid of the
# overfitting after all; that is how it goes with small data sets.
cv_metrics = [
    "r2",
    "neg_mean_absolute_error",
    "neg_mean_squared_error",
    "neg_root_mean_squared_error",
]
model = RandomForestRegressor(random_state=101)
scores = cross_validate(model, X_train, y_train, scoring=cv_metrics, cv=10)
df_scores = pd.DataFrame(scores)
# Average over the folds and skip the first two entries (the timing columns
# that cross_validate reports ahead of the metric scores).
df_scores.mean().iloc[2:]

## Feature Importances

In [None]:
rf_model.feature_importances_

In [None]:
# Tabulate the random forest's feature importances, smallest first.
df_f_i = pd.DataFrame(
    data=rf_model.feature_importances_,
    index=X.columns,
    columns=["Feature Importance"],
).sort_values(by="Feature Importance")
df_f_i

In [None]:
# Bar chart of the importances; feature names are rotated so they do not overlap.
sns.barplot(data=df_f_i, x=df_f_i.index, y="Feature Importance")
plt.xticks(rotation=90)
plt.tight_layout()

## Visualizing trees

In [None]:
from sklearn.tree import plot_tree

def report_model(model):
    """Print train/test scores for a fitted ensemble and plot its first tree.

    Parameters
    ----------
    model : fitted ensemble exposing ``predict`` and ``estimators_``
        For example a fitted RandomForestRegressor.

    Notes
    -----
    Reads the notebook-level ``X``, ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Returns nothing; the scores are printed and the tree is drawn
    on a new figure.
    """
    # train_val computes its own predictions, so none are needed here.
    print('\n')
    print(train_val(model, X_train, y_train, X_test, y_test))
    print('\n')
    plt.figure(figsize=(12,8),dpi=100)
    # Pass the names as a plain list: some scikit-learn releases reject a
    # pandas Index for feature_names.
    plot_tree(model.estimators_[0], filled=True, feature_names=list(X.columns), fontsize=10)

In [None]:
# Despite the name this is a forest: 250 trees, each capped at depth 4.
pruned_tree = RandomForestRegressor(max_depth=4, n_estimators=250, random_state=101)
pruned_tree.fit(X_train, y_train)

In [None]:
# Author notes (translated, lightly edited):
# - the tree sends the higher-priced cars to the right and the rest to the left;
#   the strongest feature is the present price
# - in classification 'auto' meant the square root of the feature count; for
#   regression always set max_features explicitly
# - be sure to try (number of features) / 3; because max_features affects which
#   trees get built, try several values and compare the scores
report_model(pruned_tree)

## GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model = RandomForestRegressor(random_state=101)

In [None]:
# Hyper-parameter candidates for the random-forest grid search.
#
# max_features:
#   * 1.0 (use every feature) stands in for the string 'auto', which meant the
#     same thing for regressors but is no longer accepted by newer scikit-learn.
#   * the "one third of the features" rule of thumb must be an int: a float is
#     read as a fraction of the features and has to lie in (0, 1]. It is also
#     computed from X (the predictors) rather than df, which still contains the
#     target column.
#   * the set removes a duplicate when a third of the features equals 2 or 4.
n_features = X.shape[1]
param_grid = {"n_estimators": [200, 500],
              "max_depth": [None, 4, 5, 6, 7, 8],
              "min_samples_leaf": [1, 2, 3, 4],
              "min_samples_split": [2, 3, 5, 6],
              "max_features": [1.0, *sorted({max(1, n_features // 3), 2, 4})]}

In [None]:

# Exhaustive search over param_grid, scored by negated RMSE with 10-fold
# cross-validation; n_jobs=-1 runs the fits in parallel.
grid_model = GridSearchCV(
    model,
    param_grid,
    cv=10,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)

In [None]:
# Author note (translated): no separate cross-validation step is needed here
# (the search was configured with cv=10).
grid_model.fit(X_train,y_train)

In [None]:
grid_model.best_estimator_ 

In [None]:
grid_model.best_params_

In [None]:
grid_score =pd.DataFrame(grid_model.cv_results_)
grid_score

In [None]:
grid_model.best_index_

In [None]:
grid_model.best_score_

In [None]:
# Author notes (best-effort translation; the original wording is fragmentary):
# - the tuned forest scores better than the earlier models
# - bootstrapping draws roughly 2/3 of the rows at random for each tree, and
#   n_estimators sets how many trees are grown
# - max_features: for classification the default is the square root of the
#   feature count
# - random forest is a bagging method: it draws sub-samples and feeds them to
#   the same algorithm
# - selecting important features with a decision tree, and bagging versus
#   boosting, are things most people do not know / are not taught
train_val(grid_model, X_train, y_train, X_test, y_test)

In [None]:
from sklearn.model_selection import cross_validate, cross_val_score

# 10-fold cross-validation of a forest built with fixed hyper-parameters.
# NOTE(review): the values are hard-coded rather than read from
# grid_model.best_params_ -- presumably copied from an earlier search; confirm
# they still match the current grid-search result.
cv_metrics = [
    "r2",
    "neg_mean_absolute_error",
    "neg_mean_squared_error",
    "neg_root_mean_squared_error",
]
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    max_features=4,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=101,
)
scores = cross_validate(model, X_train, y_train, scoring=cv_metrics, cv=10)
df_scores = pd.DataFrame(scores)
# Average over the folds and skip the first two entries (the timing columns
# that cross_validate reports ahead of the metric scores).
df_scores.mean().iloc[2:]

In [None]:
# Test-set table of actual prices, tuned-model predictions and their difference.
y_pred = grid_model.predict(X_test)
my_dict = dict(Actual=y_test, Pred=y_pred, Residual=y_test - y_pred)
compare = pd.DataFrame(data=my_dict)

In [None]:
# Inspect up to 20 test rows. The fixed seed keeps the table (and any plot built
# from it) identical across runs; min() avoids an error when fewer than 20 rows exist.
comp_sample = compare.sample(n=min(20, len(compare)), random_state=101)
comp_sample

In [None]:
# Grouped bars of actual price, prediction and residual for each sampled row.
comp_sample.plot.bar(figsize=(15, 9))
plt.show()