# A Multi Target Random Forest Regression (MTRT)

In this section we develop a multi target random forest regression for the prediction of a minimum and maximum price of laptop models.


## 1. Import Statements


In [1]:
import pandas as pd
import numpy as np
import math
import io

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pydotplus

from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor 

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import plot_tree
from sklearn.externals.six import StringIO

from IPython.display import SVG
from graphviz import Source
from IPython.display import display

## Edit Print options
desired_width = 500
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 600)



## 2. Dealing with missing values



In [2]:
## Importing the preprocessed data
missing_values = ["n/a", "na", "--","NAN"," ","nan","NaN"]
data = pd.read_csv('datafile2.csv',na_values = missing_values)
test_data = pd.read_csv('test.csv',na_values = missing_values)

## Removing missing values
# data.dropna()

## Just a temporary dataset to test the capabilities of the model
temp_data = data[['brand','gpu_brand','cpu_brand','cpu_type_name','gpu_type','screen_size','touchscreen','discrete_gpu','gpu','ram','ssd', 
                  'detachable_keyboard','weight','cpu_benchmark','gpu_benchmark','cpu_GHZ', 'min_price','max_price','diff_price']]


## Filling in missing values: CATEGORICAL & BINARY VARIABLES
temp_data['detachable_keyboard'] = temp_data['detachable_keyboard'].fillna(0)
# temp_data['detachable_keyboard'] = temp_data['detachable_keyboard'].astype(bool)
temp_data['gpu_brand'] = temp_data['gpu_brand'].fillna("Unknown", inplace = True) 
temp_data['cpu_type_name'] = temp_data.apply(lambda row: row['cpu_brand'] if pd.isnull(row['cpu_type_name']) 
                                             else row['cpu_type_name'], axis=1)
temp_data['gpu_type'] = temp_data['gpu_type'].fillna("No GPU", inplace = True) 


## Filling in missing values: NUMERICAL VARIABLES
temp_data['weight'] = temp_data['weight'].fillna(temp_data['weight'].mean())
temp_data['cpu_benchmark'] = temp_data['cpu_benchmark'].fillna(temp_data['cpu_benchmark'].mean())
temp_data['gpu_benchmark'] = temp_data['gpu_benchmark'].fillna(temp_data['gpu_benchmark'].mean())
temp_data['cpu_GHZ'] = temp_data['cpu_GHZ'].fillna(temp_data['cpu_GHZ'].mean())

## Showing the data
data.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Unnamed: 0.1,Unnamed: 0,id,name,brand,base_name,screen_size,pixels_x,pixels_y,screen_surface,touchscreen,cpu,cpu_details,detachable_keyboard,discrete_gpu,gpu,...,ssd,storage,weight,min_price,max_price,cpu_brand,cpu_type_name,gpu_brand,gpu_type,os_details_2,cpu_details_2,cpu_benchmark,gpu_benchmark,diff_price,cpu_GHZ
0,0,7774,LENOVO FLEX 3 15.6-INCH TOUCHSCREEN LAPTOP (CO...,LENOVO,LENOVO FLEX 3 80JM002CUS,15.6,1920,1080,GLOSSY,1,INTEL CORE I7,INTEL CORE I7-5500U 2.40 GHZ (5TH GEN BROADWEL...,0.0,0,INTEL HD,...,0,1000,4.6,899.0,899.0,INTEL,CORE I7,INTEL,HD,10,5500U,2715.0,10.0,0.0,2.4
1,1,25926,RAZER BLADE 15 GAMING LAPTOP: INTEL CORE I7-87...,RAZER,RAZER BLADE 15,15.6,1920,1080,MATTE,0,INTEL CORE I7,INTEL CORE I7-8750H 2.2 GHZ (8TH GEN COFFEE LA...,0.0,1,NVIDIA GEFORCE RTX 2070 MAX-Q,...,512,512,4.63,2099.99,2099.99,INTEL,CORE I7,NVIDIA,GEFORCE RTX 2070 MAX-Q,10 HOME,8750H,10418.0,,0.0,2.2
2,2,25267,HP 15.6 INCH HD THIN AND LIGHT LAPTOP ( 7TH GE...,HP,HP,15.6,1366,768,,0,AMD A6,AMD A6-9220 2.5 GHZ (7TH GEN STONEY RIDGE DUAL...,0.0,0,AMD RADEON R4,...,0,500,4.63,439.0,449.0,AMD,A6,AMD,RADEON R4,10,9220,1032.0,,10.0,2.5
3,3,22367,"ACER ASPIRE E 15, 15.6"" FULL HD, 8TH GEN INTEL...",ACER,ACER ASPIRE E5-576,15.6,1920,1080,MATTE,0,INTEL CORE I3,INTEL CORE I3-8130U 2.2 GHZ (8TH GEN KABY LAKE...,0.0,0,INTEL UHD 620,...,0,1000,5.3,375.0,449.0,INTEL,CORE I3,INTEL,UHD 620,10 HOME,8130U,3676.0,888.0,74.0,2.2
4,4,17471,"HP 17.3 INCH (1600 X 900) HD+ LAPTOP PC, INTEL...",HP,HP 17,17.3,1600,900,GLOSSY,0,INTEL CORE I5,INTEL CORE I5-7200U 2.5 GHZ (7TH GEN KABY LAKE...,0.0,0,INTEL HD 620,...,0,1000,5.8,559.0,559.0,INTEL,CORE I5,INTEL,HD 620,10,7200U,3522.0,888.0,0.0,2.5


## 3. Feature Importance (Selection)

## 4. Splitting Data & Fitting Data

In [3]:

## Seperating categorical - binary - numerical variables
X_cat = temp_data[['brand','gpu','gpu_brand','cpu_type_name','gpu_type']]
X_bin = temp_data[['touchscreen','discrete_gpu','detachable_keyboard']]
X_num = temp_data[['screen_size','ram','ssd','weight','cpu_benchmark','cpu_GHZ']]

## Creating dummy variables:
X_cat_dummies = pd.get_dummies(X_cat, drop_first=True)

## Merging input data
X = pd.concat([X_cat_dummies,X_bin,X_num], axis=1)
# X = pd.concat([X_cat_dummies,X_num], axis=1)
# X = X_cat_dummies

## Defining output data
Ya = temp_data[['min_price','max_price']]
Yb = temp_data[['min_price','diff_price']]


## Splitting the dataset into train and test sets
# Predict Minimum and Maximum prices
X_train_a, X_test_a, Y_train_a, Y_test_a = train_test_split(X, Ya, test_size=0.33, random_state=42)
# Predict Minimum price and deviation from this price
# X_train_b, X_test_b, Y_train_b, Y_test_b = train_test_split(X, Yb, test_size=0.33, random_state=42)

X_train_a.head()

## 2 models
Model1 = RandomForestRegressor(n_estimators = 300,criterion='mae',random_state = 1)
Model1.fit(X_train_a,Y_train_a)

# Model2 = RandomForestRegressor(n_estimators = 200,criterion='mae',random_state = 0)
# Model2.fit(X_train_b,Y_train_b)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=None, oob_score=False,
                      random_state=1, verbose=0, warm_start=False)

## 5. Measuring Mean Summed Absolute Error

### A: MSAE for predicting minimum and maximum  price

In [4]:
## Predicting the target values
Y_pred = Model1.predict(X_test_a)

## Calculating the score
MAE = mean_absolute_error(Y_test_a, Y_pred, multioutput='raw_values')
print(MAE[0]+MAE[1])

## Turn DF to array
Y_test_array = Y_test_a.to_numpy()

Sum_error = 0
Y_pred_min_array = []
for i in range(len(Y_pred)):
#     print(Y_pred[i],Y_test_array[i])
    Y_pred_min = Y_pred[i][0]
    Y_pred_max = Y_pred[i][1]
    Y_test_min = Y_test_array[i][0]
    Y_test_max = Y_test_array[i][1]
    
    Y_pred_min_array.append(Y_pred_min)
    
    Error_min = abs(Y_pred_min - Y_test_min)
    Error_max = abs(Y_pred_max - Y_test_max)
    Total_error = Error_min + Error_max
    
    Sum_error = Sum_error + Total_error
    counter = i

Avg_error = Sum_error / counter
MSAE = Avg_error
print(MSAE)



339.936504536489
341.95993611111095


### B: MSAE for predicting minimum price and price difference

**Result:** No significant improvement was obtained. This is as expected because the regression tree considers the two outputs, min and max price, at the same time. They are not considered independent and the splits are based on improving both outputs.

In [5]:
# Y_pred = Model2.predict(X_test_b)

# ## Turn DF to array
# Y_test_array = Y_test_b.to_numpy()

# Sum_error = 0
# Y_pred_min_array = []
# for i in range(len(Y_pred)):
# #     print(Y_pred[i],Y_test_array[i])
#     Y_pred_min = Y_pred[i][0]
#     Y_pred_max = Y_pred_min + Y_pred[i][1]
#     Y_test_min = Y_test_array[i][0]
#     Y_test_max = Y_test_min + Y_test_array[i][1]
    
#     Y_pred_min_array.append(Y_pred_min)
    
#     Error_min = abs(Y_pred_min - Y_test_min)
#     Error_max = abs(Y_pred_max - Y_test_max)
#     Total_error = Error_min + Error_max
    
#     Sum_error = Sum_error + Total_error
#     counter = i

# Avg_error = Sum_error / counter
# MSAE = Avg_error
# print('The MSAE = ', round(MSAE,3))



## 6. Visualization of Model and Data

In [6]:
## Printing the decision tree
# FEATURE_NAMES = X_train.columns[:]


# dot_data = export_graphviz(Model1,
#                                out_file=None,
#                                feature_names=FEATURE_NAMES,
#                                filled = True)
# graph = Source(dot_data)

# # dot_data = io.StringIO()
# # export_graphviz(Model1, out_file=dot_data, rounded=True, filled=True)
# # filename = "tree.png"
# # pydotplus.graph_from_dot_data(dot_data.getvalue()).write_png(filename)
# # plt.figure(figsize=(300,100))
# # img = mpimg.imread(filename)
# # imgplot = plt.imshow(img)
# # plt.show()

# plt.figure(figsize=(120,50))
# plot_tree(Model1, feature_names=FEATURE_NAMES,fontsize=11, rounded=True)
# plt.show()

In [7]:
# Visualising the Random Forest Regression results 
  
# arange for creating a range of values 
# from min value of x to max  
# value of x with a difference of 0.01  
# between two consecutive values 
# X_grid = np.arange(X_test, max(x), 0.01)  
# X_ram
  
# reshape for reshaping the data into a len(X_grid)*1 array,  
# i.e. to make a column out of the X_grid value                   
# X_grid = X_grid.reshape((len(X_grid), 1)) 
  
# Scatter plot for original data 
# plt.scatter(x, y, color = 'blue')   
  
# plot predicted data 
# plt.plot(Y_pred_min_array,  
#          color = 'green')  
# plt.title('Random Forest Regression') 
# plt.xlabel('Position level') 
# plt.ylabel('Max_price') 
# plt.show()