# A Multi Target Random Forest Regression (MTRT)

In this section we develop a multi target random forest regression for the prediction of a minimum and maximum price of laptop models.


## 1. Import Statements


In [1]:
import pandas as pd
import numpy as np
import math
import io

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pydotplus

from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor 

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import plot_tree
from sklearn.externals.six import StringIO

from IPython.display import SVG
from graphviz import Source
from IPython.display import display

## Edit Print options
desired_width = 500
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 600)



## 2. Dealing with missing values



In [2]:
## Importing the preprocessed data
missing_values = ["n/a", "na", "--","NAN"," ","nan","NaN"]
data = pd.read_csv('datafile2.csv',na_values = missing_values)
data2 = pd.read_csv('datafile2_test.csv',na_values = missing_values)

## Removing missing values
# data.dropna()

## Just a temporary dataset to test the capabilities of the model
temp_data = data[['brand','gpu_brand','cpu_brand','cpu_type_name','gpu_type','screen_surface','screen_size','pixels_x',
                  'touchscreen','discrete_gpu','gpu','ram','ssd','detachable_keyboard','weight',
                  'cpu_benchmark','gpu_benchmark','cpu_GHZ', 'min_price','max_price']]

test_data = data2[['brand','gpu_brand','cpu_brand','cpu_type_name','gpu_type','screen_surface','screen_size','pixels_x',
                   'touchscreen','discrete_gpu','gpu','ram','ssd', 'detachable_keyboard','weight',
                   'cpu_benchmark','gpu_benchmark','cpu_GHZ']]

concat_df = pd.concat([temp_data , test_data])

##### TRAINING DATA #####
## Filling in missing values: CATEGORICAL & BINARY VARIABLES
temp_data['detachable_keyboard'] = temp_data['detachable_keyboard'].fillna(0)
# temp_data['detachable_keyboard'] = temp_data['detachable_keyboard'].astype(bool)
temp_data['gpu_brand'] = temp_data['gpu_brand'].fillna("Unknown", inplace = True) 
temp_data['cpu_type_name'] = temp_data.apply(lambda row: row['cpu_brand'] if pd.isnull(row['cpu_type_name']) 
                                             else row['cpu_type_name'], axis=1)
temp_data['gpu_type'] = temp_data['gpu_type'].fillna("No GPU", inplace = True) 
temp_data['screen_surface'] = temp_data['screen_surface'].fillna("Unknown", inplace = True) 

# ## Filling in missing values: NUMERICAL VARIABLES
temp_data['weight'] = temp_data['weight'].fillna(temp_data['weight'].mean())
temp_data['cpu_benchmark'] = temp_data['cpu_benchmark'].fillna(temp_data['cpu_benchmark'].mean())
temp_data['gpu_benchmark'] = temp_data['gpu_benchmark'].fillna(temp_data['gpu_benchmark'].mean())
temp_data['cpu_GHZ'] = temp_data['cpu_GHZ'].fillna(temp_data['cpu_GHZ'].mean())
temp_data['pixels_x'] = temp_data['pixels_x'].fillna(1920)

## Showing the data
# data.head()

########################################################################################################

##### TRAINING & TESTING DATA #####
## Filling in missing values: CATEGORICAL & BINARY VARIABLES
concat_df['detachable_keyboard'] = concat_df['detachable_keyboard'].fillna(0)
# concat_df['detachable_keyboard'] = concat_df['detachable_keyboard'].astype(bool)
concat_df['gpu_brand'] = concat_df['gpu_brand'].fillna("Unknown", inplace = True) 
concat_df['cpu_type_name'] = concat_df.apply(lambda row: row['cpu_brand'] if pd.isnull(row['cpu_type_name']) 
                                             else row['cpu_type_name'], axis=1)
concat_df['gpu_type'] = concat_df['gpu_type'].fillna("No GPU", inplace = True) 
concat_df['screen_surface'] = concat_df['screen_surface'].fillna("Unknown", inplace = True) 

## Filling in missing values: NUMERICAL VARIABLES
concat_df['weight'] = concat_df['weight'].fillna(concat_df['weight'].mean())
concat_df['cpu_benchmark'] = concat_df['cpu_benchmark'].fillna(concat_df['cpu_benchmark'].mean())
concat_df['gpu_benchmark'] = concat_df['gpu_benchmark'].fillna(concat_df['gpu_benchmark'].mean())
concat_df['cpu_GHZ'] = concat_df['cpu_GHZ'].fillna(concat_df['cpu_GHZ'].mean())
concat_df['pixels_x'] = concat_df['pixels_x'].fillna(1920)



of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/

## 3. Feature Importance (Selection)

## 4. Splitting Data & Fitting Data

In [3]:
#### Training Data ####
## Seperating categorical - binary - numerical variables
X_cat = temp_data[['brand','gpu','gpu_brand','cpu_type_name','gpu_type','screen_surface']]
X_bin = temp_data[['touchscreen','discrete_gpu','detachable_keyboard']]
X_num = temp_data[['screen_size','ram','ssd','weight','cpu_benchmark','cpu_GHZ','pixels_x']]

## Creating dummy variables:
X_cat_dummies = pd.get_dummies(X_cat, drop_first=True)

## Merging input data
X = pd.concat([X_cat_dummies,X_bin,X_num], axis=1)
# X = pd.concat([X_cat_dummies,X_num], axis=1)
# X = X_cat_dummies

## Defining output data
Ya = temp_data[['min_price','max_price']]
# Yb = temp_data[['min_price','diff_price']]

########################################################################################
## Seperating categorical - binary - numerical variables
X_cat_test = concat_df[['brand','gpu','gpu_brand','cpu_type_name','gpu_type','screen_surface']]
X_bin_test = concat_df[['touchscreen','discrete_gpu','detachable_keyboard']]
X_num_test = concat_df[['screen_size','ram','ssd','weight','cpu_benchmark','cpu_GHZ','pixels_x']]

## Creating dummy variables:
X_cat_dummies_test = pd.get_dummies(X_cat_test, drop_first=True)

## Merging input data
X_full = pd.concat([X_cat_dummies_test,X_bin_test,X_num_test], axis=1)
X_train = X_full.head(len(data))
X_test = X_full.tail(len(data2))

########################################################################################
## Splitting the dataset into train and test sets
# Predict Minimum and Maximum prices
X_train_a, X_test_a, Y_train_a, Y_test_a = train_test_split(X, Ya, test_size=0.33, random_state=42)
# Predict Minimum price and deviation from this price
# X_train_b, X_test_b, Y_train_b, Y_test_b = train_test_split(X, Yb, test_size=0.33, random_state=42)

X_train_a.head()

## 2 models
Model1 = RandomForestRegressor(n_estimators = 300,criterion='mae',random_state = 1)
Model1.fit(X_train_a,Y_train_a)

# Model2 = RandomForestRegressor(n_estimators = 200,criterion='mae',random_state = 0)
# Model2.fit(X_train_b,Y_train_b)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=None, oob_score=False,
                      random_state=1, verbose=0, warm_start=False)

## 5. Measuring Mean Summed Absolute Error

### A: MSAE for predicting minimum and maximum  price

In [4]:
## Predicting the target values
Y_pred = Model1.predict(X_test_a)

## Calculating the score
MAE = mean_absolute_error(Y_test_a, Y_pred, multioutput='raw_values')
print(MAE[0]+MAE[1])

## Turn DF to array
Y_test_array = Y_test_a.to_numpy()

Sum_error = 0
Y_pred_min_array = []
for i in range(len(Y_pred)):
#     print(Y_pred[i],Y_test_array[i])
    Y_pred_min = Y_pred[i][0]
    Y_pred_max = Y_pred[i][1]
    Y_test_min = Y_test_array[i][0]
    Y_test_max = Y_test_array[i][1]
    
    Y_pred_min_array.append(Y_pred_min)
    
    Error_min = abs(Y_pred_min - Y_test_min)
    Error_max = abs(Y_pred_max - Y_test_max)
    Total_error = Error_min + Error_max
    
    Sum_error = Sum_error + Total_error
    counter = i

Avg_error = Sum_error / counter
MSAE = Avg_error
print(MSAE)



324.63657633136086
326.5689369047622


### B: MSAE for predicting minimum price and price difference

**Result:** No significant improvement was obtained. This is as expected because the regression tree considers the two outputs, min and max price, at the same time. They are not considered independent and the splits are based on improving both outputs.

In [5]:
# Y_pred = Model2.predict(X_test_b)

# ## Turn DF to array
# Y_test_array = Y_test_b.to_numpy()

# Sum_error = 0
# Y_pred_min_array = []
# for i in range(len(Y_pred)):
# #     print(Y_pred[i],Y_test_array[i])
#     Y_pred_min = Y_pred[i][0]
#     Y_pred_max = Y_pred_min + Y_pred[i][1]
#     Y_test_min = Y_test_array[i][0]
#     Y_test_max = Y_test_min + Y_test_array[i][1]
    
#     Y_pred_min_array.append(Y_pred_min)
    
#     Error_min = abs(Y_pred_min - Y_test_min)
#     Error_max = abs(Y_pred_max - Y_test_max)
#     Total_error = Error_min + Error_max
    
#     Sum_error = Sum_error + Total_error
#     counter = i

# Avg_error = Sum_error / counter
# MSAE = Avg_error
# print('The MSAE = ', round(MSAE,3))



## 6. Visualization of Model and Data

In [6]:
## Printing the decision tree
# FEATURE_NAMES = X_train.columns[:]


# dot_data = export_graphviz(Model1,
#                                out_file=None,
#                                feature_names=FEATURE_NAMES,
#                                filled = True)
# graph = Source(dot_data)

# # dot_data = io.StringIO()
# # export_graphviz(Model1, out_file=dot_data, rounded=True, filled=True)
# # filename = "tree.png"
# # pydotplus.graph_from_dot_data(dot_data.getvalue()).write_png(filename)
# # plt.figure(figsize=(300,100))
# # img = mpimg.imread(filename)
# # imgplot = plt.imshow(img)
# # plt.show()

# plt.figure(figsize=(120,50))
# plot_tree(Model1, feature_names=FEATURE_NAMES,fontsize=11, rounded=True)
# plt.show()

In [7]:
# Visualising the Random Forest Regression results 
  
# arange for creating a range of values 
# from min value of x to max  
# value of x with a difference of 0.01  
# between two consecutive values 
# X_grid = np.arange(X_test, max(x), 0.01)  
# X_ram
  
# reshape for reshaping the data into a len(X_grid)*1 array,  
# i.e. to make a column out of the X_grid value                   
# X_grid = X_grid.reshape((len(X_grid), 1)) 
  
# Scatter plot for original data 
# plt.scatter(x, y, color = 'blue')   
  
# plot predicted data 
# plt.plot(Y_pred_min_array,  
#          color = 'green')  
# plt.title('Random Forest Regression') 
# plt.xlabel('Position level') 
# plt.ylabel('Max_price') 
# plt.show()

## 7. Predicting the test set

In [8]:
Model_Final = RandomForestRegressor(n_estimators = 300,criterion='mae',random_state = 1)
Model_Final.fit(X_train,Ya)
Y_pred_test = Model_Final.predict(X_test)

RESULTS = []
for i in range(len(data2)):
    temp_result = []
    temp_result.append(data2['id'][i])
    temp_result.append(Y_pred_test[i][0])
    temp_result.append(Y_pred_test[i][1])
    RESULTS.append(temp_result)

RESULTS = pd.DataFrame(RESULTS)
print(RESULTS)



         0            1            2
0    28807  1127.365700  1193.300833
1    22559   385.427733   401.750700
2    28647   827.645917   859.047067
3    22141   522.619500   542.157983
4    26116  2150.110633  2332.082700
5    27111  1362.687733  1404.905500
6    23420   628.582983   649.678300
7    21464   182.872150   189.135267
8    29405   729.264450   761.076017
9    27107   406.104800   414.568367
10   26141   997.807500  1032.610933
11   25928  1565.318000  1663.880400
12   24845   710.034967   766.738733
13   28804   339.491200   355.222050
14   26772   611.625300   618.990300
15   27413   886.264133   911.062567
16    7261   819.436950   866.083367
17   31424   761.371833   780.192500
18    3940   404.038400   407.649100
19   16238   533.887333   552.749467
20   29407  1247.391467  1299.622050
21   24011   351.231833   371.008367
22    8542   370.185267   377.719533
23   14448   726.980300   740.924367
24   19611   229.921900   239.524233
25   30068  1198.298333  1258.368500
2

In [9]:
RESULTS.to_csv("/Users/Simon/Documents/GitHub/adana123/RESULTS.csv",header=False,index=False)