# A Multi Target Random Forest Regression (MTRT)

In this section we develop a multi target random forest regression for the prediction of a minimum and maximum price of laptop models.


## 1. Import Statements


In [1]:
import pandas as pd
import numpy as np
import math
import io

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pydotplus

from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor 

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import plot_tree
from sklearn.externals.six import StringIO

from IPython.display import SVG
from graphviz import Source
from IPython.display import display

## Edit Print options
desired_width = 500
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 600)



## 2. Dealing with missing values



In [2]:
## Importing the preprocessed data
missing_values = ["n/a", "na", "--","NAN"," ","nan","NaN",""]
data = pd.read_csv('datafile2.csv',na_values = missing_values)
data2 = pd.read_csv('datafile2_test.csv',na_values = missing_values)

## Removing missing values
# data.dropna()

## Just a temporary dataset to test the capabilities of the model
temp_data = data[['brand','base_name','os_details','gpu_brand','cpu_brand','cpu_type_name','gpu_type','screen_surface','screen_size','pixels_x',
                  'touchscreen','discrete_gpu','gpu','ram','ssd','detachable_keyboard','weight',
                  'cpu_benchmark','gpu_benchmark','cpu_GHZ', 'min_price','max_price']]

test_data = data2[['brand','base_name','os_details','gpu_brand','cpu_brand','cpu_type_name','gpu_type','screen_surface','screen_size','pixels_x',
                   'touchscreen','discrete_gpu','gpu','ram','ssd', 'detachable_keyboard','weight',
                   'cpu_benchmark','gpu_benchmark','cpu_GHZ']]

concat_df = pd.concat([temp_data , test_data])
concat_df.to_csv("/Users/Simon/Documents/GitHub/adana123/concat_df1.csv",header=True,index=True)

##### TRAINING DATA #####
## Filling in missing values: CATEGORICAL & BINARY VARIABLES
temp_data['detachable_keyboard'] = temp_data['detachable_keyboard'].fillna(0)

temp_data['gpu_brand'] = temp_data['gpu_brand'].fillna("Unknown", inplace = True) 
temp_data['cpu_type_name'] = temp_data.apply(lambda row: row['cpu_brand'] if pd.isnull(row['cpu_type_name']) 
                                             else row['cpu_type_name'], axis=1)
temp_data['gpu_type'] = temp_data['gpu_type'].fillna("No GPU", inplace = True) 
temp_data['screen_surface'] = temp_data['screen_surface'].fillna("Unknown", inplace = True) 
temp_data['os_details'] = temp_data['os_details'].fillna("Unknown", inplace = False) 

# ## Filling in missing values: NUMERICAL VARIABLES
temp_data['weight'] = temp_data['weight'].fillna(temp_data['weight'].mean())
temp_data['cpu_benchmark'] = temp_data['cpu_benchmark'].fillna(temp_data['cpu_benchmark'].mean()) 
temp_data['gpu_benchmark'] = temp_data['gpu_benchmark'].fillna(temp_data['gpu_benchmark'].mean())
temp_data['cpu_GHZ'] = temp_data['cpu_GHZ'].fillna(temp_data['cpu_GHZ'].mean())
temp_data['pixels_x'] = temp_data['pixels_x'].fillna(temp_data['pixels_x'].mode()) #Modus! Most frequent number! (median)

## Showing the data
# data.head()

########################################################################################################

##### TRAINING & TESTING DATA #####
## Filling in missing values: CATEGORICAL & BINARY VARIABLES
detach_key = int(temp_data.loc[:,'detachable_keyboard'].mode())
concat_df.loc[:,'detachable_keyboard'] = concat_df.loc[:,'detachable_keyboard'].fillna(detach_key)
# print(concat_df['detachable_keyboard'])
concat_df['gpu_brand'] = concat_df['gpu_brand'].fillna("Unknown", inplace = False) 
concat_df['cpu_type_name'] = concat_df.apply(lambda row: row['cpu_brand'] if pd.isnull(row['cpu_type_name']) 
                                             else row['cpu_type_name'], axis=1)

concat_df['gpu_type'] = concat_df['gpu_type'].fillna("No GPU", inplace = False)
concat_df['gpu'] = concat_df['gpu'].fillna("No GPU", inplace = False) 
concat_df['screen_surface'] = concat_df['screen_surface'].fillna("Unknown", inplace = False) 
concat_df['os_details'] = concat_df['os_details'].fillna("Unknown", inplace = False) 

## Filling in missing values: NUMERICAL VARIABLES
concat_df['weight'] = concat_df['weight'].fillna(temp_data['weight'].mean())
concat_df['cpu_benchmark'] = concat_df['cpu_benchmark'].fillna(temp_data['cpu_benchmark'].mean()) 
concat_df['gpu_benchmark'] = concat_df['gpu_benchmark'].fillna(temp_data['gpu_benchmark'].mean())
concat_df['cpu_GHZ'] = concat_df['cpu_GHZ'].fillna(temp_data['cpu_GHZ'].mean())

pixel_mode = int(temp_data.loc[:,'pixels_x'].mode())
concat_df.loc[:,'pixels_x'] = concat_df.loc[:,'pixels_x'].fillna(pixel_mode)




concat_df.to_csv("/Users/Simon/Documents/GitHub/adana123/concat_df2.csv",header=True,index=True)

#### Mean only from training set!

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/

## 3. Feature Importance (Selection)

## 4. Splitting Data & Fitting Data

In [14]:
#### Training Data ####
## Seperating categorical - binary - numerical variables
X_cat = temp_data[['brand','cpu_brand','gpu_brand','cpu_type_name','screen_surface','gpu']]#,'gpu_type','os_details','base_name'
X_bin = temp_data[['touchscreen','discrete_gpu','detachable_keyboard']]
X_num = temp_data[['screen_size','ram','ssd','weight','cpu_GHZ','pixels_x']]#,'cpu_benchmark','gpu_benchmark'

## Creating dummy variables:
X_cat_dummies = pd.get_dummies(X_cat, drop_first=True)

## Merging input data
X = pd.concat([X_cat_dummies,X_bin,X_num], axis=1)
# X = pd.concat([X_cat_dummies,X_num], axis=1)
# X = X_cat_dummies

## Defining output data
Ya = temp_data[['min_price','max_price']]
# Yb = temp_data[['min_price','diff_price']]

########################################################################################
## Seperating categorical - binary - numerical variables
X_cat_test = concat_df[['brand','cpu_brand','gpu','gpu_brand','cpu_type_name','gpu_type','screen_surface']]#'os_details','base_name'
X_bin_test = concat_df[['touchscreen','discrete_gpu','detachable_keyboard']]
X_num_test = concat_df[['screen_size','ram','ssd','weight','cpu_GHZ','pixels_x','cpu_benchmark','gpu_benchmark']]

## Creating dummy variables:
X_cat_dummies_test = pd.get_dummies(X_cat_test, drop_first=True)

## Merging input data
X_full = pd.concat([X_cat_dummies_test,X_bin_test,X_num_test], axis=1)
X_train = X_full.head(len(data))
X_test = X_full.tail(len(data2))

########################################################################################
## Splitting the dataset into train and test sets
# Predict Minimum and Maximum prices
X_train_a, X_test_a, Y_train_a, Y_test_a = train_test_split(X, Ya, test_size=0.33, random_state=42)
# Predict Minimum price and deviation from this price
# X_train_b, X_test_b, Y_train_b, Y_test_b = train_test_split(X, Yb, test_size=0.33, random_state=42)

X_train_a.head()

## 2 models
Model1 = RandomForestRegressor(n_estimators = 100,criterion='mae',random_state = 1)
Model1.fit(X_train_a,Y_train_a)

# Model2 = RandomForestRegressor(n_estimators = 200,criterion='mae',random_state = 0)
# Model2.fit(X_train_b,Y_train_b)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=1, verbose=0, warm_start=False)

## 5. Measuring Mean Summed Absolute Error

### A: MSAE for predicting minimum and maximum  price

In [15]:
## Predicting the target values
Y_pred = Model1.predict(X_test_a)

## Calculating the score
MAE = mean_absolute_error(Y_test_a, Y_pred, multioutput='raw_values')
print(MAE[0]+MAE[1])

## Turn DF to array
Y_test_array = Y_test_a.to_numpy()

Sum_error = 0
Y_pred_min_array = []
for i in range(len(Y_pred)):
#     print(Y_pred[i],Y_test_array[i])
    Y_pred_min = Y_pred[i][0]
    Y_pred_max = Y_pred[i][1]
    Y_test_min = Y_test_array[i][0]
    Y_test_max = Y_test_array[i][1]
    
    Y_pred_min_array.append(Y_pred_min)
    
    Error_min = abs(Y_pred_min - Y_test_min)
    Error_max = abs(Y_pred_max - Y_test_max)
    Total_error = Error_min + Error_max
    
    Sum_error = Sum_error + Total_error
    counter = i

Avg_error = Sum_error / counter
MSAE = Avg_error
print(MSAE)



319.68249763313605
321.5853696428571


### B: MSAE for predicting minimum price and price difference

**Result:** No significant improvement was obtained. This is as expected because the regression tree considers the two outputs, min and max price, at the same time. They are not considered independent and the splits are based on improving both outputs.

In [5]:
# Y_pred = Model2.predict(X_test_b)

# ## Turn DF to array
# Y_test_array = Y_test_b.to_numpy()

# Sum_error = 0
# Y_pred_min_array = []
# for i in range(len(Y_pred)):
# #     print(Y_pred[i],Y_test_array[i])
#     Y_pred_min = Y_pred[i][0]
#     Y_pred_max = Y_pred_min + Y_pred[i][1]
#     Y_test_min = Y_test_array[i][0]
#     Y_test_max = Y_test_min + Y_test_array[i][1]
    
#     Y_pred_min_array.append(Y_pred_min)
    
#     Error_min = abs(Y_pred_min - Y_test_min)
#     Error_max = abs(Y_pred_max - Y_test_max)
#     Total_error = Error_min + Error_max
    
#     Sum_error = Sum_error + Total_error
#     counter = i

# Avg_error = Sum_error / counter
# MSAE = Avg_error
# print('The MSAE = ', round(MSAE,3))



## 6. Visualization of Model and Data

In [6]:
## Printing the decision tree
# FEATURE_NAMES = X_train.columns[:]


# dot_data = export_graphviz(Model1,
#                                out_file=None,
#                                feature_names=FEATURE_NAMES,
#                                filled = True)
# graph = Source(dot_data)

# # dot_data = io.StringIO()
# # export_graphviz(Model1, out_file=dot_data, rounded=True, filled=True)
# # filename = "tree.png"
# # pydotplus.graph_from_dot_data(dot_data.getvalue()).write_png(filename)
# # plt.figure(figsize=(300,100))
# # img = mpimg.imread(filename)
# # imgplot = plt.imshow(img)
# # plt.show()

# plt.figure(figsize=(120,50))
# plot_tree(Model1, feature_names=FEATURE_NAMES,fontsize=11, rounded=True)
# plt.show()

In [7]:
# Visualising the Random Forest Regression results 
  
# arange for creating a range of values 
# from min value of x to max  
# value of x with a difference of 0.01  
# between two consecutive values 
# X_grid = np.arange(X_test, max(x), 0.01)  
# X_ram
  
# reshape for reshaping the data into a len(X_grid)*1 array,  
# i.e. to make a column out of the X_grid value                   
# X_grid = X_grid.reshape((len(X_grid), 1)) 
  
# Scatter plot for original data 
# plt.scatter(x, y, color = 'blue')   
  
# plot predicted data 
# plt.plot(Y_pred_min_array,  
#          color = 'green')  
# plt.title('Random Forest Regression') 
# plt.xlabel('Position level') 
# plt.ylabel('Max_price') 
# plt.show()

## 7. Predicting the test set

In [8]:
Model_Final = RandomForestRegressor(n_estimators = 1000,criterion='mae',random_state = 1)
Model_Final.fit(X_train,Ya)
Y_pred_test = Model_Final.predict(X_test)

RESULTS = []
for i in range(len(data2)):
    temp_result = []
    temp_result.append(data2['id'][i])
    temp_result.append(Y_pred_test[i][0])
    temp_result.append(Y_pred_test[i][1])
    RESULTS.append(temp_result)

RESULTS = pd.DataFrame(RESULTS)
print(RESULTS)



         0            1            2
0    28807  1145.900515  1214.424845
1    22559   384.238435   401.199165
2    28647   787.307420   818.843230
3    22141   525.181825   544.099305
4    26116  2070.852230  2258.131030
5    27111  1352.791120  1399.565130
6    23420   609.725945   635.258840
7    21464   186.433140   192.890625
8    29405   729.090325   766.278475
9    27107   395.410330   405.608615
10   26141   939.515660   973.619190
11   25928  1549.611415  1658.296720
12   24845   714.503900   770.309440
13   28804   359.498665   377.767285
14   26772   607.776625   615.346155
15   27413   914.778110   941.755370
16    7261   853.705100   896.528920
17   31424   783.663000   804.042105
18    3940   421.289455   430.860235
19   16238   530.275220   548.869615
20   29407  1204.709470  1253.337185
21   24011   338.141380   357.947060
22    8542   390.783045   399.088780
23   14448   704.558620   719.857710
24   19611   233.742515   243.896485
25   30068  1203.861410  1264.879710
2

In [9]:
RESULTS.to_csv("/Users/Simon/Documents/GitHub/adana123/RESULTS.csv",header=False,index=False)