In [1]:
%matplotlib inline

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor

# Regression Live Demo

In [3]:
mpg_data = pd.read_fwf('data/auto-mpg.data', header = None, na_vals = '?')
mpg_data

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,"""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,"""buick skylark 320"""
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,"""plymouth satellite"""
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,"""amc rebel sst"""
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,"""ford torino"""
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.00,2790.0,15.6,82,1,"""ford mustang gl"""
394,44.0,4,97.0,52.00,2130.0,24.6,82,2,"""vw pickup"""
395,32.0,4,135.0,84.00,2295.0,11.6,82,1,"""dodge rampage"""
396,28.0,4,120.0,79.00,2625.0,18.6,82,1,"""ford ranger"""


In [4]:
mpg_data.columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']

In [5]:
mpg_data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,"""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,"""buick skylark 320"""
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,"""plymouth satellite"""
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,"""amc rebel sst"""
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,"""ford torino"""


In [6]:
mpg_data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model_year        int64
origin            int64
car_name         object
dtype: object

In [7]:
mpg_data.horsepower.unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [8]:
mpg_data.horsepower.astype(np.float64, errors = 'ignore')

0      130.0
1      165.0
2      150.0
3      150.0
4      140.0
       ...  
393    86.00
394    52.00
395    84.00
396    79.00
397    82.00
Name: horsepower, Length: 398, dtype: object

In [9]:
mpg_data = mpg_data.drop(mpg_data[mpg_data.horsepower == '?'].index)

In [10]:
mpg_data.horsepower = mpg_data.horsepower.astype(float)

In [11]:
mpg_data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight          float64
acceleration    float64
model_year        int64
origin            int64
car_name         object
dtype: object

In [12]:
mpg_data.shape

(392, 9)

In [13]:
# mpg_data.horsepower.replace(('?': 'nan'))

In [14]:
mpg_data = mpg_data.drop('car_name', axis = 1)

In [15]:
mpg_data_attributes = mpg_data.drop('mpg', axis = 1)
mpg_data_target = mpg_data['mpg']

In [16]:
mpg_data_attributes_scaled = MinMaxScaler().fit_transform(mpg_data_attributes)

In [19]:
attributes_train, attributes_test, targets_train, targets_test = train_test_split(
    mpg_data_attributes_scaled,
    mpg_data_target, 
    test_size = 0.2,
    random_state = 42
)

In [22]:
tree = DecisionTreeRegressor()

In [23]:
tree.fit(attributes_train, targets_train)

DecisionTreeRegressor()

In [25]:
tree.score(attributes_train, targets_train)

1.0

In [26]:
tree.score(attributes_test, targets_test)

0.7609769982371054

In [29]:
tree.decision_path(attributes_train[: 10])

<10x527 sparse matrix of type '<class 'numpy.int64'>'
	with 110 stored elements in Compressed Sparse Row format>

In [30]:
tree.get_depth()

16

In [33]:
tree.feature_importances_

array([0.00242178, 0.63063737, 0.17166741, 0.06250934, 0.02389405,
       0.10293826, 0.0059318 ])

In [34]:
type(mpg_data_attributes)

pandas.core.frame.DataFrame

In [35]:
mpg_data_attributes.columns

Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'model_year', 'origin'],
      dtype='object')

In [38]:
list(zip(mpg_data_attributes.columns, tree.feature_importances_))

[('cylinders', 0.002421775140366278),
 ('displacement', 0.6306373671945167),
 ('horsepower', 0.17166740644288864),
 ('weight', 0.06250934348555037),
 ('acceleration', 0.023894049876603523),
 ('model_year', 0.10293825600515483),
 ('origin', 0.00593180185491984)]