In [1]:
import requests 
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as MSE

### Import and preprocessing data

In [2]:
df = pd.read_csv('auto.csv')

In [3]:
df.head(2)

Unnamed: 0,mpg,displ,hp,weight,accel,origin,size
0,18.0,250.0,88,3139,14.5,US,15.0
1,9.0,304.0,193,4732,18.5,US,20.0


In [4]:
dummy = pd.get_dummies(df['origin'], prefix='origin_')

In [5]:
dummy.head(2)

Unnamed: 0,origin__Asia,origin__Europe,origin__US
0,0,0,1
1,0,0,1


In [6]:
df = pd.concat([df, dummy], axis=1)
df = df.drop(columns='origin')

In [7]:
df.head(2)

Unnamed: 0,mpg,displ,hp,weight,accel,size,origin__Asia,origin__Europe,origin__US
0,18.0,250.0,88,3139,14.5,15.0,0,0,1
1,9.0,304.0,193,4732,18.5,20.0,0,0,1


In [8]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

### Split data into training and testing set

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Instantiate a DecisionTreeRegressor

In [10]:
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.26)

### Compute the array containing the 10-folds CV MSEs

In [11]:
MSE_CV_scores = - cross_val_score(dt, X_train, y_train, cv=10, 
                       scoring='neg_mean_squared_error',
                       n_jobs=-1)

In [12]:
MSE_CV_scores

array([19.6815796 , 19.65006172, 36.58196128, 18.30975418, 15.89852161,
       20.56803986, 15.15091772, 31.90920676, 22.71487658,  6.78486054])

### Compute the 10-folds CV RMSE

In [13]:
RMSE_CV = (MSE_CV_scores.mean())**(1/2)
RMSE_CV

4.552469438288795

### Fit DecisionTreeRegressor to the training set

In [14]:
dt.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=4,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=0.26, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

### Predict the labels of the training set

In [15]:
y_pred_train = dt.predict(X_train)

### Evaluate the training set RMSE of dt

In [16]:
RMSE_train = (MSE(y_train, y_pred_train))**(1/2)
RMSE_train

4.400091700882458