In [44]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor,GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR,LinearSVR
from sklearn.metrics import mean_squared_error,r2_score

In [45]:
## Load the trainign data and test data
df_train=pd.read_csv("../data/training_data.csv")
df_test=pd.read_csv("../data/test_data.csv")

In [46]:
df_train.head()

Unnamed: 0,truckingcompanyid,productid,distanceinmiles,fuelcost,linehaulcost,source,destination
0,9343,2,3600.84,6.17,64.09,34,21
1,2363,23,1265.16,4.5,628.64,29,42
2,9634,23,486.6,5.52,351.3,42,6
3,5243,15,973.2,6.34,57.65,33,4
4,2149,20,2043.72,5.65,823.44,12,31


In [79]:
X_columns=['truckingcompanyid','productid','distanceinmiles','fuelcost','source']
y_columns=['linehaulcost']

In [80]:
X_train=df_train[X_columns]
y_train=df_train[y_columns]
X_test=df_test[X_columns]
y_test=df_test[y_columns]

In [86]:
classifiers_1 = [
    'KernelRidge',
    'DecisionTreeRegressor',
    'RandomForestRegressor',
    'GradientBoostingRegressor',
    'AdaBoostRegressor',
    'SVR', 
    'LinearSVR',
    'PolynomialRegression'
    ]

In [87]:
%%time
classifiers = [
    KernelRidge(),
    DecisionTreeRegressor(max_depth=3),
    RandomForestRegressor(n_estimators=5000),
    GradientBoostingRegressor(n_estimators=5000),
    AdaBoostRegressor(n_estimators=5000),
    SVR(), 
    LinearSVR(),
    Pipeline([('poly', PolynomialFeatures(degree=2)),('linear', LinearRegression(fit_intercept=False))])
    ]

name = []
test_score = []
models = []
test_rmse = []
train_rmse=[]
train_r2_score=[]
test_r2_score=[]
regsr=[]
i = 0
for classifier in classifiers:
    classifier.fit(X_train, y_train.linehaulcost)   
    name.append(classifiers_1[i] )
    train_r2_score.append(r2_score(y_train, classifier.predict(X_train)))
    test_r2_score.append(r2_score(y_test, classifier.predict(X_test)))
    test_score.append(classifier.score(X_test, y_test))
    train_rmse.append(np.sqrt(mean_squared_error(y_train,classifier.predict(X_train))))
    test_rmse.append(np.sqrt(mean_squared_error( y_test,classifier.predict(X_test))))
    i+=1



Wall time: 2min 46s




In [90]:
scores=pd.DataFrame()
scores['name']=name
scores['train_r2score']=train_r2_score
scores['test_r2_score']=test_r2_score
scores['test_score']=test_score
scores['train_rmse']=train_rmse
scores['test_rmse']=test_rmse

In [91]:
scores

Unnamed: 0,name,train_r2score,test_r2_score,test_score,train_rmse,test_rmse
0,KernelRidge,0.1518,0.141583,0.141583,1576.014865,1542.203295
1,DecisionTreeRegressor,0.515986,0.479908,0.479908,1190.529032,1200.41774
2,RandomForestRegressor,0.923822,0.361297,0.361297,472.307857,1330.276071
3,GradientBoostingRegressor,0.974988,0.232117,0.232117,270.638433,1458.612125
4,AdaBoostRegressor,0.486979,0.382219,0.382219,1225.684912,1308.306738
5,SVR,-0.087535,-0.093927,-0.093927,1784.566442,1740.951176
6,LinearSVR,0.030183,0.013235,0.013235,1685.217791,1653.481398
7,PolynomialRegression,0.3054,0.285727,0.285727,1426.193639,1406.774887


In [66]:
scores

Unnamed: 0,name,score,rmse
0,KernelRidge,0.130781,1551.876342
1,RandomForestRegressor,0.315663,1376.979325
2,GradientBoostingRegressor,0.437486,1248.415276
3,AdaBoostRegressor,0.391392,1298.558142
4,SVR,-0.093927,1740.951209
5,LinearSVR,-3.610074,3573.932591


In [67]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

model = Pipeline([('poly', PolynomialFeatures(degree=2)),('linear', LinearRegression(fit_intercept=False))])
# fit to an order-3 polynomial data
model = model.fit(X_train,y_train)

print(model.score(X_test, y_test))
print(np.sqrt(mean_squared_error(model.predict(X_test), y_test)))

0.2845783698783598
1407.9052962299918


In [68]:
# Code Lines 1 to 4: Fit the regression tree 'dtree1' and 'dtree2' 
dtree1 = DecisionTreeRegressor(max_depth=3)
dtree2 = DecisionTreeRegressor(max_depth=5)
dtree1.fit(X_train, y_train)
dtree2.fit(X_train, y_train)

# Code Lines 5 to 6: Predict on training data
tr1 = dtree1.predict(X_train)
tr2 = dtree2.predict(X_train) 

#Code Lines 7 to 8: Predict on testing data
y1 = dtree1.predict(X_test)
y2 = dtree2.predict(X_test)

In [69]:
# Print RMSE and R-squared value for regression tree 'dtree1' on training data
print(np.sqrt(mean_squared_error(y_train,tr1))) 
print(r2_score(y_train, tr1))

# Print RMSE and R-squared value for regression tree 'dtree1' on testing data
print(np.sqrt(mean_squared_error(y_test,y1))) 
print(r2_score(y_test, y1)) 

1190.5290315087188
0.5159861100927711
1200.4177399977582
0.47990821865612443


In [70]:
# Print RMSE and R-squared value for regression tree 'dtree2' on training data
print(np.sqrt(mean_squared_error(y_train,tr2))) 
print(r2_score(y_train, tr2))

# Print RMSE and R-squared value for regression tree 'dtree2' on testing data
print(np.sqrt(mean_squared_error(y_test,y2))) 
print(r2_score(y_test, y2))

1133.8709250091656
0.560958996231908
1222.7687471622473
0.46036036327095087


In [71]:
#RF model
model_rf = GradientBoostingRegressor(n_estimators=5000, max_depth=3, random_state=100)
model_rf.fit(X_train, y_train) 
pred_train_rf= model_rf.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_rf)))
print(r2_score(y_train, pred_train_rf))

pred_test_rf = model_rf.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_rf)))
print(r2_score(y_test, pred_test_rf))

  y = column_or_1d(y, warn=True)


257.5790241203563
0.9773431864231209
1453.7916402052067
0.23718448861684793


In [72]:
#RF model
model_rf = RandomForestRegressor(n_estimators=5000, max_depth=3, random_state=100)
model_rf.fit(X_train, y_train) 
pred_train_rf= model_rf.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_rf)))
print(r2_score(y_train, pred_train_rf))

pred_test_rf = model_rf.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_rf)))
print(r2_score(y_test, pred_test_rf))

  This is separate from the ipykernel package so we can avoid doing imports until


1168.0733513138862
0.5340727874694569
1182.0701983097063
0.49568519502303177


In [73]:
model_rf.feature_importances_

array([0.0132079 , 0.0112039 , 0.94029167, 0.01720308, 0.01563362,
       0.00245982])

In [74]:
X_train.columns

Index(['truckingcompanyid', 'productid', 'distanceinmiles', 'fuelcost',
       'source', 'destination'],
      dtype='object')

In [47]:
from sklearn.feature_selection import RFE

In [57]:
estimator = SVR(kernel="linear")
selector = RFE(model_rf, n_features_to_select=5, step=1)
selector = selector.fit(X_train, y_train.linehaulcost)
selector.support_
selector.ranking_

array([1, 1, 1, 1, 1, 2])

In [58]:
X_train

Unnamed: 0,truckingcompanyid,productid,distanceinmiles,fuelcost,source,destination
0,9343,2,3600.84,6.17,34,21
1,2363,23,1265.16,4.50,29,42
2,9634,23,486.60,5.52,42,6
3,5243,15,973.20,6.34,33,4
4,2149,20,2043.72,5.65,12,31
...,...,...,...,...,...,...
6995,5252,3,2335.68,5.19,21,11
6996,9679,4,97.32,7.37,35,34
6997,7767,23,973.20,5.03,17,39
6998,4833,24,389.28,4.96,6,11
