In [1]:
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error , mean_squared_error , r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor,GradientBoostingRegressor,VotingRegressor,RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

import pickle

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_processed=pd.read_csv("../data/processed/df_copy_encoded.csv")
df_processed.head()

Unnamed: 0,year,customer_age,unit_cost,unit_price,cost,revenue,hour,profit,profit_margin,country_0,country_1,country_2,product_category_0,product_category_1,sub_category_0,sub_category_1,sub_category_2,sub_category_3,sub_category_4,customer_gender_M
0,2016.0,29.0,80.0,109.0,80.0,109.0,0,29.0,0.266055,0,0,1,0,1,0,0,0,0,1,0
1,2016.0,29.0,24.5,28.5,49.0,57.0,0,8.0,0.140351,0,0,1,1,0,0,0,0,1,0,0
2,2016.0,29.0,3.67,5.0,11.0,15.0,0,4.0,0.266667,0,0,1,0,1,0,0,0,0,1,0
3,2016.0,29.0,87.5,116.5,175.0,233.0,0,58.0,0.248927,0,0,1,0,1,0,0,0,0,1,0
4,2016.0,29.0,35.0,41.666667,105.0,125.0,0,20.0,0.16,0,0,1,0,1,0,0,0,0,1,0


In [3]:
df_processed.drop('profit_margin',axis=1,inplace=True)
df_processed.drop('revenue',axis=1,inplace=True)

In [4]:
x= df_processed.drop("profit", axis=1)
y= df_processed['profit']

In [5]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [6]:
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

In [7]:
models={
    'linear_reg':LinearRegression(),
    'SVR':SVR(),
    'DT':DecisionTreeRegressor(),
    'RF':RandomForestRegressor(n_estimators=25,n_jobs=-1),
    'Bagging_classifier':BaggingRegressor(DecisionTreeRegressor(),n_estimators=5,n_jobs=-1),
    'xgboost':GradientBoostingRegressor(random_state=0),
 
}

In [8]:
for name,model in models.items():
    print('--------- ',name,'-------------')
    model.fit(x_train,y_train)
   
    print('R2 train score: ',model.score(x_train,y_train))
    print('R2_test score: ',model.score(x_test,y_test))

    print('Mean squared error train: ')
    pickle.dump(model,open("../models/"+name+"_model.h5",'wb'))
    
   
    print('-'*30)

---------  linear_reg -------------
R2 train score:  0.8389589450146102
R2_test score:  0.8314987615264023
Mean squared error train: 
------------------------------
---------  SVR -------------
R2 train score:  0.4168178112490428
R2_test score:  0.41774836676009885
Mean squared error train: 
------------------------------
---------  DT -------------
R2 train score:  1.0
R2_test score:  0.9501397265605869
Mean squared error train: 
------------------------------
---------  RF -------------
R2 train score:  0.9946439649157666
R2_test score:  0.9687089717123613
Mean squared error train: 
------------------------------
---------  Bagging_classifier -------------
R2 train score:  0.9897766653125899
R2_test score:  0.9569290346195471
Mean squared error train: 
------------------------------
---------  xgboost -------------
R2 train score:  0.8544782742615546
R2_test score:  0.8221858915611364
Mean squared error train: 
------------------------------


In [9]:
model=SVR()
params = [
        {'C':[1, 10], 'kernel':['linear', 'sigmoid', 'poly']},
        {'C':[1, 10], 'kernel':['rbf'], 'gamma':[0.5, 0.6, 0.7, 0.1, 0.01, 0.01]}
         ]
grid_search_svc=GridSearchCV(estimator=model,
                        param_grid=params,
                        scoring='r2',
                        n_jobs=-1)

In [10]:
grid_search_svc.fit(x_train,y_train)

In [None]:
grid_search_svc.best_params_

In [None]:
grid_search_svc.best_score

In [11]:
features=x.columns

In [13]:
pickle.dump(features,open("../models/features.h5",'wb'))
pickle.dump(scaler,open("../models/scaler.h5",'wb'))