In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.linear_model import Lasso
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
# Read the dataset once, then separate the regression target from the predictors.
df = pd.read_csv('SamsungData.csv')
# Target vector: the CO2E column.
emissions = df['CO2E'].to_numpy()
# Predictor matrix: every column except NAME and the CO2E target.
newdf = df.drop(['NAME', 'CO2E'], axis=1)
features = newdf.to_numpy()

In [18]:
def modelScore(X, y, model, kFolds=5):
    """Cross-validate one regression model and print its averaged fold scores.

    The rows are split with a shuffled ``KFold`` (``random_state=42``). For
    every split the estimator selected by ``model`` is fitted on the training
    rows and evaluated on the held-out rows; the per-fold scores are then
    averaged over the folds.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample. Converted with ``np.asarray`` so
        rows are always selected by position.
    y : array-like
        Target values aligned with the rows of ``X``.
    model : str
        Which estimator to evaluate. One of 'linear regression',
        'decision tree', 'random forest', 'support vector regression',
        'polynomial regression', 'xgboost', 'lasso' or 'neural network'.
    kFolds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    None
        Three lines are printed, in this order: mean R2, mean per-fold sum of
        squared errors, and mean MAPE.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    supportedModels = (
        'linear regression',
        'decision tree',
        'random forest',
        'support vector regression',
        'polynomial regression',
        'xgboost',
        'lasso',
        'neural network',
    )
    # Fail fast with a clear message. Without this check an unknown name left
    # `fittedModel` unassigned and only surfaced later as an UnboundLocalError.
    if model not in supportedModels:
        raise ValueError(
            f"Unknown model {model!r}; expected one of: {', '.join(supportedModels)}"
        )

    # Positional indexing below requires plain arrays (no-op for ndarray input).
    X = np.asarray(X)
    y = np.asarray(y)

    totalR2 = totalSSE = totalMAPE = 0
    # cross validation using K folds
    splitter = KFold(n_splits=kFolds, shuffle=True, random_state=42)
    for trainIndex, testIndex in splitter.split(X):
        X_train, X_test = X[trainIndex], X[testIndex]
        y_train, y_test = y[trainIndex], y[testIndex]

        if model == 'linear regression':
            fittedModel = LinearRegression().fit(X_train, y_train)

        elif model == 'decision tree':
            # Seeded like the other estimators so repeated runs agree.
            dtModel = tree.DecisionTreeRegressor(criterion='squared_error', max_depth=4, min_samples_split=2,
                                                 max_leaf_nodes=110, random_state=42)
            fittedModel = dtModel.fit(X_train, y_train)

        elif model == 'random forest':
            # Seeded: an unseeded forest gives different scores on every run.
            RF = RandomForestRegressor(criterion='friedman_mse', max_depth=5, n_estimators=50,
                                       max_features=None, max_leaf_nodes=100, ccp_alpha=0.1,
                                       random_state=42)
            fittedModel = RF.fit(X_train, np.ravel(y_train))

        elif model == 'support vector regression':
            fittedModel = SVR(kernel='poly', degree=5, coef0=10, tol=1e-2).fit(X_train, np.ravel(y_train))

        elif model == 'polynomial regression':
            # Expand to degree-2 terms; the expansion is fitted on the training
            # rows only and then applied to the held-out rows.
            polyFeatures = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True, order='C')
            polyX_train = polyFeatures.fit_transform(X_train)
            fittedModel = LinearRegression().fit(polyX_train, y_train)
            X_test = polyFeatures.transform(X_test)

        elif model == 'xgboost':
            xgbModel = xgb.XGBModel(learning_rate=0.2, n_estimators=225, booster='gblinear', eval_metric='rmse',
                                    feature_selector='thrifty', updater='coord_descent', reg_lambda=0.3, reg_alpha=0.3)
            fittedModel = xgbModel.fit(X_train, y_train)

        elif model == 'lasso':
            fittedModel = Lasso(random_state=42, fit_intercept=False).fit(X_train, y_train)

        elif model == 'neural network':
            # NOTE: scikit-learn documents early_stopping as effective only for
            # the 'sgd' and 'adam' solvers, so it is inert with 'lbfgs'.
            nn = MLPRegressor(random_state=42, hidden_layer_sizes=(15, 10, 10, 10, 15), activation='identity',
                              solver='lbfgs', max_iter=7500, early_stopping=True)
            fittedModel = nn.fit(X_train, y_train)

        y_pred = fittedModel.predict(X_test)

        # scores
        totalR2 += r2_score(y_test, y_pred)
        # Ravel both sides: a column-vector target minus 1-D predictions would
        # otherwise broadcast to an (n, n) matrix and inflate the sum.
        totalSSE += np.sum((np.ravel(y_test) - np.ravel(y_pred)) ** 2)
        totalMAPE += mean_absolute_percentage_error(y_test, y_pred)

    avgR2, avgSSE, avgMAPE = totalR2 / kFolds, totalSSE / kFolds, totalMAPE / kFolds
    # print results (bare numbers, one per line: R2, SSE, MAPE)
    print(str(avgR2))
    print(str(avgSSE))
    print(str(avgMAPE))


In [19]:
# Score Lasso on every column except NAME and CO2E. The three printed numbers
# are, in order: mean R2, mean per-fold SSE and mean MAPE over the 5 folds.
modelScore(features, emissions, 'lasso')

0.7027983078464224
756.0154424271107
0.10374733736565327


  model = cd_fast.enet_coordinate_descent_gram(


: 

In [575]:
# Score plain linear regression on every column except NAME and CO2E.
# Output order: mean R2, mean per-fold SSE, mean MAPE (5-fold CV).
modelScore(features, emissions, 'linear regression')

-0.06619907080824869
6373.320577431783
0.27289562962364766


In [397]:
# Score the depth-4 decision tree on every column except NAME and CO2E.
# Output order: mean R2, mean per-fold SSE, mean MAPE (5-fold CV).
modelScore(features, emissions, 'decision tree')

0.021818476330967073
1747.2668094293495
0.147526320301414


In [398]:
# Score the 50-tree random forest on every column except NAME and CO2E.
# Output order: mean R2, mean per-fold SSE, mean MAPE (5-fold CV).
modelScore(features, emissions, 'random forest')

0.4735448829279699
1364.9565671534451
0.14220264351345407


In [399]:
# Score the degree-5 polynomial-kernel SVR on every column except NAME and CO2E.
# Output order: mean R2, mean per-fold SSE, mean MAPE (5-fold CV).
modelScore(features, emissions, 'support vector regression')

-0.1368508467895437
9262.644319400591
0.24544453178817888


In [400]:
# Score degree-2 polynomial regression on every column except NAME and CO2E.
# Output order: mean R2, mean per-fold SSE, mean MAPE (5-fold CV).
modelScore(features, emissions, 'polynomial regression')

0.46950777095235646
1980.4844181740368
0.14705533175652433


In [401]:
# Score the gblinear XGBoost model on every column except NAME and CO2E.
# Output order: mean R2, mean per-fold SSE, mean MAPE (5-fold CV).
modelScore(features, emissions, 'xgboost')

0.6511285384007175
990.4185620089617
0.1278077575404118


In [12]:
# Lasso again on every column except NAME and CO2E (the same call also appears
# in an earlier cell). Output order: mean R2, mean per-fold SSE, mean MAPE.
modelScore(features, emissions, 'lasso')

0.7072522115508122
763.676894340049
0.10457777656664932


In [13]:
# Score the identity-activation MLP on every column except NAME and CO2E.
# Output order: mean R2, mean per-fold SSE, mean MAPE (5-fold CV).
# NOTE(review): the recorded R2 is strongly negative; presumably the unscaled
# inputs hurt the lbfgs fit — confirm before relying on this result.
modelScore(features, emissions, 'neural network')

-194.74399722262635
742553.9316142211
3.196332011190825




In [404]:
# Every model name accepted by modelScore, in evaluation order.
modelsList = [
    'linear regression',
    'decision tree',
    'random forest',
    'support vector regression',
    'polynomial regression',
    'xgboost',
    'lasso',
    'neural network',
]

# Four-column predictor subset, as a NumPy matrix.
multipleFeatures = df[
    [
        'DISPLAY (inch)',
        'WEIGHT (gm)',
        'BATTERY (mAh)',
        'MAIN CAMERA (mp)',
    ]
].to_numpy()

# Thirteen predictor column names (presumably the full predictor set of the
# CSV — verify against the file's header).
wholeFeaturesList = [
    'DISPLAY (inch)',
    'STORAGE (gb)',
    'MEMORY (gb)',
    'CPU (cores)',
    'CPU SPEED (GHz)',
    'GPU (cores)',
    'GPU BENCHMARK (3DMark)',
    'WEIGHT (gm)',
    'MAIN CAMERA (mp)',
    'FRONT CAMERA (mp)',
    'NUMBER OF CAMERAS (back)',
    'SUM OF MEGAPIXELS',
    'BATTERY (mAh)',
]

In [405]:
# Re-score every model on the four-column subset. Each call prints three lines
# (mean R2, mean per-fold SSE, mean MAPE), so the output below reads in groups
# of three that follow the order of modelsList.
for model in modelsList:
    modelScore(multipleFeatures, emissions, model)

0.42986533629614054
2199.7450497723016
0.15842647368736912
0.013888920588438136
1847.3188094293491
0.1516241737302219
0.4906964712174123
1050.992065324343
0.12974496764703852
-0.1488331950739301
9361.508291669707
0.24625907687825474
0.42986533629617085
2199.745049772572
0.15842647368736917
0.6153709505356818
1166.1438681169552
0.14459217623095805
0.5777430003391363
1290.7663533143839
0.14743637974681503
-6.624697887144327
62299.46744486123
0.7314665805789088


