In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

In [10]:
# Read IphoneData.csv once. CO2E is the regression target handed to
# modelScore as y; every column except NAME and CO2E forms the feature matrix.
df = pd.read_csv('IphoneData.csv')
emissions = df['CO2E'].to_numpy()
newdf = df.drop(['NAME', 'CO2E'], axis=1)
features = newdf.to_numpy()

In [22]:
# Registry of the estimators modelScore can evaluate, keyed by the name callers
# pass in. Each entry is a factory that receives the random seed and returns a
# fresh, unfitted estimator, so no fitted state is shared between folds.
# Factories for estimators that are built without a seed simply ignore it.
_MODEL_BUILDERS = {
    'linear regression': lambda seed: LinearRegression(),
    'decision tree': lambda seed: tree.DecisionTreeRegressor(random_state=seed),
    'random forest': lambda seed: RandomForestRegressor(random_state=seed),
    'support vector regression': lambda seed: SVR(kernel='linear'),
    # Plain least squares; the degree-2 feature expansion is applied per fold.
    'polynomial regression': lambda seed: LinearRegression(),
    # 'reg:squarederror' is the current name of the deprecated 'reg:linear'
    # objective, and random_state replaces the older `seed` keyword.
    'xgboost': lambda seed: xgb.XGBRFRegressor(
        objective='reg:squarederror', n_estimators=20, random_state=seed),
    'lasso': lambda seed: Lasso(),
}


def modelScore(X, y, model, kFolds=5, randomState=42):
    """Cross-validate one regression model and print its average fold scores.

    The data is split with a shuffled KFold; for every fold a fresh estimator
    is fitted on the training part and scored on the held-out part. The
    per-fold R2, sum of squared errors and mean absolute percentage error are
    averaged over the folds and printed.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Converted with np.asarray so it can be indexed by the
        fold index arrays.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Regression target. A single-column 2-D target is flattened to 1-D.
    model : str
        One of 'linear regression', 'decision tree', 'random forest',
        'support vector regression', 'polynomial regression', 'xgboost',
        'lasso'.
    kFolds : int, default 5
        Number of cross-validation folds.
    randomState : int, default 42
        Seed for the fold shuffling and for the decision tree, random forest
        and xgboost estimators.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If `model` is not one of the supported names.
    """
    # Fail fast with a clear message instead of hitting an undefined
    # `fittedModel` inside the first fold.
    if model not in _MODEL_BUILDERS:
        raise ValueError(
            "Unknown model " + repr(model) + "; expected one of: "
            + ", ".join(sorted(_MODEL_BUILDERS)))

    X = np.asarray(X)
    y = np.asarray(y)
    # A column-vector target would broadcast against 1-D predictions in the
    # SSE computation below (n x n instead of n), so flatten it once up front.
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    avgR2 = avgSSE = avgMAPE = 0
    # cross validation using K folds
    splitter = KFold(n_splits=kFolds, shuffle=True, random_state=randomState)
    for trainIndex, testIndex in splitter.split(X):
        X_train, X_test = X[trainIndex], X[testIndex]
        y_train, y_test = y[trainIndex], y[testIndex]

        if model == 'polynomial regression':
            # Fit the expansion on the training fold only and reuse it for the
            # test fold, so both folds share one fitted transformer.
            expander = PolynomialFeatures(degree=2).fit(X_train)
            X_train = expander.transform(X_train)
            X_test = expander.transform(X_test)

        fittedModel = _MODEL_BUILDERS[model](randomState).fit(X_train, y_train)
        y_pred = fittedModel.predict(X_test)

        # scores
        avgR2 += r2_score(y_test, y_pred)
        avgSSE += np.sum((y_test - y_pred) ** 2)
        avgMAPE += mean_absolute_percentage_error(y_test, y_pred)

    avgR2, avgSSE, avgMAPE = avgR2 / kFolds, avgSSE / kFolds, avgMAPE / kFolds
    # print results
    print(model)
    print("R2 = " + str(avgR2))
    print("SSE = " + str(avgSSE))
    print("MAPE = " + str(avgMAPE))


In [23]:
# Score linear regression on the full feature matrix.
# NOTE(review): the recorded output below mentions feature_importances_, which
# the current modelScore never reads; re-run the cell to refresh it.
modelScore(X=features, y=emissions, model='linear regression')

AttributeError: 'LinearRegression' object has no attribute 'feature_importances_'

In [24]:
# Score the decision tree on the full feature matrix.
modelScore(X=features, y=emissions, model='decision tree')

decision tree
R2 = 0.6466532234696586
SSE = 746.0
MAPE = 0.09171827072560002


array([1.87421144e-01, 6.29573856e-01, 5.23894985e-05, 0.00000000e+00,
       1.93841144e-03, 2.16156576e-02, 3.68663653e-02, 2.11304310e-03,
       0.00000000e+00, 2.39245376e-03, 7.87588793e-03, 1.10150792e-01])

In [25]:
# Score the random forest on the full feature matrix.
modelScore(X=features, y=emissions, model='random forest')

random forest
R2 = 0.8345926879156655
SSE = 409.12643999999983
MAPE = 0.06588648578044304


array([0.03935496, 0.5966231 , 0.01671169, 0.        , 0.01628271,
       0.01968922, 0.14449957, 0.00882708, 0.00807519, 0.05961117,
       0.02711602, 0.06320929])

In [26]:
# Score linear-kernel support vector regression on the full feature matrix.
# NOTE(review): the recorded output below mentions feature_importances_, which
# the current modelScore never reads; re-run the cell to refresh it.
modelScore(X=features, y=emissions, model='support vector regression')

AttributeError: 'SVR' object has no attribute 'feature_importances_'

In [27]:
# Score degree-2 polynomial regression on the full feature matrix.
# NOTE(review): the recorded output below mentions feature_importances_, which
# the current modelScore never reads; re-run the cell to refresh it.
modelScore(X=features, y=emissions, model='polynomial regression')

AttributeError: 'LinearRegression' object has no attribute 'feature_importances_'

In [17]:
# Score the xgboost random-forest regressor on the full feature matrix.
modelScore(X=features, y=emissions, model='xgboost')

xgboost
R2 = 0.8043675644701505
SSE = 424.87292831292143
MAPE = 0.07209835789567827




In [28]:
# Score lasso regression on the full feature matrix.
# NOTE(review): the recorded output below mentions feature_importances_, which
# the current modelScore never reads; re-run the cell to refresh it.
modelScore(X=features, y=emissions, model='lasso')

AttributeError: 'Lasso' object has no attribute 'feature_importances_'

In [20]:
# Names accepted by modelScore, in the order they are evaluated.
modelsList = [
    'linear regression',
    'decision tree',
    'random forest',
    'support vector regression',
    'polynomial regression',
    'xgboost',
    'lasso',
]

# Two-column feature subset (storage and weight) taken straight from df.
multipleFeatures = df.loc[:, ['STORAGE (gb)', 'WEIGHT (gm)']].to_numpy()

# Column names for per-feature experiments; in the visible cells this list is
# referenced only by the disabled loop at the end of the notebook.
featuresList = [
    'DISPLAY (inch)',
    'STORAGE (gb)',
    'MEMORY (gb)',
    'CPU (cores)',
    'GPU (cores)',
    'NEURAL ENGINE (cores)',
    'WEIGHT (gm)',
    'FRONT CAMERA (mp)',
    'BATTERY (mAh)',
    'number of camera (back)',
    'Sum of megapixels',
    'MAIN CAMERA (mp)',
]

In [21]:
# Evaluate every model in modelsList on the two-column feature subset.
for model in modelsList:
    modelScore(X=multipleFeatures, y=emissions, model=model)

linear regression
R2 = 0.7189643439389602
SSE = 616.1919821761769
MAPE = 0.07737486607798909
decision tree
R2 = 0.6482988116424577
SSE = 836.2
MAPE = 0.08773436093350004
random forest
R2 = 0.7595831004336813
SSE = 580.1238777611112
MAPE = 0.07905004903184132
support vector regression
R2 = 0.7078947985688305
SSE = 631.3382321497635
MAPE = 0.07260462607196813
polynomial regression
R2 = 0.7054607112654743
SSE = 666.0156044604062
MAPE = 0.09068771529191445
xgboost
R2 = 0.7125838134350828
SSE = 682.7622618082357
MAPE = 0.09297101928037119
lasso
R2 = 0.71894829829049
SSE = 616.2671302794026
MAPE = 0.07729833074002816




In [None]:
"""for x in featuresList:
    print(x)
    for model in modelsList:
        print(modelScore(df[x].to_numpy().reshape(-1, 1), emissions, model))
    print("")  """

'for x in featuresList:\n    print(x)\n    for model in modelsList:\n        print(modelScore(df[x].to_numpy().reshape(-1, 1), emissions, model))\n    print("")  '