In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
df = pd.read_csv('IphoneData.csv')
newdf = df.drop(columns=['NAME', 'CO2E'])
features = newdf.to_numpy()
emissions = df['CO2E'].to_numpy()

In [5]:
df.head()


Unnamed: 0,NAME,DISPLAY (inch),STORAGE (gb),MEMORY (gb),CPU (cores),GPU (cores),NEURAL ENGINE (cores),WEIGHT (gm),MAIN CAMERA (mp),FRONT CAMERA (mp),number of camera (back),Sum of megapixels,BATTERY (mAh),CO2E
0,iPhone SE - Gen 2,4.7,64,3,6,4,8,148,12,7,1,19,1821,57
1,iPhone SE - Gen 2,4.7,128,3,6,4,8,148,12,7,1,19,1821,62
2,iPhone SE - Gen 2,4.7,256,3,6,4,8,148,12,7,1,19,1821,73
3,iPhone SE - Gen 3,4.7,64,4,6,4,16,144,12,7,1,19,2018,46
4,iPhone SE - Gen 3,4.7,128,4,6,4,16,144,12,7,1,19,2018,50


In [6]:
def modelScore(X, y, model):
    avgSSE = []
    avgMAPE = []
    splits = LeaveOneOut().get_n_splits(X)
    #cross validation using leave one out
    for i, (trainIndex, testIndex) in enumerate(LeaveOneOut().split(X)):
        X_train = X[trainIndex]
        X_test = X[testIndex]
        y_train = y[trainIndex]
        y_test = y[testIndex]
    

        if model == 'linear regression':
            fittedModel = LinearRegression().fit(X_train, y_train)
                
        elif model == 'decision tree':
            fittedModel = tree.DecisionTreeRegressor().fit(X_train, y_train)
                
        elif model == 'random forest':
            y_train = y_train.flatten()
            RF = RandomForestRegressor(ccp_alpha=2.6, max_samples=.9, max_features='log2', min_samples_split=2,
                      n_estimators=50, max_depth=17, min_samples_leaf=1, criterion='squared_error')
            fittedModel = RF.fit(X_train, y_train)
            
        elif model == 'support vector regression':
            y_train = y_train.flatten()
            fittedModel = SVR(kernel = 'linear').fit(X_train, y_train)
    
        elif model == 'polynomial regression':
            polyX_train = PolynomialFeatures(degree=2).fit_transform(X_train)
            polyX_test = PolynomialFeatures(degree=2).fit_transform(X_test)
            fittedModel = LinearRegression().fit(polyX_train, y_train)
            X_test = polyX_test

        elif model == 'xgboost':
            xgbModel = xgb.XGBRFRegressor(max_depth=7, colsample_bytree=1.0, gamma=0.2, learning_rate=0.2, min_child_weight=1,
                                           n_estimators=50, reg_alpha=0, reg_lambda=0, subsample=1)
            fittedModel = xgbModel.fit(X_train, y_train)
        
        elif model =='lasso':
            fittedModel = Lasso().fit(X_train, y_train)
            

        y_pred = fittedModel.predict(X_test)
        
        #scores
        avgSSE.append(np.sum((y_test - y_pred) ** 2))
        avgMAPE.append(mean_absolute_percentage_error(y_test, y_pred))

    avgSSE, avgMAPE = np.mean(avgSSE), np.mean(avgMAPE)

   #print results
    print("SSE = " + str(avgSSE))
    print("MAPE = " + str(avgMAPE))
    print(r2_score([1], [2]))




In [7]:
modelScore(features, emissions, 'linear regression')

SSE = 19.697207782900612
MAPE = 0.04873289978762851
nan




In [8]:
modelScore(features, emissions, 'decision tree')

SSE = 62.666666666666664
MAPE = 0.0858217872684246
nan




In [9]:
modelScore(features, emissions, 'random forest')

SSE = 75.09693181970738
MAPE = 0.09418356037651165
nan




In [10]:
modelScore(features, emissions, 'support vector regression')

SSE = 23.649005892460583
MAPE = 0.04674170011550882
nan




In [11]:
modelScore(features, emissions, 'polynomial regression')

SSE = 131.59927368581603
MAPE = 0.0650026720431112
nan




In [12]:
modelScore(features, emissions, 'xgboost')

SSE = 193.64482982169284
MAPE = 0.14410857482397235
nan




In [13]:
modelScore(features, emissions, 'lasso')

SSE = 23.44672381260511
MAPE = 0.0479361956012399
nan


