In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import PolynomialFeatures

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.linear_model import Lasso

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

In [4]:
df = pd.read_csv('IphoneData.csv')
newdf = df.drop(columns=['NAME', 'CO2E'])
features = newdf.to_numpy()
emissions = df['CO2E'].to_numpy()

In [21]:
def modelScore(X, y, model):
    avgAdjustedR2 = []
    avgSSE = []
    avgMAPE = []
    splits = LeaveOneOut().get_n_splits(X)
    #cross validation using leave one out
    for i, (trainIndex, testIndex) in enumerate(LeaveOneOut().split(X)):
        X_train = X[trainIndex]
        X_test = X[testIndex]
        y_train = y[trainIndex]
        y_test = y[testIndex]
    

        if model == 'linear regression':
            fittedModel = LinearRegression().fit(X_train, y_train)
                
        elif model == 'decision tree':
            fittedModel = tree.DecisionTreeRegressor().fit(X_train, y_train)
                
        elif model == 'random forest':
            y_train = y_train.flatten()
            fittedModel = RandomForestRegressor().fit(X_train, y_train)
            
        elif model == 'support vector regression':
            y_train = y_train.flatten()
            fittedModel = SVR(kernel = 'linear').fit(X_train, y_train)
    
        elif model == 'polynomial regression':
            polyX_train = PolynomialFeatures(degree=2).fit_transform(X_train)
            polyX_test = PolynomialFeatures(degree=2).fit_transform(X_test)
            fittedModel = LinearRegression().fit(polyX_train, y_train)
            X_test = polyX_test

        elif model == 'xgboost':
            fittedModel = xgb.XGBRFRegressor(objective ='reg:linear', n_estimators = 20, seed = 42).fit(X_train, y_train)
        
        elif model =='lasso':
            fittedModel = Lasso().fit(X_train, y_train)
            

        y_pred = fittedModel.predict(X_test)
        
        #scores
        avgAdjustedR2.append(1 - (1 - fittedModel.score(X_train, y_train)) * (len(X_train) - 1) / (len(X_train) - X_train.shape[1] - 1))
        avgSSE.append(np.sum((y_test - y_pred) ** 2))
        avgMAPE.append(mean_absolute_percentage_error(y_test, y_pred))

    avgAdjustedR2, avgSSE, avgMAPE = np.mean(avgAdjustedR2), np.mean(avgSSE), np.mean(avgMAPE)

   #print results
    print("R2 = " + str(avgAdjustedR2))
    print("SSE = " + str(avgSSE))
    print("MAPE = " + str(avgMAPE))
    print(r2_score([1], [2]))




In [22]:
modelScore(features, emissions, 'linear regression')

R2 = 0.946761532714051
SSE = 19.697207782900612
MAPE = 0.04873289978762851
nan




In [23]:
modelScore(features, emissions, 'decision tree')

R2 = 1.0
SSE = 62.916666666666664
MAPE = 0.08404252290512627
nan




In [24]:
modelScore(features, emissions, 'random forest')

R2 = 0.9722385672436585
SSE = 37.456539583333324
MAPE = 0.06307360899438141
nan




In [25]:
modelScore(features, emissions, 'support vector regression')

R2 = 0.9060843829425553
SSE = 23.649005892460583
MAPE = 0.04674170011550882
nan




In [27]:
modelScore(features, emissions, 'polynomial regression')

ValueError: X has 12 features, but LinearRegression is expecting 91 features as input.

In [28]:
modelScore(features, emissions, 'xgboost')



R2 = 0.9874222368403315
SSE = 38.03783664951061
MAPE = 0.06440490683705988
nan




In [29]:
modelScore(features, emissions, 'lasso')

R2 = 0.916074903573146
SSE = 23.44672381260511
MAPE = 0.0479361956012399
nan


