In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR

In [2]:
# Read the dataset, then separate the target column (CO2E) from the predictor
# matrix, which is every column except NAME and CO2E.
df = pd.read_csv('IphoneData.csv')
emissions = df['CO2E'].to_numpy()
newdf = df.drop(['NAME', 'CO2E'], axis=1)
features = newdf.to_numpy()

In [3]:
def _modelFactories(randomState):
    """Map each supported model name to a zero-argument estimator factory.

    Factories (rather than instances) are used so that every cross-validation
    fold gets a fresh, unfitted estimator. All estimators that accept a
    ``random_state`` receive ``randomState`` so repeated runs give the same
    scores.
    """
    return {
        'linear regression': lambda: LinearRegression(),
        'decision tree': lambda: tree.DecisionTreeRegressor(random_state=randomState),
        'random forest': lambda: RandomForestRegressor(
            max_features='log2', n_estimators=200, max_depth=20,
            random_state=randomState),
        'support vector regression': lambda: SVR(kernel='linear', C=1),
        # The pipeline fits the polynomial expansion on the training fold only
        # and re-applies it to the test fold inside predict().
        'polynomial regression': lambda: make_pipeline(
            PolynomialFeatures(degree=3, interaction_only=True,
                               include_bias=True, order='C'),
            LinearRegression()),
        # NOTE(review): eval_metric='auc' is a classification metric; it is
        # presumably unused here because no eval_set is passed -- confirm.
        'xgboost': lambda: xgb.XGBModel(
            learning_rate=0.225, n_estimators=225, booster='gblinear',
            reg_alpha=0, reg_lambda=0, eval_metric='auc',
            objective='reg:squaredlogerror', updater='coord_descent',
            feature_selector='thrifty'),
        'lasso': lambda: Lasso(tol=.0001, max_iter=2000, selection='random',
                               alpha=.15, random_state=randomState),
        'neural network': lambda: MLPRegressor(
            alpha=0.001, batch_size=25, hidden_layer_sizes=(10, 10, 10, 10),
            learning_rate_init=0.05, max_iter=5000, random_state=randomState,
            tol=1e-1, max_fun=25000, solver='lbfgs', activation='identity'),
    }


def modelScore(X, y, model, kFolds=5, randomState=42):
    """Cross-validate one regression model and print its mean fold scores.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Predictor matrix; converted with ``np.asarray`` before row indexing.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Target values. A single-column 2-D target is flattened to 1-D.
    model : str
        One of 'linear regression', 'decision tree', 'random forest',
        'support vector regression', 'polynomial regression', 'xgboost',
        'lasso', 'neural network'.
    kFolds : int, default 5
        Number of shuffled K-fold splits.
    randomState : int, default 42
        Seed for the fold shuffle and for every estimator that takes a
        ``random_state``.

    Prints
    ------
    Three lines: the mean R2, the mean sum of squared errors and the mean
    absolute percentage error over the folds, in that order.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    factories = _modelFactories(randomState)
    # Fail fast with a readable message instead of reaching predict() with no
    # fitted estimator.
    if model not in factories:
        raise ValueError("Unknown model {!r}; expected one of: {}".format(
            model, ", ".join(sorted(factories))))

    X = np.asarray(X)
    y = np.asarray(y)
    # A column-vector target would broadcast against 1-D predictions in the
    # SSE term, so collapse it once here.
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    totalR2 = totalSSE = totalMAPE = 0.0
    splitter = KFold(n_splits=kFolds, shuffle=True, random_state=randomState)
    for trainIndex, testIndex in splitter.split(X):
        fittedModel = factories[model]().fit(X[trainIndex], y[trainIndex])
        y_test = y[testIndex]
        y_pred = fittedModel.predict(X[testIndex])

        totalR2 += r2_score(y_test, y_pred)
        totalSSE += np.sum((y_test - y_pred) ** 2)
        totalMAPE += mean_absolute_percentage_error(y_test, y_pred)

    avgR2, avgSSE, avgMAPE = totalR2 / kFolds, totalSSE / kFolds, totalMAPE / kFolds
    print(str(avgR2))
    print(str(avgSSE))
    print(str(avgMAPE))




In [4]:
# Neural network on the full feature matrix; prints mean R2, SSE and MAPE.
modelScore(X=features, y=emissions, model='neural network')

0.9158294656879136
166.71653292690127
0.04583188851292793


In [5]:
# Linear regression on the full feature matrix; prints mean R2, SSE and MAPE.
modelScore(X=features, y=emissions, model='linear regression')

0.9132785915590607
172.6195696345183
0.04614202622279183


In [6]:
# Decision tree on the full feature matrix; prints mean R2, SSE and MAPE.
modelScore(X=features, y=emissions, model='decision tree')

0.6584630762800598
700.0
0.0895217284405809


In [7]:
# Random forest on the full feature matrix; prints mean R2, SSE and MAPE.
modelScore(X=features, y=emissions, model='random forest')

0.7435343463686201
601.0196599999999
0.08280872413154787


In [8]:
# Support vector regression on the full feature matrix; prints mean R2, SSE and MAPE.
modelScore(X=features, y=emissions, model='support vector regression')

In [None]:
# Polynomial regression on the full feature matrix; prints mean R2, SSE and MAPE.
modelScore(X=features, y=emissions, model='polynomial regression')

0.9176046244206117
213.04590046008312
0.030430745629900446


In [None]:
# XGBoost on the full feature matrix; prints mean R2, SSE and MAPE.
modelScore(X=features, y=emissions, model='xgboost')

0.9073441886813531
195.56138811718557
0.04682619770749446


In [None]:
# Lasso on the full feature matrix; prints mean R2, SSE and MAPE.
modelScore(X=features, y=emissions, model='lasso')

0.9012053165006069
199.95554621445203
0.049636099664693366


In [None]:
# Every model name understood by modelScore, in the order they are scored below.
modelsList = [
    'linear regression',
    'decision tree',
    'random forest',
    'support vector regression',
    'polynomial regression',
    'xgboost',
    'lasso',
    'neural network',
]

# Predictor matrix restricted to a nine-column subset of the dataset.
multipleFeatures = df[[
    'DISPLAY (inch)',
    'STORAGE (gb)',
    'MEMORY (gb)',
    'GPU (cores)',
    'NEURAL ENGINE (cores)',
    'WEIGHT (gm)',
    'FRONT CAMERA (mp)',
    'Sum of megapixels',
    'MAIN CAMERA (mp)',
]].to_numpy()

# Individual column names, used by the per-feature experiment.
featuresList = [
    'DISPLAY (inch)',
    'STORAGE (gb)',
    'MEMORY (gb)',
    'CPU (cores)',
    'GPU (cores)',
    'NEURAL ENGINE (cores)',
    'WEIGHT (gm)',
    'FRONT CAMERA (mp)',
    'BATTERY (mAh)',
    'number of camera (back)',
    'Sum of megapixels',
    'MAIN CAMERA (mp)',
]

In [None]:
# Score each model on the reduced feature subset. Results are printed in
# modelsList order, three lines (mean R2, SSE, MAPE) per model.
for model in modelsList:
    modelScore(X=multipleFeatures, y=emissions, model=model)

0.9157485339743229
166.44599819781314
0.04576645538607911
0.7197884663504238
504.2
0.07843891003384804
0.7931730005278
498.302065
0.07614224268569467
0.8805241675950792
222.9595547182927
0.04619554470783489
0.7814500959290859
591.3187512985676
0.051069960445606445
0.8966873806417877
215.77824660046608
0.04789759321934304
0.8973009280068478
205.79506031323936
0.04993141128064318
0.9197432165739599
158.92734549349956
0.04448302132170516


In [None]:
# Disabled per-feature experiment, kept as a bare string literal rather than
# `#` comments. Because it is the last expression in the cell, the notebook
# echoes the string itself as the cell output.
# If re-enabled, it would score every model in modelsList on each single
# column of featuresList (reshaped into a one-column matrix).
# NOTE(review): modelScore has no return statement, so the inner print(...)
# would also print None after each set of scores.
"""for x in featuresList:
    print(x)
    for model in modelsList:
        print(modelScore(df[x].to_numpy().reshape(-1, 1), emissions, model))
    print("")  """

'for x in featuresList:\n    print(x)\n    for model in modelsList:\n        print(modelScore(df[x].to_numpy().reshape(-1, 1), emissions, model))\n    print("")  '