In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import lightgbm as lgbm
import matplotlib.pyplot as plt

In [7]:
# Empty results table: one R^2 column per model/ensemble stage, plus two
# columns describing the size of the dataset the scores were computed on.
result_columns = [
    'accuracy_linear',
    'accuracy_lin_rf_ensemble',
    'accuracy_lin_rf_gbm_ensemble',
    'accuracy_lin_rf_gbm_xgb_ensemble',
    'accuracy_lin_rf_gbm_xgb_lgbm_ensemble',
    'num_rows',
    'num_cols',
]
ensemble_dataset = pd.DataFrame(columns=result_columns)

In [8]:
import glob

# Collect every regression CSV to benchmark.
# glob.glob() returns matches in arbitrary, OS-dependent order, so the list is
# sorted to keep the row order of the results table stable between runs.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the datasets before running the notebook elsewhere.
x = sorted(glob.glob(r'C:\Users\rohit\Desktop\MSc. Computer Science - Data Science\Machine Learning\Group project\datasets\regression\*.csv'))

In [9]:
# Benchmark progressively larger averaging ensembles on every CSV listed in `x`.
#
# For each file: drop rows with missing values, treat the last column as the
# regression target, one-hot encode the remaining columns, make an 80/20
# train/test split, fit five regressors, and score the plain average of the
# first 1, 2, 3, 4 and 5 models' test predictions with R^2.
#
# The 'accuracy_*' keys hold R^2 scores (sklearn.metrics.r2_score), not
# classification accuracy; the key names are kept because they are the
# column names of `ensemble_dataset`.
#
# Rows are collected in a plain list and converted to a DataFrame once after
# the loop: DataFrame.append was removed in pandas 2.0, and growing a frame
# row by row is quadratic. Rebuilding the table also makes the cell
# idempotent (re-running it no longer duplicates rows).
records = []

for csv_path in x:
    dataset = pd.read_csv(csv_path)

    # Drop every row that contains at least one missing value
    dataset = dataset.dropna()

    # The last column holds the values we want to predict
    labels = np.array(dataset.iloc[:, -1])

    # Everything except the last column is a feature
    dataset = dataset.iloc[:, :-1]

    # One-hot encode the non-numeric columns. dtype=float keeps the indicator
    # columns numeric: pandas >= 2.0 defaults to bool, which can make the
    # NumPy conversion below produce a dtype=object array.
    dataset1 = pd.get_dummies(dataset, dtype=float)

    # Feature names after encoding (kept for inspection of the last dataset)
    dataset1_list = list(dataset1.columns)

    # Convert to a NumPy array for the estimators
    dataset1 = np.array(dataset1)

    # 80/20 split with a fixed seed so every model sees the same partition
    train_dataset1, test_dataset1, train_labels, test_labels = train_test_split(
        dataset1, labels, test_size=0.2, random_state=42)

    # Linear regression
    linear_reg = LinearRegression().fit(train_dataset1, train_labels)
    pred_linear = linear_reg.predict(test_dataset1)
    accuracy_linear = r2_score(test_labels, pred_linear)

    # Random forest
    rf_reg = RandomForestRegressor(n_estimators=1000, random_state=1).fit(train_dataset1, train_labels)
    pred_rf = rf_reg.predict(test_dataset1)
    accuracy_rf = r2_score(test_labels, pred_rf)

    # Ensemble: linear + rf
    pred = (pred_rf + pred_linear) / 2
    accuracy_lin_rf_ensemble = r2_score(test_labels, pred)

    # Gradient boosting
    # NOTE(review): no random_state is passed here or to the XGBoost/LightGBM
    # models below, so their scores may not be exactly reproducible.
    GBM_reg = GradientBoostingRegressor().fit(train_dataset1, train_labels)
    pred_gbm = GBM_reg.predict(test_dataset1)
    accuracy_gbm = r2_score(test_labels, pred_gbm)

    # Ensemble: linear + rf + gbm
    pred = (pred_rf + pred_linear + pred_gbm) / 3
    accuracy_lin_rf_gbm_ensemble = r2_score(test_labels, pred)

    # XGBoost
    XGB_reg = XGBRegressor().fit(train_dataset1, train_labels)
    pred_xgb = XGB_reg.predict(test_dataset1)
    accuracy_xgb = r2_score(test_labels, pred_xgb)

    # Ensemble: linear + rf + gbm + xgb
    pred = (pred_rf + pred_linear + pred_gbm + pred_xgb) / 4
    accuracy_lin_rf_gbm_xgb_ensemble = r2_score(test_labels, pred)

    # LightGBM
    LGBM_reg = lgbm.LGBMRegressor().fit(train_dataset1, train_labels)
    pred_lgbm = LGBM_reg.predict(test_dataset1)
    accuracy_lgbm = r2_score(test_labels, pred_lgbm)

    # Ensemble: linear + rf + gbm + xgb + lgbm
    pred = (pred_rf + pred_linear + pred_gbm + pred_xgb + pred_lgbm) / 5
    accuracy_lin_rf_gbm_xgb_lgbm_ensemble = r2_score(test_labels, pred)

    records.append({
        'accuracy_linear': accuracy_linear,
        'accuracy_lin_rf_ensemble': accuracy_lin_rf_ensemble,
        'accuracy_lin_rf_gbm_ensemble': accuracy_lin_rf_gbm_ensemble,
        'accuracy_lin_rf_gbm_xgb_ensemble': accuracy_lin_rf_gbm_xgb_ensemble,
        'accuracy_lin_rf_gbm_xgb_lgbm_ensemble': accuracy_lin_rf_gbm_xgb_lgbm_ensemble,
        # Number of *training* rows (after dropna and the 80/20 split)
        'num_rows': len(train_dataset1),
        # Number of feature columns after one-hot encoding; .shape[1] also
        # works when the array has no rows, unlike indexing row 0.
        'num_cols': dataset1.shape[1],
    })

# Build the results table in one go, reusing the column order already defined
# on `ensemble_dataset`. An empty `x` simply yields an empty table.
ensemble_dataset = pd.DataFrame(records, columns=ensemble_dataset.columns)

In [10]:
# Display the results table: R^2 scores per ensemble stage plus dataset size, one row per CSV
ensemble_dataset

Unnamed: 0,accuracy_linear,accuracy_lin_rf_ensemble,accuracy_lin_rf_gbm_ensemble,accuracy_lin_rf_gbm_xgb_ensemble,accuracy_lin_rf_gbm_xgb_lgbm_ensemble,num_rows,num_cols
0,0.980836,0.988569,0.988381,0.987775,0.988986,28137.0,14.0
1,0.783593,0.85868,0.872337,0.876821,0.877222,1070.0,11.0
2,0.759473,0.83664,0.850653,0.855397,0.859428,204347.0,46.0
3,0.248264,0.357146,0.378723,0.385725,0.394855,49939.0,23.0
4,0.513047,0.53997,0.542225,0.541918,0.546596,8929.0,52.0
5,0.819744,0.880715,0.87236,0.863836,0.871465,5197.0,10.0
6,0.074863,0.124252,0.132668,0.134895,0.138982,5757.0,30.0
7,0.157779,0.47089,0.531737,0.55294,0.563237,61484.0,15.0
8,0.990501,0.995155,0.9961,0.996285,0.996382,36777.0,15.0
9,0.038272,0.0198,0.02352,0.024296,0.025223,3616.0,43.0
