In [1]:
%load_ext jupyternotify
%autonotify -a 60

<IPython.core.display.Javascript object>

In [2]:
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor

from scipy import stats

import pandas as pd
import numpy as np
import lightgbm as lgb
import sys

import warnings
warnings.filterwarnings('ignore')

In [3]:
def openFile(path):
    
    # Display upto 500 characters per column
    pd.set_option('display.max_colwidth', 500)
   
    
    # Open dataset
    df_original = pd.read_csv(path + 'dataset_Facebook.csv', delimiter = ';')
    
    # Training and target features
    base_features = df_original.columns[:7]  
    target_features = df_original.columns[7:]
    
    # Drop empty fields
    df = df_original.dropna()
    
    # Change the Type feature to category first before using one-hot-encoding
    # All dataFrames below this point can be used for ML regression algorithms
    # The DFs above this can be used for regression IFF the 'Type' feature is dropped
    df_1hoten = df
    df_1hoten['Type'] = df['Type'].astype('category').cat.codes
    
    # Drop outliers that lie outside 3 standard deviations from the mean
    df_no_outliers = df
    df_no_outliers = df_no_outliers[(np.abs(stats.zscore(df)) < 3).all(axis = 1)]
    
    # DataFrame with scaled training features
    df_scaled = df
    df_scaled[base_features] = scaler.fit_transform(df_scaled[base_features].values)

    return df_original, df, df_1hoten, df_no_outliers, df_scaled, base_features, target_features

In [4]:
def MAE(true, target):
    return mean_absolute_error(true, target)

In [5]:
def MAPE(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs(np.divide((y_true - y_pred), y_true, out = np.zeros_like((y_true - y_pred)), where=y_true!=0))) * 100

In [6]:
def model(baseFeatures, targets, algorithm, algorithmName):
    
    # Split the data into training and testing
    test_size = 0.3
    train_X, test_X, y_train, y_test = train_test_split(baseFeatures, targets, random_state=1, test_size = test_size)
    
    # The bottom 4 lines are good practice for larger datasets. For this dataset, it is not necessary to use these
    #   cv = KFold(n_splits=4, random_state=2019, shuffle=True)
    #   for train_index, test_index in cv.split(baseFeatures,target):
    #     algorithm.fit(X.iloc[train_index], y.iloc[train_index])
    #     predicts = algorithm.predict(X.iloc[test_index])

    # Train the model and predict outcomes based on each model
    model = algorithm.fit(train_X, y_train)  
    predicts = model.predict(test_X)
    
    
    # Loop through actual values and predicted values for comparison
    # This allows us to calculate the Mean Absolute Error (MAE) and Mean Absolute Percentage Error (MAPE)
    # The standard deviation of the actual values is also calculated to put results in perspective
    # The scores will be given next to their respected target variable
    
    totalColumns = len(targets.columns)

    true_ys = []
    pred_ys = []

    colName = []
    MAE_ = []
    MAPE_ = []
    StDev = []

    for i in range(totalColumns):
        true_ys.append(y_test[y_test.columns[i]].values)
        pred_ys.append([row[i] for row in predicts])

        colName.append(targets.columns[i])
        MAE_.append(MAE(true_ys[i], pred_ys[i]))
        MAPE_.append(MAPE(true_ys[i], pred_ys[i])) 
        StDev.append(stats.tstd(y_test[y_test.columns[i]].values))
        
    # Create dataframe along with the name of the algorithm
    DF = pd.DataFrame({'Column Name': colName, 'MAE': MAE_, 'MAPE': MAPE_, 'St. Deviation': StDev})
    DF.name = algorithmName
    
    return DF

In [7]:
# Call this after the main() function
# metrics = variable the main() function is stored in

def all_metrics(metrics):
    for i in range(len(metrics)):
        print(metrics[i].name)
        display(metrics[i])

In [8]:
# Save this function inside a variable
# For example Z = main()

def main(dataFrame, base_features, target_features, algorithms):
    
    # Empty list to append x model metrics to
    # x = number of algorithm 
    DF_model = []
    
    # Assign training and target features from selected dataframe
    X = dataFrame[base_features]
    y = dataFrame[target_features]
    
    # Loop through each algorithm and apply it to the  dataframe
    # Keep the name of the algorithm used so we can link the results to the algorithm
    for count, algorithm in enumerate(algorithms):
        regr = MultiOutputRegressor(algorithm, n_jobs = -1)
        model_ = model(X, y, regr, algorithmNames[count])
        DF_model.append(model_)
    
    return DF_model

In [9]:
# Algorithms

rfR = RandomForestRegressor()
linR = LinearRegression()
logR = LogisticRegression()
svr = SVR(kernel='rbf', gamma='scale')

regr = MultiOutputRegressor(svr, n_jobs = -1)

In [10]:
algorithms = [rfR, linR, logR, svr]

In [11]:
algorithmNames = ['Random Forest Regressor', 'Linear Regression', 'Logistic Regression', 'Support Vector Machine', \
                  'LightGBM']

In [12]:
# path = sys.argv[0]
path = '/Users/adil/Documents/Additional Work/fospha/'

In [13]:
df_original, df, df_1hoten, df_no_outliers, df_scaled, base_features, target_features = openFile(path)

In [14]:
print('Original df:             ', df_original.shape)
print('Cleaned df:              ', df.shape)
print('1 hot Encoded df:        ', df_1hoten.shape)
print('Scaled base_features df: ', df_scaled.shape)
print('No outliers df:          ', df_no_outliers. shape)

Original df:              (500, 19)
Cleaned df:               (495, 19)
1 hot Encoded df:         (495, 19)
Scaled base_features df:  (495, 19)
No outliers df:           (444, 19)


###### 1 hot Encoded df

In [15]:
metrics = main(df_1hoten, base_features, target_features, algorithms)

In [16]:
all_metrics(metrics)

Random Forest Regressor


Unnamed: 0,Column Name,MAE,MAPE,St. Deviation
0,Lifetime Post Total Reach,15400.103356,212.95181,22953.365368
1,Lifetime Post Total Impressions,40939.503356,281.663798,103865.529883
2,Lifetime Engaged Users,587.278523,129.264674,828.712649
3,Lifetime Post Consumers,448.467114,121.487512,740.374066
4,Lifetime Post Consumptions,983.376063,237.279674,2228.661941
5,Lifetime Post Impressions by people who have liked your Page,22999.993624,205.466796,91911.269302
6,Lifetime Post reach by people who like your Page,5262.51745,127.023074,7880.540362
7,Lifetime People who have liked your Page and engaged with your post,365.710067,107.634759,599.413117
8,comment,9.140268,152.899572,17.113544
9,like,162.954139,227.753668,273.813818


Linear Regression


Unnamed: 0,Column Name,MAE,MAPE,St. Deviation
0,Lifetime Post Total Reach,14116.212358,267.588263,22953.365368
1,Lifetime Post Total Impressions,36709.785273,301.31373,103865.529883
2,Lifetime Engaged Users,514.423039,189.196766,828.712649
3,Lifetime Post Consumers,426.711226,185.810533,740.374066
4,Lifetime Post Consumptions,991.673732,276.192932,2228.661941
5,Lifetime Post Impressions by people who have liked your Page,21385.896145,233.108734,91911.269302
6,Lifetime Post reach by people who like your Page,5152.568732,156.83846,7880.540362
7,Lifetime People who have liked your Page and engaged with your post,346.529847,129.330768,599.413117
8,comment,8.155898,128.953968,17.113544
9,like,151.193993,232.093323,273.813818


Logistic Regression


Unnamed: 0,Column Name,MAE,MAPE,St. Deviation
0,Lifetime Post Total Reach,13284.295302,147.356901,22953.365368
1,Lifetime Post Total Impressions,34437.791946,121.801263,103865.529883
2,Lifetime Engaged Users,597.362416,70.892111,828.712649
3,Lifetime Post Consumers,532.47651,78.68492,740.374066
4,Lifetime Post Consumptions,1014.630872,126.856925,2228.661941
5,Lifetime Post Impressions by people who have liked your Page,18869.986577,74.128997,91911.269302
6,Lifetime Post reach by people who like your Page,5170.395973,88.157389,7880.540362
7,Lifetime People who have liked your Page and engaged with your post,362.181208,74.498151,599.413117
8,comment,7.832215,62.825036,17.113544
9,like,132.610738,104.481794,273.813818


Support Vector Machine


Unnamed: 0,Column Name,MAE,MAPE,St. Deviation
0,Lifetime Post Total Reach,12439.958881,93.745584,22953.365368
1,Lifetime Post Total Impressions,31307.453061,88.01051,103865.529883
2,Lifetime Engaged Users,527.413055,107.966279,828.712649
3,Lifetime Post Consumers,465.757259,109.096873,740.374066
4,Lifetime Post Consumptions,996.619274,133.712188,2228.661941
5,Lifetime Post Impressions by people who have liked your Page,17803.640636,77.511108,91911.269302
6,Lifetime Post reach by people who like your Page,4805.679742,78.88572,7880.540362
7,Lifetime People who have liked your Page and engaged with your post,336.705194,83.998193,599.413117
8,comment,7.212269,57.646672,17.113544
9,like,126.246433,173.288419,273.813818


###### Scaled base_features df

In [17]:
metrics_scaled = main(df_scaled, base_features, target_features, algorithms)

In [18]:
all_metrics(metrics_scaled)

Random Forest Regressor


Unnamed: 0,Column Name,MAE,MAPE,St. Deviation
0,Lifetime Post Total Reach,14388.155436,202.055479,22953.365368
1,Lifetime Post Total Impressions,38884.564228,244.189123,103865.529883
2,Lifetime Engaged Users,567.52349,137.030068,828.712649
3,Lifetime Post Consumers,491.945503,109.991524,740.374066
4,Lifetime Post Consumptions,1028.885235,155.802422,2228.661941
5,Lifetime Post Impressions by people who have liked your Page,20915.964206,176.630393,91911.269302
6,Lifetime Post reach by people who like your Page,5448.890604,139.809935,7880.540362
7,Lifetime People who have liked your Page and engaged with your post,359.081208,104.924651,599.413117
8,comment,8.039262,115.38417,17.113544
9,like,166.161521,211.630044,273.813818


Linear Regression


Unnamed: 0,Column Name,MAE,MAPE,St. Deviation
0,Lifetime Post Total Reach,14116.212358,267.588263,22953.365368
1,Lifetime Post Total Impressions,36709.785273,301.31373,103865.529883
2,Lifetime Engaged Users,514.423039,189.196766,828.712649
3,Lifetime Post Consumers,426.711226,185.810533,740.374066
4,Lifetime Post Consumptions,991.673732,276.192932,2228.661941
5,Lifetime Post Impressions by people who have liked your Page,21385.896145,233.108734,91911.269302
6,Lifetime Post reach by people who like your Page,5152.568732,156.83846,7880.540362
7,Lifetime People who have liked your Page and engaged with your post,346.529847,129.330768,599.413117
8,comment,8.155898,128.953968,17.113544
9,like,151.193993,232.093323,273.813818


Logistic Regression


Unnamed: 0,Column Name,MAE,MAPE,St. Deviation
0,Lifetime Post Total Reach,13284.295302,147.356901,22953.365368
1,Lifetime Post Total Impressions,34437.791946,121.801263,103865.529883
2,Lifetime Engaged Users,597.362416,70.892111,828.712649
3,Lifetime Post Consumers,532.47651,78.68492,740.374066
4,Lifetime Post Consumptions,1014.630872,126.856925,2228.661941
5,Lifetime Post Impressions by people who have liked your Page,18869.986577,74.128997,91911.269302
6,Lifetime Post reach by people who like your Page,5170.395973,88.157389,7880.540362
7,Lifetime People who have liked your Page and engaged with your post,362.181208,74.498151,599.413117
8,comment,7.832215,62.825036,17.113544
9,like,132.610738,104.481794,273.813818


Support Vector Machine


Unnamed: 0,Column Name,MAE,MAPE,St. Deviation
0,Lifetime Post Total Reach,12439.958881,93.745584,22953.365368
1,Lifetime Post Total Impressions,31307.453061,88.01051,103865.529883
2,Lifetime Engaged Users,527.413055,107.966279,828.712649
3,Lifetime Post Consumers,465.757259,109.096873,740.374066
4,Lifetime Post Consumptions,996.619274,133.712188,2228.661941
5,Lifetime Post Impressions by people who have liked your Page,17803.640636,77.511108,91911.269302
6,Lifetime Post reach by people who like your Page,4805.679742,78.88572,7880.540362
7,Lifetime People who have liked your Page and engaged with your post,336.705194,83.998193,599.413117
8,comment,7.212269,57.646672,17.113544
9,like,126.246433,173.288419,273.813818


###### No outliers df

In [19]:
No_Outliers_metrics = main(df_no_outliers, base_features, target_features, algorithms)

In [20]:
all_metrics(No_Outliers_metrics)

Random Forest Regressor


Unnamed: 0,Column Name,MAE,MAPE,St. Deviation
0,Lifetime Post Total Reach,7027.204963,155.553722,10079.241935
1,Lifetime Post Total Impressions,15361.428607,206.533122,17270.276407
2,Lifetime Engaged Users,322.200373,148.843199,561.224046
3,Lifetime Post Consumers,272.430597,132.245065,537.552101
4,Lifetime Post Consumptions,551.322388,184.150624,916.951856
5,Lifetime Post Impressions by people who have liked your Page,6280.87245,119.267846,8277.840741
6,Lifetime Post reach by people who like your Page,2739.866318,92.737636,4132.535029
7,Lifetime People who have liked your Page and engaged with your post,203.24291,118.862832,429.976939
8,comment,4.802985,90.191113,5.341723
9,like,87.337687,206.499807,104.880366


Linear Regression


Unnamed: 0,Column Name,MAE,MAPE,St. Deviation
0,Lifetime Post Total Reach,8395.296087,210.057428,10079.241935
1,Lifetime Post Total Impressions,15294.429129,224.808635,17270.276407
2,Lifetime Engaged Users,328.515275,169.182651,561.224046
3,Lifetime Post Consumers,293.441605,157.968708,537.552101
4,Lifetime Post Consumptions,497.827716,199.205713,916.951856
5,Lifetime Post Impressions by people who have liked your Page,6258.635986,126.31245,8277.840741
6,Lifetime Post reach by people who like your Page,3068.127435,110.071594,4132.535029
7,Lifetime People who have liked your Page and engaged with your post,222.667295,138.6461,429.976939
8,comment,4.122611,79.332439,5.341723
9,like,83.207141,207.93387,104.880366


Logistic Regression


Unnamed: 0,Column Name,MAE,MAPE,St. Deviation
0,Lifetime Post Total Reach,6096.38806,83.759919,10079.241935
1,Lifetime Post Total Impressions,9483.716418,63.565825,17270.276407
2,Lifetime Engaged Users,418.955224,91.455954,561.224046
3,Lifetime Post Consumers,388.343284,82.176036,537.552101
4,Lifetime Post Consumptions,588.186567,105.226184,916.951856
5,Lifetime Post Impressions by people who have liked your Page,7007.559701,76.13247,8277.840741
6,Lifetime Post reach by people who like your Page,3595.738806,73.016941,4132.535029
7,Lifetime People who have liked your Page and engaged with your post,228.925373,86.641722,429.976939
8,comment,4.559701,77.61194,5.341723
9,like,85.865672,98.42711,104.880366


Support Vector Machine


Unnamed: 0,Column Name,MAE,MAPE,St. Deviation
0,Lifetime Post Total Reach,5451.224402,74.085543,10079.241935
1,Lifetime Post Total Impressions,9164.978744,72.030025,17270.276407
2,Lifetime Engaged Users,363.403541,155.52989,561.224046
3,Lifetime Post Consumers,339.907521,141.637423,537.552101
4,Lifetime Post Consumptions,566.949233,169.959333,916.951856
5,Lifetime Post Impressions by people who have liked your Page,5313.928485,64.896924,8277.840741
6,Lifetime Post reach by people who like your Page,2805.069418,65.856076,4132.535029
7,Lifetime People who have liked your Page and engaged with your post,242.755573,117.188599,429.976939
8,comment,3.452898,46.982226,5.341723
9,like,73.113401,188.041487,104.880366
