Research Question / Hypothesis
----

What are the most important features to predict media influence (Total Interaction) for Facebook Page.



Load Data
-----

In [8]:
reset -sf

In [9]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [10]:
facebook_df = pd.read_csv('dataset_Facebook.csv', sep=';')
type(facebook_df)

pandas.core.frame.DataFrame

In [11]:
facebook_df.shape

(500, 19)

Feature Engineering
----

In [17]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.impute  import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LinearRegression, Lasso, Ridge, ElasticNet, HuberRegressor, BayesianRidge
from sklearn.pipeline      import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection  import RandomizedSearchCV
from sklearn.base import BaseEstimator

X = facebook_df.drop(['Total Interactions'], axis=1)
y = facebook_df['Total Interactions']
X.shape, y.shape

((500, 18), (500,))

#### convert categorical to numerical and deal with missing Data 

In [18]:
categorical_features = (X.dtypes == object)

numeric_features = (X.dtypes != object)

numeric_transformer = Pipeline(steps=[
    ('knn_imputer', KNNImputer()),
    ('scaler', StandardScaler())])

# Ignore categorical none values
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# preprocess the features
preprocessor = ColumnTransformer([('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Build the preprocessor pipeline
pipe = Pipeline(steps=[('preprocessor', preprocessor)])

Cross Validation and Model Selection
----

In [24]:
class DummyEstimator(BaseEstimator):
    def fit(self): pass
    def score(self): pass
    
# Create a pipeline

pipe = Pipeline([('preprocessor', preprocessor),
                 ('lr', DummyEstimator())]) # Placeholder Estimator

search_space = [{'lr': [LinearRegression()]},
                {'lr': [Lasso()]},
                {'lr': [BayesianRidge()]}]

lr_algos_rand = RandomizedSearchCV(estimator=pipe, 
                                    param_distributions=search_space, 
                                    n_iter=25,
                                    cv=5, 
                                    n_jobs=-1,
                                    verbose=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

best_model = lr_algos_rand.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    2.3s finished


In [25]:
best_model.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('preprocessor',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('knn_imputer', KNNImputer()),
                                                    ('scaler', StandardScaler())]),
                                    Page total likes                                                        True
   Type                                                                   False
   Category                                                                True
   Post Month                                                              True
   Post Weekday                                                            True
   Post Hour                                                               True
   Paid                                                                    True
   Lifetime Post Total Reach                                               True
   Lifetime Post To

### The best model

In [26]:
best_model.best_estimator_.get_params()['lr']

LinearRegression()

Evalustion Metric
----

In [27]:
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('lr', LinearRegression())])

X_train, X_validation, y_train, y_validation = train_test_split(X,y)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_validation)

In [31]:
def mean_cubed_error(y_true, y_pred):
    output_errors = np.average(np.abs((y_true - y_pred) ** 3), axis=0)
    return output_errors

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_validation, y_pred)
print(f"Mean squared error: {mse:>12,.2f}")
mce = mean_cubed_error(y_validation, y_pred) 
print(f"Mean cubed error: {mce:>12,.2f}")

from sklearn.metrics import explained_variance_score 
print(f"Explained_variance_score: {explained_variance_score(y_validation, y_pred):.5}")

from sklearn.metrics import max_error
print(f"Max Error: {max_error(y_validation, y_pred):.5f}")

from sklearn.metrics import mean_absolute_error
print(f"Mean abusolute error: { mean_absolute_error(y_validation, y_pred): .5}")

from sklearn.metrics import explained_variance_score
print(f"Explained variance score {explained_variance_score(y_validation, y_pred): .5}")

from sklearn.metrics import r2_score
print(f"R2 score: {r2_score(y_validation, y_pred):.5f}")

from sklearn.metrics import mean_absolute_error
print(f"mean absolute error: {mean_absolute_error(y_validation, y_pred):.5f}")

Mean squared error:         0.13
Mean cubed error:         0.20
Explained_variance_score: 1.0
Max Error: 2.42729
Mean abusolute error:  0.19705
Explained variance score  1.0
R2 score: 1.00000
mean absolute error: 0.19705
