Research Question / Hypothesis
----

What are the most import features to predict media influence (Total Interaction) for Facebook Page.



Load Data
-----

In [1]:
reset -sf

In [3]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [4]:
facebook_df = pd.read_csv('dataset_Facebook.csv', sep=';')

In [5]:
type(facebook_df)

pandas.core.frame.DataFrame

In [75]:
facebook_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
Page total likes,139441,139441,139441,139441,139441,139441,139441,139441,139441,139441,...,85979,85979,85979,85093,85093,85093,81370,81370,81370,81370
Type,Photo,Status,Photo,Photo,Photo,Status,Photo,Photo,Status,Photo,...,Photo,Photo,Link,Photo,Photo,Photo,Photo,Photo,Photo,Photo
Category,2,2,3,2,2,2,3,3,2,3,...,3,3,1,3,3,3,2,1,3,2
Post Month,12,12,12,12,12,12,12,12,12,12,...,1,1,1,1,1,1,1,1,1,1
Post Weekday,4,3,3,2,2,1,1,7,7,6,...,6,6,5,1,7,7,5,5,4,4
Post Hour,3,10,3,10,3,9,3,9,3,10,...,11,3,11,2,10,2,8,2,11,4
Paid,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
Lifetime Post Total Reach,2752,10460,2413,50128,7244,10472,11692,13720,11844,4694,...,5280,6184,45920,8412,5400,4684,3480,3778,4156,4188
Lifetime Post Total Impressions,5091,19057,4373,87991,13594,20849,19479,24137,22538,8668,...,8703,10228,5808,13960,9218,7536,6229,7216,7564,7292
Lifetime Engaged Users,178,1457,177,2211,671,1191,481,537,1530,280,...,951,956,753,1179,810,733,537,625,626,564


Feature Engineering
----

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.impute  import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LinearRegression, Lasso, Ridge, ElasticNet, HuberRegressor, BayesianRidge
from sklearn.pipeline      import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection  import RandomizedSearchCV

X = facebook_df.drop(['Total Interactions'], axis=1)
y = facebook_df['Total Interactions']
X.shape, y.shape

((500, 18), (500,))

#### convert categorical to numerical and deal with missing Data 

In [24]:
categorical_features = (X.dtypes == object)

numeric_features = (X.dtypes != object)

In [25]:
numeric_transformer = Pipeline(steps=[
    ('knn_imputer', KNNImputer()),
    ('scaler', StandardScaler())])

# Ignore categorical none values
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# preprocess the features
preprocessor = ColumnTransformer([('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [26]:
# Build the pipelint
pipe = Pipeline(steps=[('preprocessor', preprocessor)])

Cross validation
----

In [51]:
class DummyEstimator(BaseEstimator):
    def fit(self): pass
    def score(self): pass
    
# Create a pipeline

pipe = Pipeline([('preprocessor', preprocessor),
                 ('lr', DummyEstimator())]) # Placeholder Estimator

In [52]:
search_space = [{'lr': [LinearRegression()]},
                {'lr': [Lasso()]},
                {'lr': [BayesianRidge()]}]

In [53]:
lr_algos_rand = RandomizedSearchCV(estimator=pipe, 
                                    param_distributions=search_space, 
                                    n_iter=25,
                                    cv=5, 
                                    n_jobs=-1,
                                    verbose=1)

In [54]:
best_model = lr_algos_rand.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    3.0s finished


In [72]:
best_model.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('preprocessor',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('knn_imputer', KNNImputer()),
                                                    ('scaler', StandardScaler())]),
                                    Page total likes                                                        True
   Type                                                                   False
   Category                                                                True
   Post Month                                                              True
   Post Weekday                                                            True
   Post Hour                                                               True
   Paid                                                                    True
   Lifetime Post Total Reach                                               True
   Lifetime Post To

### The best model

In [56]:

best_model.best_estimator_.get_params()['lr']

LinearRegression()

Evalustion Metric
----

In [62]:
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('lr', LinearRegression())])

X_train, X_validation, y_train, y_validation = train_test_split(X,y)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_validation)

In [63]:
def mean_cubed_error(y_true, y_pred):
    output_errors = np.average(np.abs((y_true - y_pred) ** 3), axis=0)
    return output_errors

In [64]:
mse = mean_squared_error(y_validation, y_pred)
print(f"Mean squared error: {mse:>12,.2f}")
mce = mean_cubed_error(y_validation, y_pred) 
print(f"Mean cubed error: {mce:>12,.2f}")

Mean squared error:         0.06
Mean cubed error:         0.03


In [66]:
from sklearn.metrics import explained_variance_score 

print(f"Explained_variance_score: {explained_variance_score(y_validation, y_pred)}")

Explained_variance_score: 0.9999992516955262


In [67]:
from sklearn.metrics import max_error
print(f"Max Error: {max_error(y_validation, y_pred):.5f}")

Max Error: 1.09788


In [68]:
from sklearn.metrics import mean_absolute_error
print(f"Mean abusolute error: { mean_absolute_error(y_validation, y_pred): .5}")

Mean abusolute error:  0.19258


In [70]:
from sklearn.metrics import explained_variance_score
print(f"Explained variance score {explained_variance_score(y_validation, y_pred): .5}")

Explained variance score  1.0


In [71]:
from sklearn.metrics import r2_score
print(f"R2 score: {r2_score(y_validation, y_pred):.5f}")

R2 score: 1.00000


In [73]:
from sklearn.metrics import mean_absolute_error
print(f"mean absolute error: {mean_absolute_error(y_validation, y_pred):.5f}")

mean absolute error: 0.19258
