Research Question / Hypothesis
----

What are the most important features to predict media influence (Total Interaction) for Facebook Page.



Load Data
-----

In [41]:
reset -sf

In [42]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [43]:
# data type is ready
facebook_df = pd.read_csv('dataset_Facebook.csv', sep=';')
type(facebook_df)

pandas.core.frame.DataFrame

In [56]:
# data shape is appropriate
facebook_df.shape

(500, 19)

Feature Engineering'
----

In [45]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.impute  import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model  import LinearRegression, Lasso, Ridge, ElasticNet, HuberRegressor, BayesianRidge
from sklearn.pipeline      import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection  import RandomizedSearchCV
from sklearn.base import BaseEstimator
from sklearn import feature_selection

# split X and y data set
X = facebook_df.drop(['Total Interactions'], axis=1)
y = facebook_df['Total Interactions']
X.shape, y.shape

((500, 18), (500,))

#### convert categorical to numerical and deal with missing Data 

In [46]:
# get the categorical features
categorical_features = (X.dtypes == object)

# get the numerical features

numeric_features = (X.dtypes != object)

# build a pipeling to preprocess numerical variables
numeric_transformer = Pipeline(steps=[
    ('knn_imputer', KNNImputer()),  # fill missing values with k nearest neighbor
    ('scaler', StandardScaler())])  # standarlize numerical values

# use OneHotEncoder to deal with missing categorical values
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# preprocessor for the data set, combine categorial and numerical preprocessors. 
preprocessor = ColumnTransformer([('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

Cross Validation and Model Selection
----

In [52]:
# dummy estimator will be a place holder in the pipepline
class DummyEstimator(BaseEstimator):
    def fit(self): pass
    def score(self): pass
    
# feature selection
top_feat = feature_selection.SelectKBest()

# Create a pipeline
pipe = Pipeline([('preprocessor', preprocessor), # preprocessor
                 ('feat', top_feat), # feature selection
                 ('lr', DummyEstimator())]) # Place holder Estimator

# build a search space
# put in three models I am going to search. From my model check home work, I found those three performs the best
search_space = [{'lr': [LinearRegression()]}, 
                {'lr': [Lasso()]},
                {'lr': [BayesianRidge()]}]

# random search cross validaation
lr_algos_rand = RandomizedSearchCV(estimator=pipe, # the pipeline
                                    param_distributions=search_space, # para search space
                                    n_iter=25, # number of iterations 
                                    cv=5,  # 5 folds to run cross validation
                                    n_jobs=-1) # use all the cores to run the code

# split train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

# fit the model
best_model = lr_algos_rand.fit(X_train, y_train)

In [53]:
# get some information of the best model
best_model.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('preprocessor',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('knn_imputer', KNNImputer()),
                                                    ('scaler', StandardScaler())]),
                                    Page total likes                                                        True
   Type                                                                   False
   Category                                                                True
   Post Month                                                              True
   Post Weekday                                                            True
   Post Hour                                                               True
   Paid                                                                    True
   Lifetime Post Total Reach                                               True
   Lifetime Post To

### The best model

In [54]:
best_model.best_estimator_.get_params()['lr']

LinearRegression()

### Important Features

In [50]:
# get the best features from the selected model
best_features = best_model.best_estimator_.named_steps["feat"].get_support(indices=True)

# print out the best features
facebook_df.columns[best_features]

Index(['Paid', 'Lifetime Post Total Reach', 'Lifetime Post Total Impressions',
       'Lifetime Engaged Users', 'Lifetime Post Consumptions',
       'Lifetime Post Impressions by people who have liked your Page',
       'Lifetime Post reach by people who like your Page',
       'Lifetime People who have liked your Page and engaged with your post',
       'comment', 'like'],
      dtype='object')

Evalustion Metric
----

In [40]:
# a pipeline for our best model
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('lr', LinearRegression())])

# fit the best model
pipe.fit(X_train, y_train)

# get the predicted values
y_pred = pipe.predict(X_test)

def mean_cubed_error(y_true, y_pred):
    output_errors = np.average(np.abs((y_true - y_pred) ** 3), axis=0)
    return output_errors

# Mean squared error: to check how close a regression line is to the test set of points 
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean squared error: {mse:>12,.2f}")

# Mean cubed error: check the accuracy
mce = mean_cubed_error(y_test, y_pred) 
print(f"Mean cubed error: {mce:>12,.2f}")

# Max Error: check the max error in the test set
from sklearn.metrics import max_error
print(f"Max Error: {max_error(y_test, y_pred):.5f}")

# Mean abusolute error: Check the accumulated errors from the test set 
from sklearn.metrics import mean_absolute_error
print(f"Mean abusolute error: { mean_absolute_error(y_test, y_pred): .5}")

# R2 score: check how much the model can explain the result
from sklearn.metrics import r2_score
print(f"R2 score: {r2_score(y_test, y_pred):.5f}")


Mean squared error:         0.06
Mean cubed error:         0.03
Max Error: 1.07447
Mean abusolute error:  0.18273
R2 score: 1.00000


### Summary 

The best model is linear regression, and the important features inclues but not limited: Paid, Lifetime Post Total Reach, Lifetime Engaged Users, Lifetime Post Impressions by people who have liked your Page, Lifetime Post reach by people who like your Page, comment, and like.

This is a pretty straightforward linear regression problem, thus the model is expected to perform well. Also the important features auto selected by the algorithm also makes sense. 

From business point of view, I think this is important for both Facebook and Facebook customers. Facebook can proves that their paid promition services is significant to total interaction in a Page. Customers can also use this to make better decision on how to promote their products. 

The model is good enough, but not perfect. For example: some features such as post month, post weekday, post hour may be co-dependent variables. We can furthur investigate the features


