Dataset description page: https://archive.ics.uci.edu/ml/datasets/Facebook+Comment+Volume+Dataset

# 1. Pre-processing of data

In [None]:
# Download the zipped dataset from Google Drive, addressed by its file id.
# NOTE(review): the unzip step below expects the archive at /content/Dataset.zip,
# i.e. a Colab session whose working directory is /content — confirm when running elsewhere.
! gdown 1ak_rxZ1XrenunEZK62t9U61XYe8mp-EU

# Extract the archive into "/content/unzipped_data" (-d selects the target directory).
# In Colab the folder can be browsed through the Files icon in the leftmost column.
# -u only extracts files that are missing on disk or newer than the existing copy,
# so re-running this cell does not unpack everything again.
# The first run prints one line per extracted file; re-run the cell for a short output.
!unzip -u "/content/Dataset.zip" -d "/content/unzipped_data"

In [None]:
# Imports
import pandas as pd
import numpy as np
import sklearn.preprocessing

# Load the Variant-1 training features; the CSV carries no header row,
# so pandas labels the columns 0..n-1 until they are renamed later on.
df1 = pd.read_csv(
    "/content/unzipped_data/Dataset/Training/Features_Variant_1.csv",
    header=None,
)


In [None]:
# Positional index -> readable name for every column of the header-less CSV.
# The names are generated from their repeating patterns instead of being
# spelled out one by one; the resulting dict has keys 0..53 in order.
_WEEKDAYS = ("sun", "mon", "tue", "wed", "thu", "fri", "sat")
_CC_STATS = ("min", "max", "ave", "med", "std")

_ordered_names = [
    "page_likes",
    "page_checkins",  # page visits
    "page_talking_about",
    "page_category",
]
# Columns 4-28: the five summary statistics, grouped per CC feature (CC1..CC5).
_ordered_names += [f"{stat}_CC{k}" for k in range(1, 6) for stat in _CC_STATS]
# Columns 29-33: the CC features themselves.
_ordered_names += [f"CC{k}" for k in range(1, 6)]
# Columns 34-38: post-level attributes.
_ordered_names += [
    "base_time",
    "post_length",
    "post_share_count",
    "post_isPromoted",
    "h_local",
]
# Columns 39-45 and 46-52: one indicator column per weekday.
_ordered_names += [f"published_on_{day}" for day in _WEEKDAYS]
_ordered_names += [f"base_day_{day}" for day in _WEEKDAYS]
# Column 53: the prediction target.
_ordered_names.append("num_comments_in_next_h")

column_names = dict(enumerate(_ordered_names))

In [None]:
# Define normalization function
from sklearn.preprocessing import MinMaxScaler

def norm(df, scaler=None):
  """Min-max scale the feature columns of ``df``; the target stays unscaled.

  Args:
    df: DataFrame holding the feature columns plus 'num_comments_in_next_h'.
    scaler: Optional, already fitted transformer exposing ``transform``
      (e.g. a MinMaxScaler fitted on the training features). When omitted,
      a new MinMaxScaler is fitted on ``df`` itself.

  Returns:
    A new DataFrame with the scaled feature columns followed by the
    untouched 'num_comments_in_next_h' column, carrying the index of ``df``.
  """
  df_X = df.drop(columns='num_comments_in_next_h')
  df_y = df[['num_comments_in_next_h']]
  if scaler is None:
    scaler = MinMaxScaler().fit(df_X)
  # transform() returns a bare array: restore the column labels *and* the
  # index, otherwise the concat below misaligns features and target for any
  # frame whose index is not 0..n-1.
  df_X_norm = pd.DataFrame(scaler.transform(df_X), columns = df_X.columns, index = df_X.index)
  df_norm = pd.concat([df_X_norm, df_y], axis=1)
  return df_norm


# Rename columns - drop statistics columns
df1 = df1.rename(columns = column_names)
# Drop by name rather than by position so that re-running this cell is a
# no-op; a positional 4:29 slice would cut into the already-reduced frame.
statistic_columns = [column_names[i] for i in range(4, 29)]
df1 = df1.drop(columns = statistic_columns, errors = 'ignore')

# Convert category to dummy variables
dummy_df1 = pd.get_dummies(df1, columns=['page_category'])

# Split train & test data
df1_train = dummy_df1.sample(frac = 0.8, random_state=3244)
df1_test = dummy_df1.drop(df1_train.index)

df1_test_reindex = df1_test.reset_index(drop=True)
df1_train_reindex = df1_train.reset_index(drop=True)

# Fit the scaler on the training features only and reuse it for the test
# split: both splits then share one scale and no test statistics leak into
# the evaluation. Test values outside the training range may fall outside [0, 1].
feature_scaler = MinMaxScaler().fit(df1_train_reindex.drop(columns='num_comments_in_next_h'))
df1_train_norm = norm(df1_train_reindex, scaler=feature_scaler)
df1_test_norm = norm(df1_test_reindex, scaler=feature_scaler)

# Separate target (single-column frame) and features, both cast to float
y_train = pd.DataFrame(df1_train_norm.loc[:,["num_comments_in_next_h"]], dtype=float)
X_train = pd.DataFrame(df1_train_norm.loc[:, df1_train_norm.columns != "num_comments_in_next_h"], dtype=float)

y_test = pd.DataFrame(df1_test_norm.loc[:,["num_comments_in_next_h"]], dtype=float)
X_test = pd.DataFrame(df1_test_norm.loc[:, df1_test_norm.columns != "num_comments_in_next_h"], dtype=float)

# 2. MLP Model (sklearn)

In [None]:
# Hyperparameter tuning
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

# Fixed seed so that repeated searches initialise every candidate identically
regr = MLPRegressor(max_iter=100, random_state=1)
parameter_space = {
    # Note the trailing comma in (4,): a bare (4) is just the integer 4
    'hidden_layer_sizes': [(256,128,64,16,4),(133,128,64,4), (128, 64, 16, 4), (133, 64, 4), (64,32,4), (64,4), (4,)],
    # 'activation': [ 'relu', 'logistic', 'tanh'], after running, relu gives the best performance, so we omit the other two here for faster computation
    'activation': [ 'relu'],
    # 'solver': ['adam', 'sgd'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant'],
}

# 3-fold cross-validated search over the grid, using all available cores
clf = GridSearchCV(regr, parameter_space, n_jobs=-1, cv=3)
# y_train is a single-column frame; flatten it to 1-D to avoid sklearn's
# DataConversionWarning about column-vector targets
clf.fit(X_train, y_train.to_numpy().ravel())

# Best parameter set
print('Best parameters found:\n', clf.best_params_)

# All results: mean cross-validation score and twice its standard deviation
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print(f"{mean:.3f} (+/-{std * 2:.3f}) for {params!r}")

KeyboardInterrupt: ignored

The results are copied below because the grid search takes more than 30 minutes to run:

Best parameters found:
 {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (64, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.039 (+/-1.223) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (256, 128, 64, 16, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.311 (+/-0.581) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (133, 128, 64, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.353 (+/-0.507) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (128, 64, 16, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.510 (+/-0.226) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (133, 64, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.517 (+/-0.277) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (64, 32, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.543 (+/-0.130) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (64, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.276 (+/-0.096) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': 4, 'learning_rate': 'constant', 'solver': 'adam'}


-0.070 (+/-0.888) for {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (256, 128, 64, 16, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.095 (+/-0.720) for {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (133, 128, 64, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.428 (+/-0.365) for {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (128, 64, 16, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.446 (+/-0.366) for {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (133, 64, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.436 (+/-0.233) for {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (64, 32, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.369 (+/-0.494) for {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (64, 4), 'learning_rate': 'constant', 'solver': 'adam'}


0.248 (+/-0.292) for {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': 4, 'learning_rate': 'constant', 'solver': 'adam'}

In [None]:
from sklearn.neural_network import MLPRegressor

# Using the best performing hyperparameters from the grid search, spelled out
# explicitly (they coincide with the sklearn defaults except for the layers).
regr = MLPRegressor(
    random_state = 1,
    max_iter = 600,
    hidden_layer_sizes = (64, 4),
    activation = 'relu',
    solver = 'adam',
    alpha = 0.0001,
    learning_rate = 'constant',
# y_train is a single-column frame; flatten it to 1-D to avoid sklearn's
# DataConversionWarning about column-vector targets
).fit(X_train, y_train.to_numpy().ravel())

# score() reports the coefficient of determination on the held-out split
score = regr.score(X_test, y_test)
print("R2 score:", score)

  y = column_or_1d(y, warn=True)


R2 score: 0.3706078974251238


In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out features and compare against the true targets
y_pred = regr.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("MSE:", mse)

805.8783915309074
