In [2]:
import math
from datetime import datetime
import numpy as np
import pandas as pd
import seaborn as sns # for visualiation
import matplotlib.pyplot as plt # plotting
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline

df = pd.read_csv('./facebook_with_reactions.csv')
outcomes = df[['Rating']]

## drop useless columns 
df = df.drop(columns=['Debate', 'status_link', 'permalink_url', 'Post URL', 
                      'status_message', 'link_name', 'share_count', 'Unnamed: 0', 'account_id',
                      'status_id', 'status_type', 'status_published', 'post_id', 'reaction_count'])

## change Rating to numeric for classification
df.Rating = df.Rating.replace(['mostly false', 'no factual content', 'mixture of true and false', 'mostly true'], [0,0,0,1]).astype(int)

## remove spaces in columns
df.columns = [c.replace(' ', '_') for c in df.columns]

## make dummy columns (make categories numeric)
df = pd.get_dummies(df)

In [3]:
#df = df.drop(df.query('Rating == 1').sample(frac=.5).index)
df.groupby('Rating').count()
# number of records for each outcome

Unnamed: 0_level_0,comment_count,num_reactions,num_comments,num_shares,num_likes,num_loves,num_wows,num_hahas,num_sads,num_angrys,...,Date_Published_2016-09-20,Date_Published_2016-09-21,Date_Published_2016-09-22,Date_Published_2016-09-23,Date_Published_2016-09-26,Date_Published_2016-09-27,Post_Type_link,Post_Type_photo,Post_Type_text,Post_Type_video
Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,554,554,554,554,554,554,554,554,554,554,...,554,554,554,554,554,554,554,554,554,554
1,1612,1612,1612,1612,1612,1612,1612,1612,1612,1612,...,1612,1612,1612,1612,1612,1612,1612,1612,1612,1612


In [347]:
# Because the data is unbalanced we need to weight the outcomes differently.
# Below is the rough estimates if we were to make each outcome worth the same. 
# Here we say that `mostly true` is stays the same and all other outcomes are weighted
# to equal the same as `mostly true`

###### THESE CLASS WEIGHTS ARE SUBJECT TO CHANGE BASED ON MODEL PERFORMANCE ######

## mostly true = 1
## mix of T/F ~ 7
## mostly false ~ 21
## no facts ~ 6

class_weight = {0: 21.,
                1: 6.,
                2: 7.,
                3: 1.}

## insert above parameter into .fit() function of model as **class_weight**

In [4]:
import mord as m
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

tr_feat, test_feat, tr_out, test_out = train_test_split(
   df.drop(columns='Rating'),      # features
   df.Rating,    # outcome
   random_state=0,
   test_size=0.20, # percentage of data to use as the test set
   
)

In [7]:
clf = LogisticRegression(random_state=0, solver='sag',
                         multi_class='multinomial').fit(tr_feat, tr_out)

preds = clf.predict(test_feat)

clf.predict_proba(test_feat) 

clf.score(test_feat, test_out)

0.7580645161290323

In [12]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

from sklearn.neural_network import MLPRegressor


from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline 

In [70]:
### Multi-layer Perceptron
mod = MLPRegressor(random_state=0)

scal = RobustScaler()
select_k_best = SelectKBest(f_classif)
m_pipe = make_pipeline(scal, select_k_best, mod)

m_param_grid = {
                'mlpregressor__activation':['relu', 'logistic'], 
                'mlpregressor__learning_rate':['adaptive'],
                'mlpregressor__solver':['lbfgs', 'adam'],
                'mlpregressor__hidden_layer_sizes':[(33,33,33), (500)]}
#mod.early_stopping=True # terminate training when validation score is not improving
mod.shuffle=True # shuffle samples when training

mlp = GridSearchCV(m_pipe, m_param_grid, cv=3, n_jobs=-1)
mlp.fit(tr_feat, tr_out);



In [73]:
mlp.score(test_feat, test_out)

0.34078457159389053

In [72]:
mlp.best_params_

{'mlpregressor__activation': 'logistic',
 'mlpregressor__hidden_layer_sizes': 500,
 'mlpregressor__learning_rate': 'adaptive',
 'mlpregressor__solver': 'adam'}