In [1]:
import math
from datetime import datetime
import numpy as np
import pandas as pd
import seaborn as sns # for visualiation
import matplotlib.pyplot as plt # plotting
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline

df = pd.read_csv('./facebook_with_reactions.csv')
outcomes = df[['Rating']]

## drop useless columns 
df = df.drop(columns=['Debate', 'status_link', 'permalink_url', 'Post URL', 
                      'status_message', 'link_name', 'share_count', 'Unnamed: 0', 'account_id',
                      'status_id', 'status_type', 'status_published'])

## change Rating to numeric for classification
df.Rating = df.Rating.replace(['mostly false', 'no factual content', 'mixture of true and false', 'mostly true'], [0,1,2,3]).astype(int)

## remove spaces in columns
df.columns = [c.replace(' ', '_') for c in df.columns]

## make dummy columns (make categories numeric)
df = pd.get_dummies(df)

In [3]:
df.groupby('Rating').count()
# number of records for each outcome

Unnamed: 0_level_0,post_id,reaction_count,comment_count,num_reactions,num_comments,num_shares,num_likes,num_loves,num_wows,num_hahas,...,Date_Published_2016-09-20,Date_Published_2016-09-21,Date_Published_2016-09-22,Date_Published_2016-09-23,Date_Published_2016-09-26,Date_Published_2016-09-27,Post_Type_link,Post_Type_photo,Post_Type_text,Post_Type_video
Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,78,78,78,78,78,78,78,78,78,78,...,78,78,78,78,78,78,78,78,78,78
1,257,257,257,257,257,257,257,257,257,257,...,257,257,257,257,257,257,257,257,257,257
2,219,219,219,219,219,219,219,219,219,219,...,219,219,219,219,219,219,219,219,219,219
3,1612,1612,1612,1612,1612,1612,1612,1612,1612,1612,...,1612,1612,1612,1612,1612,1612,1612,1612,1612,1612


In [4]:
# Because the data is unbalanced we need to weight the outcomes differently.
# Below is the rough estimates if we were to make each outcome worth the same. 
# Here we say that `mostly true` is stays the same and all other outcomes are weighted
# to equal the same as `mostly true`

###### THESE CLASS WEIGHTS ARE SUBJECT TO CHANGE BASED ON MODEL PERFORMANCE ######

## mostly true = 1
## mix of T/F ~ 7
## mostly false ~ 21
## no facts ~ 6

class_weight = {0: 21.,
                1: 6.,
                2: 7.,
                3: 1.}

## insert above parameter into .fit() function of model as **class_weight**

In [5]:
df.columns

Index(['post_id', 'Rating', 'reaction_count', 'comment_count', 'num_reactions',
       'num_comments', 'num_shares', 'num_likes', 'num_loves', 'num_wows',
       'num_hahas', 'num_sads', 'num_angrys', 'Category_left',
       'Category_mainstream', 'Category_right', 'Page_ABC News Politics',
       'Page_Addicting Info', 'Page_CNN Politics', 'Page_Eagle Rising',
       'Page_Occupy Democrats', 'Page_Politico', 'Page_Right Wing News',
       'Page_The Other 98%', 'Date_Published_2016-09-19',
       'Date_Published_2016-09-20', 'Date_Published_2016-09-21',
       'Date_Published_2016-09-22', 'Date_Published_2016-09-23',
       'Date_Published_2016-09-26', 'Date_Published_2016-09-27',
       'Post_Type_link', 'Post_Type_photo', 'Post_Type_text',
       'Post_Type_video'],
      dtype='object')

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [21]:
tr_feat, test_feat, tr_out, test_out = train_test_split(
   df.drop(columns='Rating'),      # features
   df.Rating,    # outcome
   random_state=0,
   test_size=0.20, # percentage of data to use as the test set
   
)

In [36]:
clf = LogisticRegression(random_state=0, solver='sag',
                         multi_class='multinomial').fit(df.drop(columns=['Rating']), df.Rating)

In [37]:
preds = clf.predict(df.drop(columns=['Rating']))

clf.predict_proba(df.drop(columns=['Rating'])) 

clf.score(df.drop(columns=['Rating']), df.Rating)

0.7442289935364728

In [38]:
preds

array([3, 3, 3, ..., 3, 3, 3])