In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
first_post = pd.read_json('data/politicos/json/mixed.json')

In [3]:
#first_post.reset_index(inplace=True)
#first_post.head()

In [4]:
# Author- not needed in analysis, keep for now as index
first_post.set_index('author', inplace=True)
#first_post.drop('author', axis=1, inplace=True)

In [5]:
# Label / outcome variable
multi_post = first_post['total_posts']>1
first_post.drop(['total_posts', 'post_ids', 'post_datetimes', 'first_post_datetime', 'last_post_datetime'], axis=1, inplace=True)


In [6]:
# Ups/downs
first_post.rename(columns={'first_post_ups':'ups',
                           'first_post_downs':'downs'}, inplace=True)

In [7]:
# Responses
first_post.rename(columns={'first_post_avg_response_ups':'responses_ups_avg',
                           'first_post_avg_response_downs':'responses_downs_avg',
                           'first_post_total_responses':'responses_total'}, inplace=True)
first_post.drop(['first_post_responses','first_post_response_ups', 'first_post_response_downs'], axis=1, inplace=True) #not doing text analysis for now
first_post.fillna(0, inplace=True) #response stats are NaN if no responses
# responses_ups_avg is actually an interaction term multiplied by has_responses
first_post['has_responses'] = (first_post['responses_total']>0).astype(int)

In [8]:
# Body
first_post['word_count'] = first_post['first_post_body'].apply(lambda post: len(post.split()))
first_post.drop(['first_post_link_id', 'first_post_id', 'first_post_body'], axis=1, inplace=True)

In [9]:
# Parent type
first_post['is_response'] = first_post['parent_type']=='t1'
first_post['is_response'] = first_post['is_response'].astype(int)
first_post.drop('parent_type', axis=1, inplace=True)

In [10]:
# Downs and Response downs - drop bc no data in this sample
first_post.drop('responses_downs_avg', axis=1, inplace=True)
first_post.drop('downs', axis=1, inplace=True)

In [11]:
# Const
first_post = sm.add_constant(first_post)
#first_post['const'] = pd.Series(np.ones(first_post.shape[0]))

In [12]:
first_post.head()

Unnamed: 0_level_0,const,responses_ups_avg,responses_total,ups,has_responses,word_count,is_response
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
netdroid9,1,0,1,3,1,67,1
grimsley33,1,0,0,4,0,91,0
bluedice,1,0,0,0,0,8,0
Cody2,1,1,1,26,1,3,1
firebat87,1,0,0,1,0,33,1


In [13]:
logit = sm.Logit(multi_post, first_post)

In [14]:
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.638717
         Iterations 7


In [15]:
result.summary()

0,1,2,3
Dep. Variable:,total_posts,No. Observations:,768.0
Model:,Logit,Df Residuals:,761.0
Method:,MLE,Df Model:,6.0
Date:,"Wed, 12 Aug 2015",Pseudo R-squ.:,0.03055
Time:,21:59:46,Log-Likelihood:,-490.53
converged:,True,LL-Null:,-505.99
,,LLR p-value:,2.63e-05

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.0854,0.122,0.701,0.483,-0.153 0.324
responses_ups_avg,0.0132,0.034,0.387,0.699,-0.053 0.080
responses_total,-0.0401,0.089,-0.453,0.651,-0.214 0.134
ups,0.0086,0.008,1.075,0.283,-0.007 0.024
has_responses,0.6951,0.218,3.185,0.001,0.267 1.123
word_count,0.0013,0.001,1.035,0.301,-0.001 0.004
is_response,0.2361,0.156,1.515,0.130,-0.069 0.542


In [16]:
np.exp(result.params)

const                1.089170
responses_ups_avg    1.013243
responses_total      0.960665
ups                  1.008604
has_responses        2.003837
word_count           1.001313
is_response          1.266364
dtype: float64

In [17]:
result.aic

995.06899933514751

In [21]:
logit2 = sm.Logit(multi_post, sm.add_constant(first_post['has_responses']))
result2 = logit2.fit()

Optimization terminated successfully.
         Current function value: 0.642759
         Iterations 5


In [22]:
result2.summary()

0,1,2,3
Dep. Variable:,total_posts,No. Observations:,768.0
Model:,Logit,Df Residuals:,766.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 12 Aug 2015",Pseudo R-squ.:,0.02441
Time:,22:03:12,Log-Likelihood:,-493.64
converged:,True,LL-Null:,-505.99
,,LLR p-value:,6.674e-07

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.2408,0.094,2.561,0.010,0.057 0.425
has_responses,0.7774,0.160,4.874,0.000,0.465 1.090


In [23]:
result2.aic

991.27848008426326

In [24]:
np.exp(result2.params)

const            1.272277
has_responses    2.175857
dtype: float64

In [27]:
logit3 = sm.Logit(multi_post, sm.add_constant(first_post[['has_responses', 'ups']]))
result3 = logit3.fit()

Optimization terminated successfully.
         Current function value: 0.641130
         Iterations 7


In [28]:
result3.summary()

0,1,2,3
Dep. Variable:,total_posts,No. Observations:,768.0
Model:,Logit,Df Residuals:,765.0
Method:,MLE,Df Model:,2.0
Date:,"Wed, 12 Aug 2015",Pseudo R-squ.:,0.02689
Time:,22:07:00,Log-Likelihood:,-492.39
converged:,True,LL-Null:,-505.99
,,LLR p-value:,1.235e-06

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,0.2271,0.095,2.397,0.017,0.041 0.413
has_responses,0.7252,0.163,4.450,0.000,0.406 1.045
ups,0.0075,0.006,1.191,0.234,-0.005 0.020


In [29]:
result3.aic

990.77630537239315

In [30]:
np.exp(result3.params)

const            1.254951
has_responses    2.065192
ups              1.007536
dtype: float64

Don't add ups to the model. It barely improves AIC and Pseudo-Rsq. It adds extra variable to to the model. Its p-value is not significant. It coefficient is near 0.

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import 

In [32]:
rfc = RandomForestClassifier(n_estimators=10, max_features=3, max_depth=5)

In [33]:
rfc.fit(first_post, multi_post)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=5, max_features=3,
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [34]:
preds = rfc.predict(first_post)