In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.externals import joblib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Turn JSON lines into JSON array
def json_line_to_array(filename):
    """Read a JSON-lines file and return its records as one JSON array string.

    Each non-blank line of the file is taken verbatim (minus trailing
    whitespace / newline) as one array element. Blank and whitespace-only
    lines are skipped; otherwise they would become empty elements
    ("[a,,b]" or a trailing "[a,]"), which is not valid JSON.

    Parameters
    ----------
    filename : str
        Path of a file holding one JSON document per line.

    Returns
    -------
    str
        '[' + comma-joined records + ']'; '[]' when the file has no records.
    """
    with open(filename) as f:
        # Iterate the handle directly instead of readlines(): same lines,
        # no intermediate list of raw lines.
        stripped = (line.rstrip() for line in f)
        records = [record for record in stripped if record]
    return '[' + ','.join(records) + ']'

In [3]:
# Load the two balanced samples. Each file holds one JSON record per line,
# so it is first wrapped into a single JSON array string for pd.read_json.
# reddit_pos comes from the "multi" file and reddit_neg from the "single" file
# (presumably authors with several posts vs. exactly one -- confirm against
# the data-preparation step).
reddit_pos = pd.read_json(
    json_line_to_array('data/politicos/json/balanced_multi.json')
)
reddit_neg = pd.read_json(
    json_line_to_array('data/politicos/json/balanced_single.json')
)
# Alternative inputs (not the balanced_* files), kept for reference:
# reddit_pos = pd.read_json('data/politicos/json/multi.json')
# reddit_neg = pd.read_json('data/politicos/json/single.json')

# Stack both samples row-wise (concat's default axis); the original row labels
# of each frame are kept, so index values are not unique in first_post.
first_post = pd.concat([reddit_pos, reddit_neg])

In [4]:
# Feature engineering on first_post (one row per author's first comment).
# NOTE: this cell mutates first_post in place (drop / rename / fillna), so it
# cannot be re-run by itself: the columns it reads are gone after the first
# run. Re-create first_post before running it again.

# Strictly more than this many whitespace-separated words counts as "long";
# the same cut-off is used for posts and for responses.
LONG_TEXT_WORDS = 20


def _word_count(text):
    """Count whitespace-separated words; values without .split() count as 0.

    The guard matters for bodies that were missing and got replaced by the
    integer 0 in the fillna(0) step below.
    """
    return len(text.split()) if hasattr(text, 'split') else 0


def _response_word_counts(responses):
    """Return the word count of every response, or [] when there are none.

    A float is treated as "no responses" (presumably the NaN pandas stores
    when a comment has no response list -- confirm against the raw data).
    None and empty lists are handled the same way so the aggregations below
    never see an empty reduction.
    """
    if responses is None or isinstance(responses, float) or len(responses) == 0:
        return []
    return [_word_count(response) for response in responses]


def _avg_response_words(responses):
    """Mean number of words per response; 0 when there are no responses."""
    counts = _response_word_counts(responses)
    return float(sum(counts)) / len(counts) if counts else 0


def _has_long_response(responses):
    """1 if any response has more than LONG_TEXT_WORDS words, else 0."""
    counts = _response_word_counts(responses)
    return 1 if counts and max(counts) > LONG_TEXT_WORDS else 0


# Label / outcome variable
multi_post = first_post['total_posts'] > 1
first_post.drop(['total_posts', 'post_ids', 'post_datetimes', 'last_post_datetime'],
                axis=1, inplace=True)
# NOTE(review): this names the *index* 'multi_post', in place on the Index
# object. If that object is shared with first_post, its index gets the same
# name -- confirm this is intended; the Series itself is named on the next line.
multi_post.index.rename('multi_post', inplace=True)
multi_post.name = 'multi_post'

# Ups/downs
first_post.rename(columns={'first_post_ups': 'ups',
                           'first_post_downs': 'downs'}, inplace=True)
# 1 when the comment has a positive ups count, else 0 (missing ups -> 0)
first_post['has_ups'] = (first_post['ups'] > 0).astype(int)
#first_post['neg_ups'] = first_post['ups'].apply(lambda ups: 1 if ups < 0 else 0)
# bug: sample is missing downs

# Responses
first_post['responses_avg_word_ct'] = first_post['first_post_responses'].apply(_avg_response_words)
first_post['has_long_response'] = first_post['first_post_responses'].apply(_has_long_response)
first_post.rename(columns={'first_post_avg_response_ups': 'responses_ups_avg',
                           'first_post_avg_response_downs': 'responses_downs_avg',
                           'first_post_total_responses': 'responses_total'}, inplace=True)
first_post.drop(['first_post_responses', 'first_post_response_ups', 'first_post_response_downs'],
                axis=1, inplace=True)  # not doing text analysis for now
first_post.fillna(0, inplace=True)  # response stats are NaN if no responses
# responses_ups_avg is actually an interaction term multiplied by has_responses
first_post['has_responses'] = (first_post['responses_total'] > 0).astype(int)

# Body
first_post['word_count'] = first_post['first_post_body'].apply(_word_count)
first_post['long_post'] = (first_post['word_count'] > LONG_TEXT_WORDS).astype(int)

# Parent type
# 1 when parent_type is 't1', else 0
first_post['is_response'] = (first_post['parent_type'] == 't1').astype(int)

In [6]:
# Compare comments that got upvotes vs. no upvotes, among comments that are similar in:
# long_post
# has_responses
# has_long_response
# is_response
# date

In [7]:
# Percentage of people who got upvotes in each category
# For every binary feature: cross-tabulate it against has_ups, then divide each
# row by its own total so the two has_ups columns sum to 1 within a row.
for col in ['long_post','has_responses','has_long_response','is_response']:
    counts = pd.crosstab(first_post[col], first_post['has_ups'])
    # Cast to float before dividing so the shares are never integer-divided.
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print(counts.astype(float).div(counts.sum(axis=1), axis=0))
# Counterintuitive: more people got upvotes when they got no response
# To investigate: responses but no upvotes

has_ups           0         1
long_post                    
0          0.176225  0.823775
1          0.179275  0.820725
has_ups               0         1
has_responses                    
0              0.161852  0.838148
1              0.203942  0.796058
has_ups                   0         1
has_long_response                    
0                  0.169428  0.830572
1                  0.207979  0.792021
has_ups             0         1
is_response                    
0            0.202845  0.797155
1            0.148102  0.851898


In [8]:
# Rows of the "multi" sample whose first comment has exactly zero ups;
# only the comment body is kept (result is a Series).
multi_post_no_ups = reddit_pos.loc[reddit_pos['first_post_ups'] == 0, 'first_post_body']

In [9]:
# Rows of the "single" sample whose first comment has a positive ups count;
# keeps the body and the ups count (result is a two-column DataFrame).
single_post_ups = reddit_neg.loc[
    reddit_neg['first_post_ups'] > 0,
    ['first_post_body', 'first_post_ups'],
]
#single_post_ups['first_post_body'] = single_post_ups['first_post_body'].apply(lambda line: line.encode('utf-8'))
#single_post_ups.to_csv('single_post_ups.csv')

In [11]:
# Response, no upvotes
# Encode every body to UTF-8 bytes (presumably for the CSV export that is
# commented out below -- confirm before removing).
# Values that are already bytes are left alone so the cell can be re-run:
# encoding twice raises UnicodeDecodeError on Python 2 for non-ASCII text and
# AttributeError on Python 3 (bytes has no .encode).
first_post['first_post_body'] = first_post['first_post_body'].apply(
    lambda line: line if isinstance(line, bytes) else line.encode('utf-8')
)
#response_no_upvotes = first_post[ np.logical_and(first_post['has_responses']==1, first_post['has_ups']==0) ]['first_post_body']
#response_no_upvotes.to_csv('response_no_upvotes.csv')

In [12]:
# Overall share of rows with has_ups == 1 (has_ups is a 0/1 column)
first_post['has_ups'].mean()

0.82232733164345262

In [13]:
# Overall share of rows with has_responses == 1 (has_responses is a 0/1 column)
first_post['has_responses'].mean()

0.37588522748059688