In [55]:
import pandas as pd
import numpy as np
import json
from copy import deepcopy
import itertools
from collections import defaultdict

from datetime import datetime
from dateutil.parser import parse

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
matplotlib.rcParams['figure.dpi'] = 300

In [265]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import linear_model, model_selection, feature_extraction, ensemble

In [None]:
class G2GDat():
    '''
    Read in the scraped data, and prepare it for analysis.
    '''
    def __init__(self, user_file, question_file):
        with open(user_file) as json_data:
            self.Users = json.load(json_data)
            
        with open(question_file) as json_data:
            self.Questions = json.load(json_data)
            
            
    def peek(self, dictName):
        '''
        Look at a single item in a dictionary
        '''
        outDict = {}
        if dictName == 'users':
            key, val = self.Users.iteritems().next()
            outDict[deepcopy(key)]=deepcopy(val)
            return outDict
        elif dictName == 'questions':
            key, val = self.Questions.iteritems().next()
            outDict[deepcopy(key)] = deepcopy(val)
            return outDict        
        else:
            return 'I don\'t know how to do that.'
                
         
    def flattenQuestions(self):
        '''
        extract mean sentiment, and n-measures
        '''
        for quest,content in self.Questions.iteritems():
            try:
                summed_sent = sum(content['stats']['tone'])
                count = len(content['stats']['tone'])
                if count == 0:
                    summed_sent = 0.
                    mean_sent = 0.
                else:
                    mean_sent = summed_sent/float(count)
                self.Questions[quest]['stats']['summed_sent'] = summed_sent
                self.Questions[quest]['stats']['mean_sent'] = mean_sent
            except:
                self.Questions[quest]['stats'] = {'summed_sent':0., 'mean_sent':0., 'count':0}
            

    def calcUserPerQuestion(self):
        '''
        extract summed sentiment contribution per question for each user
        '''
        for user, uservals in self.Users.iteritems():
            questDict = {}
            i = 0
            for quest in uservals['questionIds']:
                if not quest in questDict:
                    questDict[quest]={'count':0., 'summed':0.}
                questDict[quest]['count'] += 1.
                questDict[quest]['summed'] += uservals['textSent'][i]
                i += 1
            self.Users[user]['questDict'] = questDict


    def calcUserResidual(self):
        '''
        extract user_deviation from question tone
        mean user-corrected question sentiment, and
        '''
        for user, uservals in self.Users.iteritems():
            u_n_vals = []
            u_sum_vals = []
            q_n_vals = []
            q_sum_vals = []
            for quest, val in uservals['questDict'].iteritems():
                u_n_vals += [(val['count'] + 0.)]
                u_sum_vals += [(val['summed'] + 0.)]
                questionData = self.Questions[quest]['stats']
                q_n_vals += [(questionData['count'] + 0.)]
                q_sum_vals += [(questionData['summed_sent'] + 0.)]
            #should be weighted by n-measures?
            #but will weight all qs the same, no matter how many utterances.
            #u_n_vals = [(1. if x < 1. else x) for x in u_n_vals]
            u_sent_means = np.divide(u_sum_vals, u_n_vals)
            q_corrected_n = np.asarray(q_n_vals) - np.asarray(u_n_vals)
            q_corrected_n[q_corrected_n < 1] = 1.
            q_corrected_sum = np.asarray(q_sum_vals) - np.asarray(u_sum_vals)
            q_corrected_means = np.divide(q_corrected_sum, q_corrected_n)
            u_delt_sent = u_sent_means - q_corrected_means
            mean_delt_sent = np.mean(u_delt_sent)
            mean_q_corrected = np.mean(q_corrected_means)
            self.Users[user]['mean_delt_sent'] = mean_delt_sent
            self.Users[user]['mean_q_corrected'] = mean_q_corrected
            self.Users[user]['mean_sent'] = np.mean(u_sent_means)
            ####################
            ## now do log-ratios
            n_texts = np.sum(u_n_vals)
            self.Users[user]['n_texts'] = n_texts
            trans_sent = (mean_delt_sent + 1.15) / 2.15 # min 0
            trans_sent = trans_sent * (n_texts / (1. + n_texts)) # correct for the number of texts
            self.Users[user]['trans_delt_sent'] = np.log(trans_sent / (1. - trans_sent))
            trans_q_adjust = (mean_q_corrected + 1.) / 2.
            trans_q_adjust = trans_q_adjust * (n_texts / (1. + n_texts))
            self.Users[user]['trans_q_adjust'] = np.log(trans_q_adjust / (1. - trans_q_adjust))
        
    def calcEvokedFeelings(self):
        '''
        Go through the questions, extract summed sentiment, and n-measures
        Go through the users, count per-question-input-number, and per-question-mean-sentiment 
            and also per-question-sentiment not counting the users' input.
        '''
        self.flattenQuestions()
        self.calcUserPerQuestion()
        self.calcUserResidual() 

        
    def convertG2Gcsv(self):
        '''
        process the syllable data and stuff to be simple metrics
        then convert to csv and save.
        '''
        q_data = self.Questions.copy()
        data = self.Users.copy()
        
        for key, value in q_data.iteritems():
            try:
                tone_list = value['stats'].pop('tone', None)
                mean_tone = np.mean(tone_list)
                value['mean_tone'] = mean_tone
                value['count'] = tone_list=value['stats'].pop('count', None)
            except:
                print key

        for key, value in data.iteritems():
            nSylls = np.array(value.pop('nSylls', None))
            textLen = np.array(value.pop('textLens', None))
            textLen = textLen+0.
            textSent = np.array(value.pop('textSent', None))

            textDay = str(value.pop('day', None))
            textMon = str(value.pop('mon', None))
            textYear = str(value.pop('year', None))
            date = textDay + ' ' + textMon + ' ' + textYear
            start_date = (datetime.now() - datetime.strptime(date, '%d %b %Y')).days
            value['start_date'] = start_date
            
            textDays = value.pop('days', None)
            textDays.sort(reverse=False)  
            textSum = np.sum(textLen)
            
            qD=value.pop('questDict', None)
            qID=value.pop('questionIds', None)
            value['contributions'] = float(value['contributions'].replace(',', ''))
            nTexts= value['n_texts']
            invTexts=1/nTexts #to avoid weird values near 0 screwing things up 
            textWeights=textLen/textSum

            textEarly=float(textDays[int(np.floor(nTexts*0.75))])
            textMid=float(textDays[int(np.floor(nTexts*0.5))])
            textLate=float(textDays[int(np.floor(nTexts*0.25))])

            weightedSent=np.sum(textSent*textWeights) #sentiment weighted by frequency
            scaledSent=(weightedSent+1)/2
            sentRat=(scaledSent+invTexts)/(1-scaledSent+invTexts)
            logSentRatio=np.log(sentRat) #for regression

            textComplexity=np.mean(nSylls/textLen)# average ratio of syllables-by-length
            meanTextLength=np.mean(textLen)
            
            commit_length = textEarly - textLate
            if commit_length < 0:
                commit_length = 0
            
            # positive vs negative
            get_down = int(value['getDown'].replace(',', ''))
            get_up = int(value['getUp'].replace(',', ''))
            give_down = int(value['giveDown'].replace(',', ''))
            give_up = int(value['giveUp'].replace(',', ''))
            value['pos_get'] = np.log((get_up + 1.) / (get_down + 1.))
            value['pos_give'] = np.log((give_up + 1.) / (give_down + 1.))
                
            # persistence
            value['commitLength'] = np.log(commit_length + 1.)
            # higher values of text_persist should mean more consistent, or increased participation over time
            time_num = textEarly - textMid
            if time_num < 0:
                time_num = 0
            time_denom = textEarly - textLate
            if time_denom < 0:
                time_denom = 0
            value['textPersist'] = np.log((time_num + 1.) / (time_denom + 1.))
            value['active'] = textDays[(len(textDays) -1)]

            # give vs get
            answers = int(value['nAnswers'].replace(',', ''))
            questions = int(value['nPosts'].replace(',', ''))
            comments = int(value['nComments'].replace(',', ''))
            value['asks_vs_answers'] = np.log((questions + 1.) / (answers + 1.))
            give_votes = give_up + give_down
            get_votes = get_up + get_down
            value['give_vs_get_votes'] = np.log((give_votes + 1.) / (get_votes + 1.))
            
            # productivity
            writs = answers + questions + comments
            value['scaled_writs'] = np.log((writs + 1.) / (commit_length + 1.0)) # writs: total g2g contributions
            value['scaled_texts'] = np.log((nTexts + 1.) / (commit_length + 1.0)) # texts: g2g contributions I counted.
            value['scaled_thanks'] = np.log((float(value['thanks'].replace(',', '')) + 1.) / (commit_length + 1.0))
            value['log_contributions'] = np.log(value['contributions'] + 1.)
            value['scaled_contributions'] = np.log((value['contributions'] + 1.) / (commit_length + 1.0))
            value['logSentRatio'] = logSentRatio
            value['textComplexity'] = textComplexity
            value['meanTextLength']= meanTextLength

        pdData=pd.DataFrame.from_dict(data, orient='index', )
        pdData.to_csv('userData2.csv')   
        return pdData
    
data = G2GDat('userData_trans.txt', 'questionData.txt')
data.calcEvokedFeelings()
user_dat = data.convertG2Gcsv()

The dataset, for the Q&A board, and the site as a whole collected covers several major domains:

* __Productivity__ How much did a user contribute? For how long?
    * ```commitLength``` : how long (days) was their active period (defined as days between first and third quartiles of posts)?
    * ```textPersist``` : how many contributions do they make in the latter portion of their active period relative to the first? 
    * ```scaled_writs``` : rate of production of questions, answers, comments (G2G).
    * ```log_contributions``` : the overall log number of contributions made.
    * ```scaled_contributions``` : how much genealogical work did they do, scaled by the length of their tenure?
* __Positivity__ How positive was a user, and how much positivity did they elicit?
    * ```logSentRatio``` : how positive are a user's texts?
    * ```pos_give``` : what ratio of upvotes does a user give?
    * ```pos_get``` : what ratio of upvotes does a user get? 
    * ```trans_delt_sent``` : how much more positive is a focal user than the other people they're interacting with, across all the posts they've contributed to?
    * ```trans_q_adjust``` : after subtracting a focal user's contribution, how positive is the sentiment of the posts they contribute to?
* __Helping__ How much did a user demand of others, or offer assistance?
    * ```asks_vs_answers``` : how many questions asked, relative to help offered.
    * ```scaled_thanks``` : how many thanks did a user receive?
    * ```give_vs_get_votes``` : how many questions and responses did they vote on, vs receiving votes?
* __Sophistication__ How 'smart' was a user, judging by their texts?
    * ```textComplexity``` : how many syllables are the words they use on average (divided by text length (characters).
    * ```meanTextLength``` : how long (characters) was their mean text?
    
Note that 'scaled' means scaled by the duration of the user's active period (not the period since they opened the account). All proportions were logit transformed, and all counts and ratios were log transformed (after ensuring this was appropriate, given the empirical distributions).

In [309]:
colnames = ['start_date','commitLength','textPersist', 'scaled_writs', 'scaled_contributions', 'log_contributions',
                    'logSentRatio',
                   'pos_give', 'pos_get', 'trans_delt_sent', 'trans_q_adjust','asks_vs_answers', 'scaled_thanks', 
                   'give_vs_get_votes', 'textComplexity', 'meanTextLength']

X = user_dat.loc[((user_dat['n_texts'] > 1) & (user_dat['contributions'] > 10)),colnames]
len(X)


In [None]:
# Diagnostic plot 1
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.tight_layout() # Or equivalently,  "plt.tight_layout()"
fig_list = ['scaled_contributions', 'trans_q_adjust', 'logSentRatio','textComplexity']
fig_captions = ['contributions (scaled)', 'evoked positivity', 'positive sentiment (log)', 'text complexity']
for i, fig in enumerate(fig_list):
    print i
    plt.subplot(2,2, (i+1))
    plt.xlabel(fig_captions[i])
    plt.hist(X[fig], bins=50, color = def_color)
    plt.subplots_adjust(wspace = 0.3) 
    plt.subplots_adjust(hspace = 0.5) 
    if i == 3:
        plt.xticks(np.arange(0.05, 0.55, 0.1))
plt.show()

In [None]:
# Diagnostic plot 2
colnames = ['start_date','commitLength','textPersist', 'scaled_writs', 'scaled_contributions', 'log_contributions',
                    'logSentRatio',
                   'pos_give', 'pos_get', 'trans_delt_sent', 'trans_q_adjust','asks_vs_answers', 'scaled_thanks', 
                   'give_vs_get_votes', 'textComplexity', 'meanTextLength']

fig, axes = plt.subplots(nrows=2, ncols=2)
def_color = sns.color_palette("Blues_d")[2]
ax = sns.regplot(x='asks_vs_answers', y='scaled_contributions', data=X, order = 1, ax = axes[0,0], color =  def_color)
ax.set(xlabel='asks vs answers', ylabel='scaled contributions')
ax = sns.regplot(x='logSentRatio', y='scaled_contributions', data=X, order = 1, ax = axes[0,1], color =  def_color)
ax.set(xlabel='sentiment ratio (log)', ylabel='scaled contributions')
ax = sns.regplot(x='give_vs_get_votes', y='scaled_contributions', data=X, order = 1, ax = axes[1,0], color =  def_color)
ax.set(xlabel='give vs get votes', ylabel='scaled contributions')
ax = sns.regplot(x=np.log(X['meanTextLength']), y=X['scaled_contributions'], order = 1, ax = axes[1,1], color =  def_color)
ax.set(xlabel='mean text length', ylabel='scaled contributions')

plt.subplots_adjust(wspace = 0.5) 
plt.subplots_adjust(hspace = 0.5) 


In [None]:
## Pipeline for productivity

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
colnames2 = ['textPersist', 'scaled_writs', 'logSentRatio','pos_give', 'pos_get', 
             'trans_delt_sent', 'trans_q_adjust','asks_vs_answers', 'scaled_thanks', 
             'give_vs_get_votes', 'textComplexity', 'meanTextLength']

#X = user_dat.loc[((user_dat['n_texts'] > 1) & (user_dat['contributions'] > 10)),colnames]
scale = StandardScaler()
X2 = X.loc[(X['start_date'] > 700), colnames2]
scaled_contributions = np.array(X.loc[(X['start_date'] > 700), 'scaled_contributions'])

X_train, X_test, y_train, y_test = model_selection.train_test_split(
X2, scaled_contributions, test_size=0.3, random_state=42)

production_pipeline = Pipeline([
    ('rescale', StandardScaler()),
    ('pca', PCA(12)),
    ('random_forest', ensemble.RandomForestRegressor(n_estimators = 1000, min_samples_split = 10))
    ])

production_pipeline.fit(X_train, y_train)
print production_pipeline.score(X_test, y_test)

pd.DataFrame(production_pipeline.named_steps['pca'].components_.T)

In [None]:
### GridSearchCV to optimize pipeline
dims_grid = {'pca__n_components':np.array(range(1,13)),}

grid = GridSearchCV(estimator=production_pipeline, param_grid=dims_grid)
grid_result=grid.fit(X2, log_contributions)

print grid_result.best_estimator_

In [None]:
#Diagnostic plot 3
colnames = ['start_date','commitLength','textPersist', 'scaled_writs', 'scaled_contributions', 'commitLength',
                    'logSentRatio',
                   'pos_give', 'pos_get', 'trans_delt_sent', 'trans_q_adjust','asks_vs_answers', 'scaled_thanks', 
                   'give_vs_get_votes', 'textComplexity', 'meanTextLength']

fig, axes = plt.subplots(nrows=2, ncols=2)
def_color = sns.color_palette("Blues_d")[2]
ax = sns.regplot(x='asks_vs_answers', y='trans_q_adjust', data=X, order = 1, ax = axes[0,0], color =  def_color)
ax.set(xlabel='asks vs answers', ylabel='evoked positivity')
ax = sns.regplot(x='scaled_contributions', y='trans_q_adjust', data=X, order = 1, ax = axes[0,1], color =  def_color)
ax.set(xlabel='contributions (scaled)', ylabel='evoked positivity')
ax = sns.regplot(x='logSentRatio', y='trans_q_adjust', data=X, order = 1, ax = axes[1,0], color =  def_color)
ax.set(xlabel='positive sentiment (log)', ylabel='evoked positivity')
ax = sns.regplot(x='textComplexity', y=X['trans_q_adjust'], data=X, order = 1, ax = axes[1,1], color =  def_color)
ax.set(xlabel='text complexity', ylabel='evoked positivity')

plt.subplots_adjust(wspace = 0.5) 
plt.subplots_adjust(hspace = 0.5) 


In [274]:
## Pipeline for evoked positivity
colnames3 = ['scaled_contributions','textPersist', 'scaled_writs', 'logSentRatio','pos_give', 'pos_get', 
             'trans_delt_sent','asks_vs_answers', 'scaled_thanks', 
             'give_vs_get_votes', 'textComplexity', 'meanTextLength']

#X = user_dat.loc[((user_dat['n_texts'] > 1) & (user_dat['contributions'] > 10)),colnames]
scale = StandardScaler()
X3 = X.loc[(X['start_date'] > 700), colnames3]
trans_q_adjust = np.array(X.loc[(X['start_date'] > 700), 'trans_q_adjust'])
X_train, X_test, y_train, y_test = model_selection.train_test_split(
X3, trans_q_adjust, test_size=0.3, random_state=42)

pipeline_sentiment = Pipeline([
    ('rescale', StandardScaler()),
    ('pca', PCA(11)),
    #('linear', linear_model.LinearRegression())
    ('random_forest', ensemble.RandomForestRegressor(n_estimators = 1000, min_samples_split = 10))
    #('ridge', linear_model.Ridge(alpha=1.0))
    ])

pipeline_sentiment.fit(X_train, y_train)
print pipeline_sentiment.score(X_test, y_test)

In [None]:
#dims_grid = {'pca__n_components':np.array(range(1,13)),'ridge__alpha':np.logspace(-4., 0, 20)}
dims_grid = {'pca__n_components':np.array(range(1,13)),}

grid = GridSearchCV(estimator=pipeline_sentiment, param_grid=dims_grid)
grid_result=grid.fit(X3, trans_q_adjust)

print grid_result.best_estimator_

In [288]:
# To examine the contents of the pipeline
pd.DataFrame(pipeline_sentiment.named_steps['pca'].components_.T)
pipeline_sentiment.named_steps['linear'].coef_

array([-2.55526825e-02,  1.31339884e-01, -4.10111307e-02, -7.42468723e-05,
        3.88204958e-02, -5.00563585e-02, -8.55702074e-03, -3.25474439e-02,
        1.04663282e-01, -7.74811122e-03,  9.39746629e-03,  5.14963696e-02])