In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
from scipy.optimize import minimize
import sklearn.preprocessing as preprocessing

In [2]:
def return_csv_files(path):
    pattern = r'^.*\.csv$'
    return [f for f in os.listdir(path) if bool(re.match(pattern,f))]
    
path = 'data/backlog/'
csv_files = return_csv_files(path)
df = pd.DataFrame()
for csv in csv_files:
    csv_df = pd.read_csv(path + csv)
    df = pd.concat([df,csv_df])
df['created'] = pd.to_datetime(df.created, unit = 's')
df.index = range(0,len(df))
# 18,001 total examples with no duplicates

In [3]:
''' 
Randomly shuffle the dataframe and generates a training, cross-validation, and test set
'''
random_df = df.reindex(np.random.permutation(df.index))
training_df = random_df.iloc[0:12000]
cross_validation_df = random_df.iloc[12000:15000]
test_df = random_df.iloc[15000:]

In [4]:
def sigmoid(z):
    return np.power(1 + np.exp(-z), -1)

def logistic_cost_regularized(theta, Xdata, Ydata, lam=1):
    # NB Xdata must be a numpy array

    m = float(len(Xdata))
    
    J = (
        - 1/m * (np.log(sigmoid(Xdata.dot(theta))).dot(Ydata) + 
        np.log(1-sigmoid(Xdata.dot(theta))).dot(1-Ydata) ) + 
        lam/(2*m) * np.sum(theta[1:] ** 2) 
        )
    if np.isnan(J):
        return(np.inf)
    return(J)

def logistic_grad_regularized(theta, Xdata, Ydata, lam=1):
    # NB Xdata must be a numpy array
    m = float(len(Xdata))
    
    grad = theta.copy() * lam
    grad[0] = 0
    grad += Xdata.T.dot(sigmoid(Xdata.dot(theta))-Ydata)
    grad *= 1/m 
    
    return grad 

In [5]:
def softmax(X_data, Y_data, num_Y):
    poly = preprocessing.PolynomialFeatures(2)
    X_Transformed = poly.fit_transform(X_data.values.reshape(len(X_data),1))
    scaler = preprocessing.StandardScaler().fit(X_Transformed)
    X_Transformed = scaler.transform(X_Transformed) 
    softmax_weights = []
    for i in range(len(num_Y)+1):
        Y = Y_data.apply(lambda d: 1 if d == i else 0).values
        theta = np.zeros((X_Transformed.shape[1],))
        res = minimize(logistic_cost_regularized, theta, args=(X_Transformed,Y, 1)
                       , method = 'Nelder-Mead', jac=logistic_grad_regularized, options={'maxiter':400})
        softmax_weights.append(res.x)
    return softmax_weights

def predict_values(X_Data, thetas, weights):
    poly = preprocessing.PolynomialFeatures(2)
    X_Transformed = poly.fit_transform(X_Data.values.reshape(len(X_Data),1))
    scaler = preprocessing.StandardScaler().fit(X_Transformed)
    X_Transformed = scaler.transform(X_Transformed) 
    hyp = []
    for theta in thetas:
        hyp.append(X_Transformed.dot(theta))
    probs = []
    for h in np.array(hyp).T:
        num = np.exp(h)
        den = np.exp(h).sum()
        num *= weights
        probs.append(num/den)
    return probs

In [6]:
def normalize(column):
    mean = np.mean(column)
    sd = np.std(column)
    return column.apply(lambda d: (d-mean)/sd)

def bucketize(separators, num):
    for s in range(len(separators)):
        if num < separators[s]:
            return s
    return len(separators)

def accuracy(predicted_Y, Y):
    return sum(predicted_Y == Y)/float(len(predicted_Y))

def accuracy_on_set(df, theta, weights, cutoffs): 
    data = df[['num_comments', 'score']]
    X_data = data.num_comments
    Y_data = data.score.apply(lambda d: bucketize(cutoffs,d))
    prob_Y = pd.DataFrame(predict_values(X_data, thetas, weights))
    predicted_Y = prob_Y.apply(lambda d: d.argmax(), axis=1).rename('predicted')
    prob_Y.index = Y_data.index
    predicted_Y.index = Y_data.index
    #comparison = pd.concat([prob_Y, predicted_Y, Y, training_df.num_comments], axis=1)
    return accuracy(predicted_Y,Y_data)

In [7]:
'''
Groups the labels into 4 karma categories: low, medium-low, medium-high, high
    cutoffs(var) is the karma score cutoffs for the groups
    softmax(func) runs the data through a softmax regression model and returns the training weights for each group
    weights(var) are the weights that are apply on the probabilities to return an accurate cutoff
    accuracy_on_set(func) predicts the Y from the model and compares it to the actual Y and returns a prediction
'''

cutoffs = [50, 150, 350]
weights = np.array([.50,1.1,1.1,1.])

data = training_df[['num_comments', 'score']]
X_data = data.num_comments
Y_data = data.score.apply(lambda d: bucketize(cutoffs,d))
thetas = softmax(X_data, Y_data, cutoffs)



In [8]:
print 'Training Accuracy:', accuracy_on_set(training_df, thetas, weights, cutoffs)
print 'Validation Accuracy:', accuracy_on_set(cross_validation_df, thetas, weights, cutoffs)
print 'Test Accuracy:', accuracy_on_set(test_df, thetas, weights, cutoffs)

Training Accuracy: 0.634833333333
Validation Accuracy: 0.625
Test Accuracy: 0.676441186271


In [9]:
data = training_df[['num_comments', 'score']]
X_data = data.num_comments
Y_data = data.score.apply(lambda d: bucketize(cutoffs,d))
prob_Y = pd.DataFrame(predict_values(X_data, thetas, weights))
predicted_Y = prob_Y.apply(lambda d: d.argmax(), axis=1).rename('predicted')
predicted_Y.value_counts()

0    7320
3    2014
1    1846
2     820
Name: predicted, dtype: int64