In [139]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from sklearn import linear_model
from scipy.stats import pearsonr
%matplotlib inline

In [174]:
user_info = pd.read_csv("data/user-features")
question_info = pd.read_csv("data/question-features")
train_info = pd.read_csv("data/invited_info_train.txt", sep="\t", header=None, names=[
    "question_id", "user_id", "answered"
])

test_info = pd.read_csv("data/validate_nolabel.txt", sep=",", header=None, names=[
    "question_id", "user_id", "answered"
])

NUMBER_OF_USERS = len(user_info)
NUMBER_OF_QUESTIONS = len(question_info)

user_index = { }
for index, row in user_info.iterrows():
    user_index[ row['user_id'] ] = index

question_index = { }
for index, row in question_info.iterrows():
    question_index[ row['question_id'] ] = index
    


In [225]:
UNASKED = 0
IGNORED = -0.001

# Setting value for ignored
train_info.ix[train_info.answered == 0, 'answered'] = IGNORED

def pearsoncorr(x,y):
    return pearsonr(x, y)[ 0 ]

def featurize(user):
    v = np.repeat(0, NUMBER_OF_QUESTIONS)

    def update_vector(r):
        v[ question_index[r['question_id']] ] = r['answered']

    # Questions the user has answered
    train_info[train_info.user_id == user].apply(update_vector, axis=1)

    return v

# User-User CF
def recommend(question, user, K=3):
    qi = question_index[ question]

    # active user
    active_user = featurize(user)

    # users who've be asked this question
    users = train_info[train_info.question_id == question]
    # users who've answered this question
    users = users[users.answered == 1]['user_id']    
    user_vectors = map(featurize, users)

    # top K
    top_k = sorted(user_vectors, key=lambda x: pearsoncorr(active_user, x) )[ :K ]
    
    # predicted rating
    weighted_sum = reduce(lambda m, u: m + ((u[qi] - u.mean()) * pearsoncorr(active_user, u)), top_k, 0)
    sum_of_weights = reduce(lambda m, u: m + pearsoncorr(active_user, u), top_k, 0)
    
    if sum_of_weights == 0 or np.isnan(weighted_sum) or np.isnan(sum_of_weights):
        recommended = 0
    else:
        recommended = active_user.mean() + weighted_sum / sum_of_weights

    return recommended

In [226]:
recommendations = test_info.apply(lambda row: recommend(row['question_id'], row['user_id']), axis=1)

0
0
1.00088330744
0
0.997586586089
0.999607390005
0
0
0
0.99902977228
0.999876466955
0
0.99821145589
0
0.697817716702
1.00172959525
0.998526242971
0.621698522277
0.997158739963
0.998138559965
0
0.997232195886
0
0.993170988265
0.999194972123
0
0.999667616369
0.995570427226
0
0
0
0
0.998023471279
0
1.00154027045
0.995644919344
0
0.994529521162
1.000775508
0
0
0
0
0.998141632282
0
0
0
0
0
0
0
0
0
0
0.995894243945
1.00012353305
0
0
0.998981462358
0
0
0.998226641828
0.996825269022


KeyboardInterrupt: 

In [None]:
res = test_info[['question_id', 'user_id']].copy()
res['prediction'] = recommendations
res.columns = ['qid','uid','label']
res.to_csv('results.csv', sep=",", index=None)