In [32]:
import numpy as np, matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import json
import sys
from scipy import stats
from sklearn.cross_validation import *

%matplotlib inline

In [61]:
smalldf = pd.read_csv("small.csv")

In [62]:
smalldf.head()

Unnamed: 0,bID,pID,AB,H,PA,RL,SAC,SO,TB,W,matchID,AVG,OBP,ov_AB,ov_H,ov_PA,ov_SAC,ov_SO,ov_TB,ov_W,ov_FACED,ov_AVG,ov_OBP,ov_SO_PCT,ov_W_PCT,ov_H_PCT,ovp_AB,ovp_H,ovp_PA,ovp_SAC,ovp_SO,ovp_TB,ovp_W,ovp_FACED,ovp_SO_PCT,ovp_W_PCT,ovp_H_PCT,ovp_AVG,ovp_OBP
0,schic002,benea001,9,0,9,R,1,4,0,0,schic002_benea001,0.0,0.0,146,22,150,14,57,30,4,14,0.150685,0.173333,0.38,0.026667,0.146667,3046,827,3425,46,609,1713,379,136,0.17781,0.110657,0.24146,0.271504,0.352117
1,schic002,browk001,11,1,11,R,0,6,1,0,schic002_browk001,0.090909,0.090909,146,22,150,14,57,30,4,14,0.150685,0.173333,0.38,0.026667,0.146667,4582,1159,4989,47,892,1974,407,222,0.178793,0.081579,0.232311,0.252946,0.313891
2,schic002,fassj001,11,2,11,L,1,5,4,0,schic002_fassj001,0.181818,0.181818,146,22,150,14,57,30,4,14,0.150685,0.173333,0.38,0.026667,0.146667,3006,847,3357,40,582,1587,351,185,0.173369,0.104558,0.252309,0.28177,0.356866
3,schic002,glavt001,14,3,14,L,1,3,5,0,schic002_glavt001,0.214286,0.214286,146,22,150,14,57,30,4,14,0.150685,0.173333,0.38,0.026667,0.146667,7561,2115,8413,102,985,3852,852,280,0.117081,0.101272,0.251397,0.279725,0.352668
4,schic002,hampm001,16,2,16,L,1,6,2,0,schic002_hampm001,0.125,0.125,146,22,150,14,57,30,4,14,0.150685,0.173333,0.38,0.026667,0.146667,4782,1361,5339,53,666,2355,557,232,0.124742,0.104327,0.254917,0.284609,0.359243


In [21]:
def compute_supports(df):
    ubids=df.bID.unique()
    pitch = df.groupby('pID').bID.unique()
    bdict={}
    for e,v in zip(pitch.index.values, pitch.values):
        bdict[e] = np.array([item in v for item in ubids])
    pitchers=bdict.keys()
    supports=[]
    for i,p1 in enumerate(pitchers):
        for j,p2 in enumerate(pitchers):
            if  i < j:
                supmask = (bdict[p1] & bdict[p2])
                common_batters = np.sum(supmask)
                supports.append(common_batters)
    print "mean support",np.mean(supports), "median support", np.median(supports)
    return supports, bdict

In [37]:
def recompute_frame(ldf):
    """
    takes a dataframe ldf, makes a copy of it, and returns the copy
    with all conglomerations recomputed
    this is used when a frame is subsetted.
    """
    ldfb=ldf.groupby('bID')
    ldfp=ldf.groupby('pID')
    nldf=ldf.copy()
    #Conglomerate pitcher stats
    nldf.set_index(['pID'], inplace=True)
    for col in ['AB', 'PA', 'H', 'TB', 'SAC', 'SO', 'W']:
        nldf['ovp_'+col] = ldfp[col].sum()
    nldf['ovp_AVG'] = nldf['ovp_H']/nldf['ovp_AB']
    nldf['ovp_FACED']= ldfp.AB.count()
    nldf['ovp_OBP'] = (nldf['ovp_H'] + nldf['ovp_W'])/nldf['ovp_PA']
    for col in ['SO', 'W', 'H']:
        nldf['ovp_' + col + '_PCT'] = nldf['ovp_' + col] / nldf['ovp_PA']
    nldf.reset_index(inplace=True)
    
    #Conglomerate batter stats
    nldf.set_index(['bID'], inplace=True)
    for col in ['AB', 'PA', 'H', 'TB', 'SAC', 'SO', 'W']:
        nldf['ov_'+col] = ldfb[col].sum()
    nldf['ov_AVG'] = nldf['ov_H']/nldf['ov_AB']
    nldf['ov_FACED']= ldfb.AB.count()
    nldf['ov_OBP'] = (nldf['ov_H'] + nldf['ov_W'])/nldf['ov_PA']
    for col in ['SO', 'W', 'H']:
        nldf['ov_' + col + '_PCT'] = nldf['ov_' + col] / nldf['ov_PA']
    nldf.reset_index(inplace=True)
    return nldf

In [23]:
def make_smaller(df, pacountp, pacountb):
    smallidf1=df[(df.ovp_PA > pacountp) & (df.ov_PA > pacountb)]
    smalldf=recompute_frame(smallidf1)
    return smalldf

In [24]:
def make_smaller_2(df, col, limit, greater=True):
    if greater:
        smallidf = df[(df[col] > limit)]
    else:
        smallidf = df[(df[col] < limit)]
    return recompute_frame(smallidf)

Split into test and training sets

In [64]:
print smalldf.shape,
smalldf[smalldf.ov_FACED > 100].shape

(109688, 39)

(88234, 39)




In [65]:
trainlist=[]
testlist=[]
validatelist=[]
take=21 #21 matchups between validation and test set
for k, v in smalldf.groupby('bID'):
    if len(v) > 100: #batter has faced at least 150 pitchers
        train_rows, test_valid_rows = train_test_split(v, test_size=take)
        trainlist.append(train_rows)
        valid_rows, test_rows = train_test_split(test_valid_rows, test_size=0.4)
        validatelist.append(valid_rows) 
        testlist.append(test_rows) 
    else:
        trainlist.append(v)
traindf=pd.concat(trainlist)
validatedf=pd.concat(validatelist)
testdf=pd.concat(testlist)
print traindf.shape, validatedf.shape, testdf.shape

(101036, 39) (4944, 39) (3708, 39)


In [66]:
#Make sure each pitcher ID was encountered in training set
maskval= np.in1d(validatedf.pID, traindf.pID) 
masktest = np.in1d(testdf.pID, traindf.pID)
print np.sum(~maskval), np.sum(~masktest)

0 0


In [67]:
traindf=recompute_frame(traindf)
validatedf=recompute_frame(validatedf)
testdf=recompute_frame(testdf)
validatedf=validatedf[['bID', 'pID','AVG']]
testdf=testdf[['bID', 'pID', 'AVG']]
traindf.head()

Unnamed: 0,bID,pID,AB,H,PA,RL,SAC,SO,TB,W,matchID,AVG,OBP,ov_AB,ov_H,ov_PA,ov_SAC,ov_SO,ov_TB,ov_W,ov_FACED,ov_AVG,ov_OBP,ov_SO_PCT,ov_W_PCT,ov_H_PCT,ovp_AB,ovp_H,ovp_PA,ovp_SAC,ovp_SO,ovp_TB,ovp_W,ovp_FACED,ovp_SO_PCT,ovp_W_PCT,ovp_H_PCT,ovp_AVG,ovp_OBP
0,aberb001,bellr003,9,1,10,R,0,0,1,1,aberb001_bellr003,0.111111,0.2,359,78,385,4,41,120,26,30,0.21727,0.27013,0.106494,0.067532,0.202597,1460,443,1648,19,210,1004,188,125,0.127427,0.114078,0.268811,0.303425,0.382888
1,aberb001,buehm001,13,2,14,L,0,3,6,1,aberb001_buehm001,0.153846,0.214286,359,78,385,4,41,120,26,30,0.21727,0.27013,0.106494,0.067532,0.202597,9014,2506,9658,126,1316,4670,644,414,0.13626,0.06668,0.259474,0.278012,0.326154
2,aberb001,burkj001,8,2,9,R,0,1,2,1,aberb001_burkj001,0.25,0.333333,359,78,385,4,41,120,26,30,0.21727,0.27013,0.106494,0.067532,0.202597,3889,1124,4252,52,644,2135,363,211,0.151458,0.085372,0.264346,0.28902,0.349718
3,aberb001,castf001,7,1,11,R,0,0,1,4,aberb001_castf001,0.142857,0.454545,359,78,385,4,41,120,26,30,0.21727,0.27013,0.106494,0.067532,0.202597,2101,600,2329,29,335,1210,228,143,0.143839,0.097896,0.257621,0.285578,0.355517
4,aberb001,clemr001,25,7,26,R,1,2,9,1,aberb001_clemr001,0.28,0.307692,359,78,385,4,41,120,26,30,0.21727,0.27013,0.106494,0.067532,0.202597,6218,1515,6880,63,1470,2828,662,265,0.213663,0.096221,0.220203,0.243647,0.316424


In [70]:
ybar = traindf.H.sum() / float(traindf.AB.sum())
ybar

0.27389865959721776

In [71]:
bIDs=traindf.bID.unique()#unique-user-ids
pIDs=traindf.pID.unique()#unique-item-ids

In [72]:
bidmap={v:k for k,v in enumerate(bIDs)}#of length U
pidmap={v:k for k,v in enumerate(pIDs)}#of length M

Calculate dictionaries of pitcher and batter biases

In [None]:
batter_biases = {}
pitcher_biases = {}

for u_index, u_id in enumerate(bids):
    batter_rows = traindf[traindf.bID == b_id]
    y_u = batter_rows.user_avg.unique()[0]
    u_baseline = y_u - ybar
    user_biases[u_id] = u_baseline
    
for i_index, i_id in enumerate(uiids):
    item_rows = traindf[traindf.business_id == i_id]
    y_i = item_rows.business_avg.unique()[0]
    i_baseline = y_i - ybar
    item_biases[i_id] = i_baseline

Compare results function from pset 4

In [25]:
def compare_results(stars_actual, stars_predicted, ylow=1, yhigh=6, model="", predicteds=False, onsame=False, axis=False):
    """
    plot predicted results against actual results. Takes 2 arguments: a
    numpy array of actual ratings and a numpy array of predicted ratings
    scatterplots the predictions, a unit slope line, line segments joining the mean,
    and a filled in area of the standard deviations."
    """
    if onsame:
        ax=onsame
    elif axis:
        ax=axis
    else:
        fig=plt.figure()
        ax=plt.gca()
    df=pd.DataFrame(dict(actual=stars_actual, predicted=stars_predicted))
    xp=[]
    yp=[]
    for k,v in df.groupby('actual'):
        xp.append(k)
        yp.append(v.predicted.mean())        
    cl, = ax.plot(xp,yp, 's-', label="means for %s" % model)
    c=cl.get_color()
    sig=df.groupby('actual').predicted.std().values
    ax.fill_between(xp, yp - sig, yp + sig, color=c, alpha=0.2)
    if predicteds:
        ax.plot(df.actual, df.predicted, '.', color=c, alpha=0.1, label="predicted for %s" % model)

    if not onsame:
        ax.plot([1,5],[1,5], 'k', label="slope 1")
        ax.set_xlabel("actual")
        ax.set_ylabel("predicted")
        ax.set_ylim([ylow,yhigh])
        ax.set_xlim([0.9, 5.1])
    ax.legend(frameon=False, loc="upper left")
    rmse=get_rmse(stars_actual, stars_predicted)
    print "RMSE for %s" % model, rmse
    return ax,rmse

Below is an idea for a function that can be used to determine batter similarity. The function takes the batter-pitcher pair to predict, and it returns similar batters who have faced the given pitcher.

In [17]:
def sim_batters(bID, pID, df):
    batter_row = df.loc[df['bID'] == bID]
    AVG = batter_row['AVG']
    OBP = batter_row['OBP']
    if (batter_row['RL'] == 'R'):
        hand = 0
    else:
        hand = 1
    SAC = batter_row['SAC']
    # continue for other statistics
    
    # find list of bIDs who have faced given pID
    p_batters = df.loc[df['pID'] == pID]
    
    similarity = {}
    # for each batter, calculate similarity
    for b in p_batters:
        comp_b_row = df.loc[df['bID'] == b]
        comp_AVG = batter_row['AVG']
        comp_OBP = batter_row['OBP']
        if (batter_row['RL'] == 'R'):
            comp_hand = 0
        else:
            comp_hand = 1
        comp_SAC = batter_row['SAC']
        # continue for other statistics
        
        #calculate similarity
        AVG_score = AVG_weight*(AVG - comp_AVG)/AVG
        OBP_score = OBP_weight*(OBP - comp_OPB)/OBP
        if (hand == comp_hand):
            hand_score = hand_weight
        else:
             hand_score = 0   
        SAC_score = SAC_weight*(SAC - comp_SAC)/SAC
        total_score = AVG_score + OBP_score + hand_score + SAC_score
        similarity[b] = total_score
    # return top x bIDs with lowest scores
    sorted_sim = sorted(similarity.items(), key=operator.itemgetter(1))
    sim_batters = sorted_sim.keys
    return sim_batters[:10]
    

Below are functions from pset 4 that might be useful for the KNN model

In [40]:
from scipy.stats.stats import pearsonr
# calculate pearson sim between 2 pitchers
def pearson_sim(pitcher1, pitcher2, n_common):
    p1_avg = pitcher1['ov_AVG']
    p2_avg = pitcher2['ov_AVG']
    p1_actual = pitcher1['AVG']
    p2_actual = pitcher2['AVG']
    norm1 = []
    norm2 = []
    for i in range(len(rest1)):
        norm1.append(rest1[i] - p1_actual[i])
        norm2.append(rest2[i] - p2_actual[i])
    if (n_common == 0) or (n_common==1):
        rho = 0;
    else:
        rho = sp.stats.pearsonr(norm1,norm2)[0]
    if np.isnan(rho):
        return 0;
    return rho

In [41]:
def get_restaurant_reviews(pID, df, set_of_batters):
    """
    given a pitcher id and a set of batters, return the sub-dataframe of their
    averages.
    """
    mask = (df.user_id.isin(set_of_batters)) & (df.pID==pID)
    avgs = df[mask]
    avgs = avgs[avgs.user_id.duplicated()==False]
    return avgs

In [42]:
class Database:
    "A class representing a database of similarities and common supports"
    
    def __init__(self, rindexmap, supports):
        "the constructor, takes a map of restaurant id's to integers"
        database={}
        self.rindexmap=rindexmap
        self.supports=supports
        l_keys=len(self.rindexmap.keys())
        self.database_sim=np.zeros([l_keys,l_keys])
        self.database_sup=np.zeros([l_keys, l_keys], dtype=np.int)

    def set_supports(self, supports):
        self.supports=supports
        
    def get(self, b1, b2):
        "returns a tuple of similarity,common_support given two business ids"
        sim=self.database_sim[self.rindexmap[b1]][self.rindexmap[b2]]
        nsup=self.database_sup[self.rindexmap[b1]][self.rindexmap[b2]]
        return (sim, nsup)

In [43]:
db=Database(uiidmap, supports)

NameError: name 'uiidmap' is not defined

In [44]:
def mapper1(row):
    return row[1], (row[2], row[5], row[14])

In [45]:
def combiner(items):
    indict={}
    for key, value in items:
        if not indict.has_key(key):
            indict[key]=[]
        indict[key].append(value)
    return indict.items()

In [46]:
def reducer1(the_input):
    bID, values = the_input
    avgs=[]
    for pID,AVG,ov_AVG in values:
        avgs.append((pID,(AVG, ov_AVG)))
    return bID, avgs

In [47]:
from itertools import combinations_with_replacement
def mapper2(list_input):
    nlist = []
    comb = list(combinations_with_replacement(list_input[1], 2))
    for item in comb:
        if item[0][0] > item[1][0]:
            biz_pair = item[1][0], item[0][0]
            star_pair = item[1][1], item[0][1]
        else:
            biz_pair = item[0][0], item[1][0]
            star_pair = item[0][1], item[1][1]
        tup = (biz_pair, star_pair)
        nlist.append(tup)
    return nlist

In [48]:
def combiner_list(itemslist):
    indict={}
    for items in itemslist:
        for key, value in items:
            if not indict.has_key(key):
                indict[key]=[]
            indict[key].append(value)
    return indict.items()

In [49]:
def reducer2(item_dict):
    p1_id = item_dict[0][0]
    p2_id = item_dict[0][1]
    AVG_1 = [x[0][0] for x in item_dict[1]]
    AVG_2 = [x[1][0] for x in item_dict[1]]
    ov_AVG_1 = [x[0][1] for x in item_dict[1]]
    ov_AVG_2 = [x[1][1] for x in item_dict[1]]
    n_common = len(item_dict[1])
    p1_dict = {'ov_AVG': ov_AVG_1, 'AVG': AVG_1}
    p2_dict = {'ov_AVG': ov_AVG_2, 'AVG': AVG_2}
    rho = pearson_sim(p1_dict, p2_dict, n_common)
    return (p1_id, p2_id),(rho, n_common)

In [50]:
def map_reduce(tuples):
    mapped1=map(mapper1, tuples)
    combine1=combiner(mapped1)
    reduced1=reduce(lambda x,y: x + [reducer1(y)], combine1, [])
    mapped2=map(mapper2,reduced1)
    combine2=combiner_list(mapped2)
    output=reduce(lambda x,y: x + [reducer2(y)], combine2, [])
    return output

In [51]:
tuples=traindf.itertuples()
sims=map_reduce(tuples)

KeyboardInterrupt: 

In [52]:
def populate_from_mr(db, df, mapredlist):
    for tpair,vpair in mapredlist:
        i1=db.rindexmap[tpair[0]]
        i2=db.rindexmap[tpair[1]]
        db.database_sim[i1][i2]=vpair[0]
        db.database_sup[i1][i2]=vpair[1]
        db.database_sim[i2][i1]=vpair[0]
        db.database_sup[i2][i1]=vpair[1]

In [None]:
populate_from_mr(db, traindf, sims)

Checking work with populate_by_calculating

In [None]:
db2=Database( uiidmap, supports)

In [53]:
def calculate_similarity(db, df, p1, p2, similarity_func):
    # find common reviewers
    common_reviewers = db.supports[db.rindexmap[p1]][db.rindexmap[p2]]
    n_common=len(common_reviewers)
    if p1==p2:
        return 1., n_common
    #get reviews
    p1_ov_AVG = get_restaurant_reviews(p1, df, common_reviewers)
    p2_ov_AVG = get_restaurant_reviews(p2, df, common_reviewers)
    sim=similarity_func(p1_ov_AVG, p2_ov_AVG, n_common)
    return sim, n_common

def populate_by_calculating(db, df, similarity_func):
    """
    a populator for every pair of businesses in df. takes similarity_func like
    pearson_sim as argument
    """
    items=db.rindexmap.items()
    for b1, i1 in items:
        for b2, i2 in items:
            if i1 <= i2:
                sim, nsup=calculate_similarity(db, df, b1, b2, similarity_func)
                db.database_sim[i1][i2]=sim
                db.database_sim[i2][i1]=sim
                db.database_sup[i1][i2]=nsup
                db.database_sup[i2][i1]=nsup

In [None]:
%%time
populate_by_calculating(db2, traindf, pearson_sim)

In [None]:
tpair=('FV0BkoGOd3Yu_eJnXY15ZA', 'O-Xa9GCFWI65YiBD5Jw_hA')
print db.get(tpair[0],tpair[1]),db2.get(tpair[0],tpair[1])

In [54]:
def shrunk_sim(sim, n_common, reg=3.):
    "takes a similarity and shrinks it down by using the regularizer"
    ssim=(n_common*sim)/(n_common+reg)
    return ssim

In [None]:
"""
Function
--------
knearest

Parameters
----------
restaurant_id : string
    The id of the restaurant whose nearest neighbors we want
set_of_restaurants : array
    The set of restaurants from which we want to find the nearest neighbors
dbase : instance of Database class.
    A database of similarities, on which the get method can be used to get the similarity
  of two businesses. e.g. dbase.get(rid1,rid2)
k : int
    the number of nearest neighbors desired, default 7
reg: float
    the regularization.
    
  
Returns
--------
A sorted list
    of the top k similar restaurants. The list is a list of tuples
    (business_id, shrunken similarity, common support).
"""
from operator import itemgetter
def knearest(restaurant_id, set_of_restaurants, dbase, k=7, reg=3.):
    """
    Given a restaurant_id, dataframe, and database, get a sorted list of the
    k most similar restaurants from the set of restaurants.
    """
    similars=[]
    for other_rest_id in set_of_restaurants:
        if other_rest_id!=restaurant_id:
            sim, nc=dbase.get(restaurant_id, other_rest_id)
            ssim=shrunk_sim(sim, nc, reg=reg)
            simdist=(1. - ssim)/2.
            similars.append((other_rest_id, simdist, nc ))
    similars=sorted(similars, key=itemgetter(1))
    return similars[0:k]

In [None]:
def get_users_restaurants(df, user_id):
    dfuser=df[df.user_id==user_id]
    dfuserdedup=dfuser.drop_duplicates('business_id')
    return dict(zip(dfuserdedup.business_id.values, dfuserdedup.stars.values))

In [None]:
"""
Function
--------
rating

Parameters
----------
set_of_restaurants: Dictionary
    The dictionary of restaurant: star-rating pairs you want to make the prediction from.
    This would be the output of a function like get_users_restaurants
train_map: Dictionary
    A dictionary with keys mean, users and items which have estimates of
    overall average or intercept, user coefficients(averages), and
    item coefficients(averages) respectively
dbase : instance of Database class.
    A database of similarities, on which the get method can be used to get the similarity
  of two businessed. e.g. dbase.get(rid1,rid2)
restaurant_id : string
    The id of the restaurant whose nearest neighbors we want
user_id : string
    The id of the user, in whose reviewed restaurants we want to find the neighbors
k : int
    the number of nearest neighbors desired, default 7
reg: float
    the regularization.
    
  
Returns
--------
A float
    which is the imputed rating that we predict that user_id will make for restaurant_id
    
Notes
--------
If the sum of scores is 0, return the baseline estimate of the ranking.
"""
#your code here
# Note: this function was inspired in part by the solutions to the 2013 hw4
def rating(set_of_restaurants, train_map, dbase, restaurant_id, user_id, k=7, reg=3.):
    mu=train_map['mean']
    user_bias = train_map['users'][user_id]
    nsum=0.
    scoresum=0.
    nears=knearest(restaurant_id, set_of_restaurants, dbase, k=k, reg=reg)
    restaurant_bias=train_map['items'][restaurant_id]
    scores=[]
    for r,s,nc in nears:
        ssim = 1-s
        scoresum=scoresum+ssim
        scores.append(ssim)
        r_biases = train_map['items'][r]
        r_stars = set_of_restaurants[r]
        rminusb=(r_stars - (r_biases + user_bias + mu))
        nsum=nsum+ssim*rminusb
    baseline=(user_bias +restaurant_bias + mu)
    if scoresum > 0.:
        val =  nsum/scoresum + baseline
    else:
        val=baseline
    return val

In [None]:
trainuser=traindf.loc[0].user_id
testrest=testdf[testdf.user_id==trainuser].business_id.values[0]
print trainuser, testrest

In [None]:
def get_actual(df, userid, bizid):
    return df[(df.user_id==userid) & (df.business_id==bizid)]['stars'].values[0]

print "Actual", get_actual(testdf, trainuser, testrest)
print "Predicted",rating(get_users_restaurants(traindf, trainuser), train_avgs, db, testrest, trainuser, k=2, reg=3.)

In [None]:
def get_ratings_user_nbd(indf, traindf, train_map, db, k=2, reg=3.):
    zips=zip(indf.business_id, indf.user_id, indf.stars)
    preds=[]
    actuals=[]
    for (r,u,actual) in zips:
        pred=rating(get_users_restaurants(traindf, u),train_map, db, r,u, k, reg)
        preds.append(pred)
        actuals.append(actual)
    return np.array(preds), np.array(actuals)

In [None]:
pt, at = get_ratings_user_nbd(traindf, traindf, train_avgs, db, k=4, reg=4.)
compare_results(at,pt, model="knn(user) on training k=4, reg=4", predicteds=True)