In [3]:
import numpy as np, matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import json
import sys
from scipy import stats
from sklearn.cross_validation import *

%matplotlib inline

Below is a function that can be used to determine batter similarity. The function takes the batter-pitcher pair to predict, and it returns similar batters who have faced the given pitcher.

In [None]:
def sim_batters(bID, pID, df, weights=None, n=10):
    """Return the batters most similar to `bID` among those who faced `pID`.

    Similarity is measured as a weighted distance: for each numeric statistic
    the absolute difference relative to the target batter's value is added,
    plus a fixed penalty when the two batters' handedness differs. Batters
    are returned from most similar (smallest distance) to least similar.

    Parameters
    ----------
    bID : batter id of the batter to find neighbours for.
    pID : pitcher id; only batters with a row against this pitcher qualify.
    df : DataFrame with columns 'bID', 'pID', 'AVG', 'OBP', 'SAC' and 'RL'.
        NOTE(review): each batter's statistics are taken from the first row
        in which that batter appears, i.e. the stats are assumed to be
        batter-level values repeated on every row -- confirm against the
        data-loading cell.
    weights : optional dict overriding the default weight of 1.0 for any of
        the keys 'AVG', 'OBP', 'SAC', 'hand'.
    n : maximum number of batter ids to return (default 10).

    Returns
    -------
    list of at most `n` batter ids, most similar first. The target batter
    itself is never included.

    Raises
    ------
    ValueError if `bID` has no row in `df`.
    """
    stat_cols = ['AVG', 'OBP', 'SAC']
    w = {'AVG': 1.0, 'OBP': 1.0, 'SAC': 1.0, 'hand': 1.0}
    if weights is not None:
        w.update(weights)
    # continue for other statistics: add the column to stat_cols and w

    # One row of statistics per batter, indexed by batter id.
    per_batter = df.drop_duplicates(subset='bID').set_index('bID')
    if bID not in per_batter.index:
        raise ValueError("batter %r not found in df" % (bID,))
    target = per_batter.loc[bID]

    # Batters who have faced the given pitcher, excluding the target itself
    # so it cannot be returned as its own nearest neighbour.
    faced = df.loc[df['pID'] == pID, 'bID'].unique()
    candidates = [b for b in faced if b != bID]

    distance = {}
    for b in candidates:
        comp = per_batter.loc[b]
        total = 0.0
        for col in stat_cols:
            gap = abs(target[col] - comp[col])
            scale = abs(target[col])
            # Relative difference; fall back to the absolute difference when
            # the target's value is 0 to avoid dividing by zero.
            total += w[col] * (gap / scale if scale != 0 else gap)
        if target['RL'] != comp['RL']:
            total += w['hand']
        distance[b] = total

    # Smallest distance == most similar. sorted() is stable, so ties keep the
    # order in which the batters appear in df.
    ranked = sorted(distance, key=distance.get)
    return ranked[:n]
    

Below are functions from pset 4 that might be useful for the KNN model.

In [None]:
from scipy.stats import pearsonr


def pearson_sim(rest1_reviews, rest2_reviews, n_common):
    """Pearson correlation between two restaurants' average-adjusted ratings.

    Parameters
    ----------
    rest1_reviews, rest2_reviews : DataFrame or dict
        Reviews of restaurant 1 and restaurant 2 by the reviewers the two
        restaurants have in common. Each must provide 'user_avg' and 'stars'
        as equal-length sequences (DataFrame columns, lists or arrays), with
        the reviewers in the same order in both inputs.
    n_common : int
        Number of common reviewers.

    Returns
    -------
    float
        The Pearson correlation coefficient of (user_avg - stars) between
        the two restaurants. Returns 0.0 when there are fewer than two
        common reviewers, or when the correlation is NaN (which happens when
        either side has zero variance).
    """
    # With 0 or 1 common reviewers the correlation is undefined.
    if n_common < 2:
        return 0.0

    # np.asarray accepts DataFrame columns (whatever their index), lists and
    # arrays alike, so no separate code path per input type is needed.
    diff1 = (np.asarray(rest1_reviews['user_avg'], dtype=float)
             - np.asarray(rest1_reviews['stars'], dtype=float))
    diff2 = (np.asarray(rest2_reviews['user_avg'], dtype=float)
             - np.asarray(rest2_reviews['stars'], dtype=float))

    rho = pearsonr(diff1, diff2)[0]
    if np.isnan(rho):
        return 0.0
    return rho

In [None]:
def get_restaurant_reviews(restaurant_id, df, set_of_users):
    """
    Return the rows of `df` that review `restaurant_id` and were written by
    a user in `set_of_users`, keeping only the first such row per user.
    """
    for_restaurant = df.business_id == restaurant_id
    by_wanted_user = df.user_id.isin(set_of_users)
    matching = df[by_wanted_user & for_restaurant]
    # Drop any repeat review by the same user; the first occurrence wins.
    first_per_user = ~matching.user_id.duplicated()
    return matching[first_per_user]

In [None]:
class Database(object):
    """A database of pairwise similarities and common supports.

    Attributes:
        rindexmap: mapping from an id to its integer row/column index.
        supports: the supports object handed to the constructor (stored
            as-is; replaceable through `set_supports`).
        database_sim: square float array of similarities, initialised to 0.
        database_sup: square integer array of common-support counts,
            initialised to 0.
    """

    def __init__(self, rindexmap, supports):
        """Create zero-filled similarity and support matrices.

        `rindexmap` maps ids to integer indices; both matrices are sized
        len(rindexmap) x len(rindexmap).
        """
        self.rindexmap = rindexmap
        self.supports = supports
        n_ids = len(self.rindexmap)
        self.database_sim = np.zeros([n_ids, n_ids])
        # The builtin `int` is what the removed `np.int` alias pointed to.
        self.database_sup = np.zeros([n_ids, n_ids], dtype=int)

    def set_supports(self, supports):
        """Replace the stored supports object."""
        self.supports = supports

    def get(self, b1, b2):
        """Return the tuple (similarity, common_support) for ids b1 and b2."""
        row = self.rindexmap[b1]
        col = self.rindexmap[b2]
        return (self.database_sim[row, col], self.database_sup[row, col])

In [None]:
# Create the similarity/support store with zero-filled matrices sized by the
# id -> index map.
# NOTE(review): `uiidmap` and `supports` are not defined in the visible cells;
# presumably they are built earlier in the notebook -- confirm this cell
# survives Restart & Run All.
db=Database(uiidmap, supports)

In [None]:
# Take the user of the training row labelled 0 and the first restaurant that
# user reviewed in the test set, as a single worked example.
trainuser=traindf.loc[0].user_id
testrest=testdf[testdf.user_id==trainuser].business_id.values[0]
# %-formatting prints the same text under the Python 2 print statement and
# the Python 3 print function.
print("%s %s" % (trainuser, testrest))

In [None]:
def get_actual(df, userid, bizid):
    """Return the 'stars' value of the first row in `df` matching the user and business.

    Raises IndexError when no row matches.
    """
    is_match = (df.user_id == userid) & (df.business_id == bizid)
    star_values = df.loc[is_match, 'stars'].values
    return star_values[0]

# Compare the true test rating with the k=2, reg=3. prediction for the example
# user/restaurant pair. %-formatting prints the same text under the Python 2
# print statement and the Python 3 print function.
# NOTE(review): `rating`, `get_users_restaurants` and `train_avgs` are not
# defined in the visible cells -- confirm they are defined earlier.
print("Actual %s" % (get_actual(testdf, trainuser, testrest),))
print("Predicted %s" % (rating(get_users_restaurants(traindf, trainuser), train_avgs, db, testrest, trainuser, k=2, reg=3.),))

In [None]:
def get_ratings_user_nbd(indf, traindf, train_map, db, k=2, reg=3.):
    """Predict a rating for every (business, user) row of `indf`.

    For each row, the user's restaurants are looked up in `traindf` with
    `get_users_restaurants` and passed to `rating` together with
    `train_map`, `db`, the business id, the user id, `k` and `reg`.

    Returns a pair of numpy arrays (predictions, actual stars), aligned
    with the rows of `indf`.
    """
    rows = list(zip(indf.business_id, indf.user_id, indf.stars))
    predicted = [
        rating(get_users_restaurants(traindf, user), train_map, db, biz, user, k, reg)
        for biz, user, _ in rows
    ]
    observed = [stars for _, _, stars in rows]
    return np.array(predicted), np.array(observed)

In [None]:
# Predict ratings for the training rows themselves (k=4, reg=4.); `pt` holds
# the predictions and `at` the actual stars, which are then passed to
# compare_results.
# NOTE(review): `compare_results` and `train_avgs` are not defined in the
# visible cells -- confirm they are defined earlier in the notebook.
pt, at = get_ratings_user_nbd(traindf, traindf, train_avgs, db, k=4, reg=4.)
compare_results(at,pt, model="knn(user) on training k=4, reg=4", predicteds=True)