In [1]:
from math import sqrt
from collections import defaultdict

# Load data
def load_data(path):
    females = {}
    females_list = defaultdict(list)
    males = {}
    count = 0
    for line in open(path):
        # Skip column headers
        if not(count == 0):
            (female, male, posting, grade, anal, oral, kiss) = line.split(",")
            females[female] = female
            females_list[female].append(male)
            males.setdefault(male,{})
            
            # Define final raiting
            fgrade = 0.0
            score = 0.0
            if grade == '-1':
                fgrade = 1.0
                if anal == '1':
                    score += 0.5
                elif anal == '-1':
                    score -= 0.10
                if oral == '1':
                    score += 0.5
                elif oral == '-1':
                    score -= 0.10
                if kiss == '1':
                    score += 0.5
                elif kiss == '-1':
                    score -= 0.10
            elif grade == '0':
                fgrade = 4.0
                if anal == '1':
                    score += 0.75
                elif anal == '-1':
                    score -= 0.20
                if oral == '1':
                    score += 0.75 
                elif oral == '-1':
                    score -= 0.20
                if kiss == '1':
                    score += 0.75
                elif kiss == '-1':
                    score -= 0.20
            elif grade == '1':
                fgrade = 7.0
                if anal == '1':
                    score += 1.0
                elif anal == '-1':
                    score -= 0.20
                if oral == '1':
                    score += 1.0
                elif oral == '-1':
                    score -= 0.20
                if kiss == '1':
                    score += 1.0
                elif kiss == '-1':
                    score -= 0.20
            males[male][females[female]] = float(float(fgrade) + float(score))
        count += 1
    print 'Number of Males shared accross all females', len(count_males(females_list))
    return males

# Get count shared clientele
def count_males(fl):
    ml = {}
    for key in fl:
        for m in fl[key]:
            if m in ml:
                ml[m] += 1
            else:
                ml[m] = 0
                
    scores = list()
    for key in ml:
        if ml[key] > 0:
            scores.append(tuple((ml[key], key)))
    scores.sort()
    scores.reverse()
    return scores

#Returns a distance-base similarity score for person1 and person2
def sim_distance(prefs, person1, person2):
    #Get the list of shared_items
    si = {}
    for item in prefs[person1]:
        if item in prefs[person2]:           
            si[item] = 1

    #if they have no rating in common, return 0
    if len(si) == 0: 
        return 0

    #Add up the squares of all differences
    sum_of_squares = sum([pow(prefs[person1][item]-prefs[person2][item],2) for item in prefs[person1] if item in prefs[person2]])
    return 1 / (1 + sum_of_squares)

#Returns the Pearson correlation coefficient for p1 and p2 
def sim_pearson(prefs,p1,p2):
    #Get the list of mutually rated items
    si = {}
    for item in prefs[p1]:
        if item in prefs[p2]: 
            si[item] = 1

    #if they are no rating in common, return 0
    if len(si) == 0:
        return 0

    #sum calculations
    n = len(si)

    #sum of all preferences
    sum1 = sum([prefs[p1][it] for it in si])
    sum2 = sum([prefs[p2][it] for it in si])

    #Sum of the squares
    sum1Sq = sum([pow(prefs[p1][it],2) for it in si])
    sum2Sq = sum([pow(prefs[p2][it],2) for it in si])

    #Sum of the products
    pSum = sum([prefs[p1][it] * prefs[p2][it] for it in si])

    #Calculate r (Pearson score)
    num = pSum - (sum1 * sum2/n)
    den = sqrt((sum1Sq - pow(sum1,2)/n) * (sum2Sq - pow(sum2,2)/n))
    if den == 0:
        return 0

    r = num/den

    return r

#Returns the best matches for person from the prefs dictionary
#Number of the results and similiraty function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
    scores = [(similarity(prefs,person,other),other)
                for other in prefs if other != person]
    scores.sort()
    scores.reverse()
    return scores[0:n]

#Gets recommendations for a person by using a weighted average
#of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson):
    totals = {}
    simSums = {}

    for other in prefs:
        #don't compare me to myself
        if other == person:
            continue
        sim = similarity(prefs,person,other)

        #ignore scores of zero or lower
        if sim <= 0: 
            continue
        for item in prefs[other]:
            #only score books i haven't seen yet
            if item not in prefs[person] or prefs[person][item] == 0:
                #Similarity * score
                totals.setdefault(item,0)
                totals[item] += prefs[other][item] * sim
                #Sum of similarities
                simSums.setdefault(item,0)
                simSums[item] += sim

    #Create the normalized list
    rankings = [(total/simSums[item],item) for item,total in totals.items()]

    #Return the sorted list
    rankings.sort()
    rankings.reverse()
    return rankings

#Function to transform Person, item - > Item, person
def transformPrefs(prefs):
    results = {}
    for person in prefs:
        for item in prefs[person]:
            results.setdefault(item,{})

            #Flip item and person
            results[item][person] = prefs[person][item]
    return results

# Get top N simliarty Nodes
def get_top_sim(data, n1, n, similarity=sim_pearson):
    scores = list()
    for key in data:
        if not(n1 == key):
            a = similarity(data, n1, key)
            if a > 0:
                scores.append(tuple((a, key)))        
    scores.sort()
    scores.reverse()
    return scores[0:n]

In [2]:
buyers = load_data('../data/filtered.csv')

Number of Males shared accross all females 5165


# Buyers

In [394]:
get_top_sim(buyers, '370', 20)

[(1.000000000000405, '4722'),
 (1.0000000000002518, '8425'),
 (1.0000000000002518, '3430'),
 (1.0000000000001381, '9993'),
 (1.0000000000001381, '9840'),
 (1.0000000000001381, '3652'),
 (1.0000000000001381, '14858'),
 (1.0000000000001381, '14780'),
 (1.0000000000001381, '12103'),
 (1.0000000000001381, '10583'),
 (1.0000000000000646, '2595'),
 (1.000000000000056, '12229'),
 (1.0000000000000426, '7219'),
 (1.0000000000000426, '3654'),
 (1.0000000000000142, '6790'),
 (1.0000000000000118, '6715'),
 (1.0000000000000109, '4032'),
 (1.000000000000005, '6023'),
 (1.0000000000000042, '11216'),
 (1.000000000000004, '7654')]

In [395]:
get_top_sim(buyers, '370', 20, similarity=sim_distance)

[(1.0, '9864'),
 (1.0, '9843'),
 (1.0, '9772'),
 (1.0, '9741'),
 (1.0, '973'),
 (1.0, '97'),
 (1.0, '965'),
 (1.0, '9636'),
 (1.0, '9607'),
 (1.0, '9594'),
 (1.0, '9586'),
 (1.0, '9585'),
 (1.0, '9584'),
 (1.0, '9581'),
 (1.0, '958'),
 (1.0, '9569'),
 (1.0, '9553'),
 (1.0, '951'),
 (1.0, '9454'),
 (1.0, '9395')]

In [376]:
topMatches(buyers, '370', 10)

[(1.000000000000405, '4722'),
 (1.0000000000002518, '8425'),
 (1.0000000000002518, '3430'),
 (1.0000000000001381, '9993'),
 (1.0000000000001381, '9840'),
 (1.0000000000001381, '3652'),
 (1.0000000000001381, '14858'),
 (1.0000000000001381, '14780'),
 (1.0000000000001381, '12103'),
 (1.0000000000001381, '10583')]

In [377]:
topMatches(buyers, '370', 10, similarity=sim_distance)

[(1.0, '9864'),
 (1.0, '9843'),
 (1.0, '9772'),
 (1.0, '9741'),
 (1.0, '973'),
 (1.0, '97'),
 (1.0, '965'),
 (1.0, '9636'),
 (1.0, '9607'),
 (1.0, '9594')]

In [378]:
getRecommendations(buyers,'370')[0:4]

[(9.000000000000002, '9291'),
 (9.000000000000002, '6190'),
 (9.000000000000002, '5224'),
 (9.000000000000002, '4817')]

In [379]:
getRecommendations(buyers,'370', similarity=sim_distance)[0:4] 

[(9.000000000000002, '9064'),
 (9.000000000000002, '8617'),
 (9.000000000000002, '8132'),
 (9.000000000000002, '5782')]

# Sellers

In [380]:
sellers = transformPrefs(buyers)

In [396]:
get_top_sim(sellers, '18', 20)

[(1.00000000000002, '561'),
 (1.000000000000016, '5630'),
 (1.0000000000000089, '7222'),
 (1.000000000000007, '316'),
 (1.000000000000007, '1048'),
 (1.0000000000000053, '207'),
 (1.0000000000000038, '3122'),
 (1.0000000000000007, '31'),
 (1.0000000000000007, '12447'),
 (1.0000000000000002, '8730'),
 (1.0, '9280'),
 (1.0, '7952'),
 (1.0, '6943'),
 (1.0, '3934'),
 (1.0, '357'),
 (1.0, '3262'),
 (1.0, '317'),
 (1.0, '244'),
 (1.0, '15735'),
 (1.0, '13993')]

In [397]:
get_top_sim(sellers, '18', 20, similarity=sim_distance)

[(1.0, '9866'),
 (1.0, '9673'),
 (1.0, '9538'),
 (1.0, '9537'),
 (1.0, '9386'),
 (1.0, '9239'),
 (1.0, '9225'),
 (1.0, '9159'),
 (1.0, '9135'),
 (1.0, '9042'),
 (1.0, '8915'),
 (1.0, '8906'),
 (1.0, '8818'),
 (1.0, '8663'),
 (1.0, '8622'),
 (1.0, '8584'),
 (1.0, '8570'),
 (1.0, '849'),
 (1.0, '8335'),
 (1.0, '8227')]

In [383]:
topMatches(sellers,'18')

[(1.00000000000002, '561'),
 (1.000000000000016, '5630'),
 (1.0000000000000089, '7222'),
 (1.000000000000007, '316'),
 (1.000000000000007, '1048')]

In [384]:
topMatches(sellers,'18', similarity=sim_distance)

[(1.0, '9866'), (1.0, '9673'), (1.0, '9538'), (1.0, '9537'), (1.0, '9386')]

In [385]:
getRecommendations(sellers,'18')[0:5]

[(9.000000000000002, '9823'),
 (9.000000000000002, '9136'),
 (9.000000000000002, '9068'),
 (9.000000000000002, '7261'),
 (9.000000000000002, '4387')]

In [386]:
getRecommendations(sellers,'18', similarity=sim_distance)[0:5]

[(9.000000000000004, '12363'),
 (9.000000000000002, '9675'),
 (9.000000000000002, '9584'),
 (9.000000000000002, '8527'),
 (9.000000000000002, '8492')]