# IMPORTS

In [2]:
import sqlite3
import pandas as pd
import numpy as np
import math
sqlite3.sqlite_version
%matplotlib inline

from sklearn.neighbors import NearestNeighbors

database = sqlite3.connect('database.sqlite')

# DATA CLEANING
- seřadíme záznamy podle _ProductId_

- odstraníme duplikáty, aby naše výsledky byly bez biasu (velmi podobne produkty, stejne review) - nechavame jen prvni

- díváme se na velikost _HelpfulnessNumerator_ a _HelpfulnessDenominator_


- vytvoření dictionary _users_products_, kde si pro každého uživatele ukládáme koupené produkty a číselné hodnocení, které jim dal (dictionary kvůli průměrně konstantní časové složitosti dotazu)

In [3]:
df = pd.read_sql_query("select * from Reviews limit 6000", database)
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
5995,5996,B000I6PXLC,A1EWO3RDAXB4VO,J. OTTO,1,7,1,1278720000,This is not that great,You have to put a lot of sugar in this to mak...
5996,5997,B003SE52K8,AXSV7TB3PCP95,Linda Morton,5,5,5,1313280000,"The ""paws down"" favorite among canned food!","My female Pixie Bob cat, Pawnee, is extremely ..."
5997,5998,B003SE52K8,A14JSPGLFT4C68,Brian Field,4,4,5,1331596800,Worked great for my cat,This is a great food and my cat loves it! The ...
5998,5999,B003SE52K8,AHDUT7IGZW7HY,"AlleyCat Advocat ""Laure""",4,4,5,1312761600,IBD food that my cat will actually eat!!!!!!!!,I was at my wits end with hypoallergenic foods...


In [4]:
#Deduplication of entries
df = df.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first')
df.shape

(5978, 10)

In [5]:
df = df[df.HelpfulnessNumerator<=df.HelpfulnessDenominator]

In [6]:
users_products = {}
for index, radek in df.iterrows():
    product_info = []
    user_id = radek.UserId
    product_id = radek.ProductId
    score = radek.Score
    product_info.append(product_id)
    product_info.append(score)

    if user_id not in users_products:
        users_products[user_id] = []
    users_products[user_id] += [product_info]
users_products

{'A3SGXH7AUHU8GW': [['B001E4KFG0', 5]],
 'A1D87F6ZCVE5NK': [['B00813GRG4', 1]],
 'ABXLMWJIXXAIN': [['B000LQOCH0', 4]],
 'A395BORC6FGVXV': [['B000UA0QIQ', 2]],
 'A1UQRSCLF8GW1T': [['B006K2ZZ7K', 5]],
 'ADT0SRK1MGOEU': [['B006K2ZZ7K', 4]],
 'A1SP2KVKFXXRU1': [['B006K2ZZ7K', 5]],
 'A3JRGQVEQN31IQ': [['B006K2ZZ7K', 5]],
 'A1MZYO9TZK0BBI': [['B000E7L2R4', 5]],
 'A21BT40VZCCYT4': [['B00171APVA', 5]],
 'A3HDKO7OW0QNK4': [['B0001PB9FE', 5]],
 'A2725IB4YY9JEB': [['B0009XLVG0', 5], ['B001L1KH6Y', 5]],
 'A327PCT23YH90': [['B0009XLVG0', 1]],
 'A18ECVX2RJ7HUE': [['B001GVISJM', 4]],
 'A2MUGFV2TDQ47K': [['B001GVISJM', 5],
  ['B000G6RYNE', 3],
  ['B001CWZXIY', 5],
  ['B00139TT72', 5]],
 'A1CZX3CP8IKQIJ': [['B001GVISJM', 5]],
 'A3KLWF6WQ5BNYO': [['B001GVISJM', 2]],
 'AFKW14U97Z6QO': [['B001GVISJM', 5]],
 'A2A9X58G2GTBLP': [['B001GVISJM', 5]],
 'A3IV7CL2C13K2U': [['B001GVISJM', 5]],
 'A1WO0KGLPR5PV6': [['B001GVISJM', 5]],
 'AZOF9E17RGZH8': [['B001GVISJM', 5]],
 'ARYVQL4N737A1': [['B001GVISJM', 5]],
 'AJ

In [7]:
most_popular = pd.read_sql_query("""
select ProductId, sum(Score)
from Reviews
where Score in (3, 4, 5)
group by ProductId
order by sum(Score) desc
limit 6000
""", database)

most_popular

Unnamed: 0,ProductId,sum(Score)
0,B007JFMH8M,4156
1,B003B3OOPA,2932
2,B002QWP8H0,2849
3,B002QWP89S,2849
4,B002QWHJOU,2849
...,...,...
5995,B001II46YO,67
5996,B001GVIRCU,67
5997,B001FA1SI2,67
5998,B001EQ5GX0,67


# Interaction matrix
- vytvářím tabulku _im_, kde si ukládám interakce mezi každým uživatelem

In [8]:
im = df[['UserId', 'ProductId', 'Score']] #interaction matrix
im = im.drop_duplicates(['UserId', 'ProductId'])
im = im.pivot(index='UserId', columns='ProductId')
im

Unnamed: 0_level_0,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score,Score
ProductId,B00002NCJC,B00002Z754,B000084DVR,B000084E1U,B0000CGFV4,B0000D94SZ,B0000DC5IY,B0000E65W9,B0000GGI00,B0000TU8EO,...,B008YAXFWI,B008YGWIZM,B009166ECC,B0092X7B5S,B0092XAMDQ,B0093NIWVO,B0096E5196,B009HINRX8,B009UOFU20,B009WSNWC4
UserId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
A10012K7DF3SBQ,,,,,,,,,,,...,,,,,,,,,,
A1001TYW5FZYD9,,,,,,,,,,,...,,,,,,,,,,
A102XKYZE9Q9L4,,,,,,,,,,,...,,,,,,,,,,
A10317LUD1C1VJ,,,,,,,,,,,...,,,,,,,,,,
A103EZCS9H8WW1,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZVQMPXZEHQGZ,,,,,,,,,,,...,,,,,,,,,,
AZVR7NPPEDMMN,,,,,,,,,,,...,,,,,,,,,,
AZX7EZIIFMFI7,,,,,,,,,,,...,,,,,,,,,,
AZYJE40XW6MFG,,,,,,,,,,,...,,,,,,,,,,


In [9]:
X = im.values
print(X)

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]


In [10]:
for a in range(0, len(X)):
    aa = X[a]
    for b in range(0, len(aa)):
        if math.isnan(X[a][b]):
            X[a][b] = 0
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# NEAREST NEIGHBORS
- hledám uživatele, kteří si kupovali nejpodobnější produkty a byli podobně spokojení (každého uživatele si reprezentujeme jako vektor určený jeho hodnoceními pro různé produkty - potom sousedy můžeme najít jednoduše nalezením nejbližších vektorů)
- lze zvolit různé metriky na určení vzdálenosti vektorů - já pro jednoduchost používám funkci _NearestNeighbors_ z knihovny _sklearn.neighbors_, která používá euklidovskou normu
- sousedy každého uživatele si ukládám do dictionary _nearest_neighbors_ (dictionary používám kvůli dobré časové složitosti)

In [11]:
knn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(X)
knn

In [None]:
distances, indices = knn.kneighbors(X)
indices

In [None]:
distances

In [None]:
knn_graph = knn.kneighbors_graph(X, mode='distance').toarray()
# metric='cosine', include_self=False
knn_graph

In [None]:
nearest_neighbors = {}
for i in range(knn_graph.shape[0]):
    user_id = im.index.values[i]
    x = []

    for j in range(knn_graph.shape[1]):
        if knn_graph[i][j] != 0.0:
            neighbor_id = im.index.values[j]
            x.append(neighbor_id)

    nearest_neighbors[user_id] = x

nearest_neighbors

In [None]:
users_products

In [None]:
# users_with_rec_prods = []
recommendations = {}
for user_id in nearest_neighbors.keys():
    recomms_priorities = {}
    if user_id not in recommendations:
        recommendations[user_id] = []
    already_bought = []
    for item in users_products[user_id]:
        already_bought.append(item[0])

    for neighbor in nearest_neighbors[user_id]:
        for product in users_products[neighbor]:
            if product[0] not in already_bought: #pokud uzivatel jeste produkt nekoupil
                if not int(product[1]) < 3: #pokud se sousedovi film libil
                    if not product[0] in recomms_priorities:
                        recomms_priorities[product[0]] = 0
                    recomms_priorities[product[0]] += int(product[1])

    sorted_recomms = {k: v for k, v in sorted(recomms_priorities.items(), key=lambda item: item[1])}

    for s in range(10):
        for item in sorted_recomms.items():
            recommendations[user_id].append(item[0])
        if s == len(sorted_recomms.items())-1:
            break
        if user_id not in users_with_rec_prods:
            users_with_rec_prods.append(user_id)

# INPUT
- Po zadání _UserId_ a požadovaného počtu produktů k doporučení budou vráceny produkty seřazené od toho nejrelevantnějšího.

In [None]:
user_id = input('Enter the Id of user you wish to receive recommendations for (for example A2W3ABLWMJ84NS, A2WH53MCZ2WE3, A2WJH1QD0M9X51, A2WVF9ZQ068DN0, A2ZNLPYMZOXLGX) : ')
n = input('Enter number of recommended products: ')

def main(user_id, n):
    n = int(n)
    to_return = []
    for recommendation in recommendations[user_id]:
        if len(to_return) < n:
            to_return.append(recommendation)
    for recommendation in most_popular.ProductId:
        if len(to_return) < n:
            to_return.append(recommendation)
    return to_return

main(user_id, n)