<a href="https://colab.research.google.com/github/cbobadillaunsa/movielens/blob/main/BI_MovieLens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommendation System

# Small Dataset

## Imports

In [1]:
import math
import pandas as pd

## Code

### Read CSV file and get specifics column

In [2]:
# Read CSV file by name
def readFile(filename, delim=','):
    data = pd.read_csv(filename, delimiter=delim)
    return data


# Search user preferences
def buscar(user, data):
    lista = data[user].tolist()
    return lista

### Distances

In [3]:
# Euclidian distance
def euclidiana(user1, user2):
    dist = 0.0
    count = 0
    for i in range(len(user1)):
        if (not math.isnan(user1[i])) and (not math.isnan(user2[i])):
            x = user1[i]
            y = user2[i]
            dist += math.pow(x - y, 2)
            count += 1

    if count == 0:
        return 9999.99
    return math.sqrt(dist)


# Manhattan distance
def manhattan(user1, user2):
    dist = 0.0
    count = 0
    for i in range(len(user1)):
        if (not math.isnan(user1[i])) and (not math.isnan(user2[i])):
            x = user1[i]
            y = user2[i]
            dist += abs(x - y)
            count += 1

    if count == 0:
        return 9999.99
    return dist


# Cosine distance
def coseno(user1, user2):
    xy = xx = yy = 0.0
    count = 0
    for i in range(len(user1)):
        if (not math.isnan(user1[i])) and (not math.isnan(user2[i])):
            x = user1[i]
            y = user2[i]
            xy += x * y
            xx += x * x
            yy += y * y
            count += 1

    den = math.sqrt(xx) * math.sqrt(yy)
    if den == 0:
      return math.nan
    if count == 0:
      return math.nan
    return xy / den  # -1 a +1


# Pearson distance
def pearson(user1, user2):
    xy = zx = zy = n = zxx = zyy = 0.0
    count = 0
    for i in range(len(user1)):
        if (not math.isnan(user1[i])) and (not math.isnan(user2[i])):
            x = user1[i]
            y = user2[i]
            xy += x * y
            zx += x
            zy += y
            n += 1
            zxx += x * x
            zyy += y * y
            count += 1

    a = xy - (zx * zy) / n
    b = math.sqrt(zxx - (zx * zx) / n) * math.sqrt(zyy - (zy * zy) / n)
    if b == 0:
      return math.nan
    if count == 0:
      return math.nan
    return a / b  # -1 a +1

### K-nn

In [4]:
# K-Nearest neighbour
def knn(N, distancia, usuario, data):  # N numero de vecinos
    funName = distancia.__name__
    print('k-nn', funName)

    listDist, listName = initVectDist(funName, N)
    otherusers = data.columns.values.tolist()
    vectoruser = buscar(usuario, data)

    for i in range(1, len(otherusers)):
        tmpuser = otherusers[i]
        if tmpuser != usuario:
            tmpvector = buscar(tmpuser, data)
            tmpdist = distancia(vectoruser, tmpvector)
            if tmpdist is not math.nan:
              listDist, listName = keepClosest(funName, listDist, listName, tmpdist, tmpuser, N)

    return listDist, listName

### Keep closest values only (RUN)

In [5]:
# Init K-vector with correct value based on distance type
def initVectDist(funName, N):
    if funName == 'euclidiana' or funName == 'manhattan' or funName == 'euclidianaL' or funName == 'manhattanL':
        ls = [99999] * N
    else:
        ls = [-1] * N

    lu = [None] * N
    return ls, lu


# Keep the closest values, avoiding sort
def keepClosest(funname, lstdist, lstuser, newdist, newuser, N):
    if funname == 'euclidiana' or funname == 'manhattan' or funname == 'euclidianaL' or funname == 'manhattanL':
        count = -1
        for i in lstdist:
            count += 1
            if newdist > i:
                continue
            lstdist.insert(count, newdist)
            lstuser.insert(count, newuser)
            break
    else:
        count = -1
        for i in lstdist:
            count += 1
            if newdist < i:
                continue
            lstdist.insert(count, newdist)
            lstuser.insert(count, newuser)
            break

    if len(lstdist) > N:
        lstdist.pop()
        lstuser.pop()
    return lstdist, lstuser

### Recommendation

In [6]:
# Find the K closest firsts Item recommendation
def recommendation(usuario, distancia, N, items, minr, data):
    ldistK, luserK = knn(N, distancia, usuario, data)

    user = buscar(usuario, data)
    recom = [None] * N
    for i in range(0, N):
        recom[i] = buscar(luserK[i], data)
    # print('user preference:', user)

    lstRecomm = [-1] * items
    lstUser = [None] * items
    lstObj = [None] * items
    k = 0

    for i in range(len(user)):
        if math.isnan(user[i]):
            for j in range(0, N):
                if not (math.isnan(recom[j][i])):
                    currentR = recom[j][i]
                    if currentR > minr and lstRecomm[k] < currentR:
                        lstRecomm[k] = currentR
                        lstUser[k] = j
                        lstObj[k] = i
            if lstRecomm[k] > -1:
                k += 1
        if k >= items:
            break

    finallst = []
    for i in range(k):
        tmp = [luserK[lstUser[i]], data.iloc[lstObj[i], 0], lstRecomm[i]] # UnSort
        finallst.append(tmp)
    return finallst

## Distances Test

In [None]:
datos = readFile('small.csv', ';')

la = buscar('Angelica', datos)
print(la)
lb = buscar('Bill', datos)
print(lb)

print(euclidiana(la, lb))
print(manhattan(la, lb))
print(coseno(la, lb))
print(pearson(la, lb))

[3.5, 2.0, nan, 4.5, 5.0, 1.5, 2.5, 2.0]
[2.0, 3.5, 4.0, nan, 2.0, 3.5, nan, 3.0]
4.301162633521313
9.0
0.7939410698035858
-0.9040534990682699


### Form Test

In [None]:
la = buscar('Angelica', datos)
lb = buscar('Chan', datos)
print(euclidiana(la, lb))

la = buscar('Hailey', datos)
lb = buscar('Jordyn', datos)
print(euclidiana(la, lb))

la = buscar('Hailey', datos)
lb = buscar('Sam', datos)
print(euclidiana(la, lb))

la = buscar('Veronica', datos)
lb = buscar('Bill', datos)
print(euclidiana(la, lb))

la = buscar('Hailey', datos)
lb = buscar('Jordyn', datos)
print(manhattan(la, lb))

la = buscar('Sam', datos)
lb = buscar('Chan', datos)
print(manhattan(la, lb))

la = buscar('Dan', datos)
lb = buscar('Veronica', datos)
print(manhattan(la, lb))

la = buscar('Angelica', datos)
lb = buscar('Bill', datos)
print(manhattan(la, lb))

la = buscar('Veronica', datos)
lb = buscar('Hailey', datos)
print(coseno(la, lb))

la = buscar('Sam', datos)
lb = buscar('Bill', datos)
print(pearson(la, lb))

2.3979157616563596
4.387482193696061
2.449489742783178
2.449489742783178
7.5
4.0
4.0
9.0
0.9701425001453319
-0.8164965809277261


## Small dataset

In [None]:
datos = readFile('small.csv', ';')
usuario = 'Bill'

rfunc = euclidiana
ldist, luser = knn(3, rfunc, usuario, datos)
print('close', ldist)
print('users', luser)

for i in range(0, len(ldist)):
  print(luser[i], ldist[i])


k-nn euclidiana
close [2.1213203435596424, 2.449489742783178, 3.4641016151377544]
users ['Dan', 'Veronica', 'Jordyn']
Dan 2.1213203435596424
Veronica 2.449489742783178
Jordyn 3.4641016151377544


In [None]:
lista = recommendation(usuario, rfunc, 3, 5, 3.0, datos)
for i in lista:
    print(i)

k-nn euclidiana
['Veronica', 'Norah Jones', 5.0]
['Dan', 'The Strokes', 4.0]


## Medium dataset
http://guidetodatamining.com/chapter2/

In [None]:
#datos = readFile('medium.csv')
datos = readFile('http://guidetodatamining.com/assets/data/Movie_Ratings.csv')
usuario = 'Jessica'

rfunc = manhattan
ldist, luser = knn(3, rfunc, usuario, datos)
print('close', ldist)
print('users', luser)
for i in range(0, len(ldist)):
  print(luser[i], ldist[i])

k-nn manhattan
close [3.0, 4.0, 5.0]
users ['Amy', 'Erin', 'Josh']
Amy 3.0
Erin 4.0
Josh 5.0


In [None]:
lista = recommendation(usuario, rfunc, 5, 3, 3.0, datos)
for i in lista:
    print(i)

k-nn manhattan
['Katherine', 'Avatar', 5.0]
['Erin', 'Braveheart', 4.0]
['Katherine', 'Dodgeball', 5.0]


### Form Test

In [None]:
la = buscar('Jessica', datos)
lb = buscar('Jeff', datos)
print(euclidiana(la, lb))

la = buscar('Matt', datos)
lb = buscar('Josh', datos)
print(pearson(la, lb))

la = buscar('Matt', datos)
lb = buscar('Josh', datos)
print(manhattan(la, lb))

la = buscar('Erin', datos)
lb = buscar('Patrick T', datos)
print(coseno(la, lb))

la = buscar('ben', datos)
lb = buscar('Gary', datos)
print(pearson(la, lb))

3.0
nan
1.0
0.8789531540569908
0.21573139418783355


# Large Dataset

https://grouplens.org/datasets/movielens/

## Code

### Read file to Sparse Matrix

In [7]:
def readLargeFile(filename, delim=','):
  data = pd.read_csv(filename, delimiter=delim, header=None)

  lst = {} # Dictionary
  j = 0
  for index, row in data.iterrows():
      #print(row[0], row[1], row[2])
      if j != row[0]:
          j = row[0]
          tmp = {row[1]:row[2]}
          lst[row[0]] = tmp
      else:
          tmp = lst.get(row[0])
          tmp[row[1]] = row[2]
          lst[row[0]] = tmp
  return lst

### Distances (Sparse Matrix)

In [None]:
# Euclidian distance
def euclidianaL(user1, user2):
    dist = 0.0
    count = 0
    for i in user2:
        if not (user1.get(i) is None):
            x = user1.get(i)
            y = user2.get(i)
            dist += math.pow(x - y, 2)
            count += 1

    if count == 0:
        return 9999.99
    return math.sqrt(dist)

# Manhattan distance
def manhattanL(user1, user2):
    dist = 0.0
    count = 0
    for i in user2:
        if not (user1.get(i) is None):
            x = user1[i]
            y = user2[i]
            dist += abs(x - y)
            count += 1

    if count == 0:
        return 9999.99
    return dist


# Cosine distance
def cosenoL(user1, user2):
    xy = xx = yy = 0.0
    count = 0
    for i in user2:
        if not (user1.get(i) is None):
            x = user1[i]
            y = user2[i]
            xy += x * y
            xx += x * x
            yy += y * y
            count += 1

    den = math.sqrt(xx) * math.sqrt(yy)
    if den == 0:
      return math.nan
    if count == 0:
        return math.nan
    return xy / den  # -1 a +1


# Pearson distance
def pearsonL(user1, user2):
    xy = zx = zy = n = zxx = zyy = 0.0
    count = 0
    for i in user2:
        if not (user1.get(i) is None):
            x = user1[i]
            y = user2[i]
            xy += x * y
            zx += x
            zy += y
            n += 1
            zxx += x * x
            zyy += y * y
            count += 1

    if n == 0:
        return math.nan
    a = xy - (zx * zy) / n
    b = math.sqrt(zxx - (zx * zx) / n) * math.sqrt(zyy - (zy * zy) / n)
    if b == 0:
        return math.nan
    if count == 0:
        return math.nan
    return a / b  # -1 a +1

### K-NN (Sparse Matrix)

In [None]:
# K-Nearest neighbour
def knn_L(N, distancia, usuario, data):  # N numero de vecinos
    funName = distancia.__name__
    print('k-nn', funName)

    listDist, listName = initVectDist(funName, N)
    nsize = len(data)
    otherusers = range(0, nsize)
    vectoruser = data.get(usuario)

    for i in range(0, nsize):
        tmpuser = i
        if tmpuser != usuario:
            tmpvector = data.get(tmpuser)
            if not (tmpvector is None):
              tmpdist = distancia(vectoruser, tmpvector)
              if tmpdist is not math.nan:
                listDist, listName = keepClosest(funName, listDist, listName, tmpdist, tmpuser, N)

    return listDist, listName

### Recommendation (sparce Matrix)

In [None]:
# Find the K closest firsts Item recommendation
def recommendationL(usuario, distancia, N, items, minr, data):
    ldistK, luserK = knn_L(N, distancia, usuario, data)

    user = data.get(usuario)
    recom = [None] * N
    for i in range(0, N):
        recom[i] = data.get(luserK[i])
    # print('user preference:', user)

    lstRecomm = [-1] * items
    lstUser = [None] * items
    lstObj = [None] * items
    k = 0

    fullObjs = {}
    count = 0
    for i in recom:
        for j in i:
          tmp = fullObjs.get(j)
          if tmp is None:
            fullObjs[j] = [i.get(j), luserK[count]]
          else:
            nval = i.get(j)
            if nval > tmp[0]:
              fullObjs[j] = [nval, luserK[count]]
        count += 1

    finallst = topSuggestions(fullObjs, count, items)
    return finallst

Top suggestions filter

In [None]:
def topSuggestions(fullObj, k, items):
  rp = [-1]*items

  for i in fullObj:
    rating = fullObj.get(i)

    for j in range(0, items):
      if rp[j] == -1 :
        tmp = [i, rating[0], rating[1]]
        rp.insert(j, tmp)
        rp.pop()
        break
      else:
        tval = rp[j]
        if tval[1] < rating[0]:
          tmp = [i, rating[0], rating[1]]
          rp.insert(j, tmp)
          rp.pop()
          break

  return rp


## 1 Million
ml-1m.zip contents 1M recommendations
https://grouplens.org/datasets/movielens/

### Load 1M Dataset

In [None]:
!wget 'https://files.grouplens.org/datasets/movielens/ml-1m.zip'

--2023-11-25 14:13:09--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2023-11-25 14:13:09 (32.8 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]



In [None]:
!unzip ml-1m.zip

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [None]:
lst = readLargeFile('ml-1m/ratings.dat', '::') # 55 seg, 6040

  data = pd.read_csv(filename, delimiter=delim, header=None)


Size

In [None]:
len(lst)

6040

Distance Test

In [None]:
a = lst.get(50)
print('a: ', len(a))

b = lst.get(5707)
print('b: ', len(b))

print('euclidiana: ', euclidianaL(a,b))
print('manhattan:  ', manhattanL(a, b))
print('coseno:     ', cosenoL(a, b))
print('pearson:    ', pearsonL(a, b))

a:  43
b:  100
euclidiana:  1.4142135623730951
manhattan:   2.0
coseno:      0.9897475249773018
pearson:     1.000000000000004


### KNN 1M

In [None]:
usuario = 50
rfunc = euclidianaL

ldist, luser = knn_L(10, rfunc, usuario, lst)
print('close', ldist)
print('users', luser)

k-nn euclidianaL
close [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
users [6039, 6006, 6005, 5992, 5988, 5979, 5976, 5962, 5960, 5958]


### Recommendation 1M

In [None]:
usuario = 50
rfunc = euclidianaL

# 4 vecinos, 10 recomendaciones
lista = recommendationL(usuario, rfunc, 4, 10, 3.0, lst)
for i in lista:
    print('user:', i[2], 'obj:',i[0], 'rating:', i[1])

k-nn euclidianaL
user: 6039 obj: 3037 rating: 5
user: 6039 obj: 903 rating: 5
user: 6039 obj: 904 rating: 5
user: 6039 obj: 913 rating: 5
user: 6039 obj: 916 rating: 5
user: 6039 obj: 918 rating: 5
user: 6006 obj: 919 rating: 5
user: 6039 obj: 922 rating: 5
user: 6039 obj: 926 rating: 5
user: 6039 obj: 3088 rating: 5


## 10 million
ml-10m.zip contents 10M recommendations
https://grouplens.org/datasets/movielens/

### Load 10M Dataset

In [None]:
!wget 'https://files.grouplens.org/datasets/movielens/ml-10m.zip'

--2023-11-18 13:37:47--  https://files.grouplens.org/datasets/movielens/ml-10m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65566137 (63M) [application/zip]
Saving to: ‘ml-10m.zip’


2023-11-18 13:37:49 (39.6 MB/s) - ‘ml-10m.zip’ saved [65566137/65566137]



In [None]:
!unzip ml-10m.zip

Archive:  ml-10m.zip
   creating: ml-10M100K/
  inflating: ml-10M100K/allbut.pl    
  inflating: ml-10M100K/movies.dat   
  inflating: ml-10M100K/ratings.dat  
  inflating: ml-10M100K/README.html  
  inflating: ml-10M100K/split_ratings.sh  
  inflating: ml-10M100K/tags.dat     


In [None]:
lstdb = readLargeFile('ml-10M100K/ratings.dat', '::') #  10 min 47 seg, 69878

  data = pd.read_csv(filename, delimiter=delim, header=None)


In [None]:
len(lstdb)

69878

### K-NN 10M

In [None]:
usuario = 345
rfunc = euclidianaL

ldist, luser = knn_L(15, rfunc, usuario, lstdb) # 1 seg
print('close', ldist)
print('users', luser)

k-nn euclidianaL
close [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
users [69877, 69847, 69710, 69704, 69657, 69642, 69582, 69561, 69328, 69275, 68979, 68934, 68925, 68683, 68566]


### Recommendation 10M

In [None]:
usuario = 45600
rfunc = euclidianaL

# 10 vecinos, 20 recomendaciones
lista = recommendationL(usuario, rfunc, 10, 20, 3.0, lstdb) # 2 seg
for i in lista:
    print('user:', i[2], 'obj:',i[0], 'rating:', i[1])

k-nn euclidianaL
user: 69869 obj: 913.0 rating: 5.0
user: 69877 obj: 1199.0 rating: 5.0
user: 69877 obj: 1204.0 rating: 5.0
user: 69877 obj: 1676.0 rating: 5.0
user: 69877 obj: 2395.0 rating: 5.0
user: 69877 obj: 2712.0 rating: 5.0
user: 69877 obj: 2804.0 rating: 5.0
user: 69877 obj: 3307.0 rating: 5.0
user: 69877 obj: 3471.0 rating: 5.0
user: 69877 obj: 4022.0 rating: 5.0
user: 69877 obj: 6711.0 rating: 5.0
user: 69877 obj: 7361.0 rating: 5.0
user: 69876 obj: 587.0 rating: 5.0
user: 69876 obj: 590.0 rating: 5.0
user: 69855 obj: 832.0 rating: 5.0
user: 69876 obj: 858.0 rating: 5.0
user: 69869 obj: 923.0 rating: 5.0
user: 69876 obj: 1012.0 rating: 5.0
user: 69876 obj: 1036.0 rating: 5.0
user: 69876 obj: 1090.0 rating: 5.0


## 20 millions
ml-20m.zip contents 20M recommendations
https://grouplens.org/datasets/movielens/

### Load 20M Dataset

In [8]:
!wget 'https://files.grouplens.org/datasets/movielens/ml-20m.zip'

--2023-12-04 02:38:45--  https://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘ml-20m.zip’


2023-12-04 02:38:59 (14.5 MB/s) - ‘ml-20m.zip’ saved [198702078/198702078]



In [9]:
!unzip ml-20m.zip

Archive:  ml-20m.zip
   creating: ml-20m/
  inflating: ml-20m/genome-scores.csv  
  inflating: ml-20m/genome-tags.csv  
  inflating: ml-20m/links.csv        
  inflating: ml-20m/movies.csv       
  inflating: ml-20m/ratings.csv      
  inflating: ml-20m/README.txt       
  inflating: ml-20m/tags.csv         


In [None]:
import pandas as pd
lstdb20 = readLargeFile('ml-20m/ratings.csv') # 17:42 min

  data = pd.read_csv(filename, delimiter=delim, header=None)


In [None]:
len(lstdb20)

138495

### KNN 20M

In [None]:
usuario = 98765
rfunc = euclidianaL

ldist, luser = knn_L(15, rfunc, usuario, lstdb20) # 2 seg
print('close', ldist)
print('users', luser)

k-nn euclidianaL
close [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
users [138492, 138478, 138464, 138420, 138410, 138405, 138396, 138395, 138351, 138349, 138344, 138339, 138315, 138286, 138274]


### Recommendation 20M

In [None]:
usuario = 45600
rfunc = euclidianaL

# 10 vecinos, 20 recomendaciones
lista = recommendationL(usuario, rfunc, 10, 20, 3.0, lstdb20) # 3 seg
for i in lista:
    print('user:', i[2], 'obj:',i[0], 'rating:', i[1])

k-nn euclidianaL
user: 138466 obj: 233 rating: 5.0
user: 138466 obj: 246 rating: 5.0
user: 138466 obj: 265 rating: 5.0
user: 138252 obj: 1270 rating: 5.0
user: 138466 obj: 1542 rating: 5.0
user: 138466 obj: 1719 rating: 5.0
user: 138466 obj: 1922 rating: 5.0
user: 138466 obj: 1931 rating: 5.0
user: 138466 obj: 2324 rating: 5.0
user: 138466 obj: 2396 rating: 5.0
user: 138455 obj: 2571 rating: 5.0
user: 138466 obj: 2580 rating: 5.0
user: 138466 obj: 2599 rating: 5.0
user: 138466 obj: 2690 rating: 5.0
user: 138466 obj: 2706 rating: 5.0
user: 138466 obj: 2858 rating: 5.0
user: 138466 obj: 2997 rating: 5.0
user: 138466 obj: 3148 rating: 5.0
user: 138455 obj: 111 rating: 5.0
user: 138455 obj: 296 rating: 5.0
