<a href="https://colab.research.google.com/github/caiom26/AnaliseDeDados/blob/main/REC_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357233 sha256=203577aa1b285c1c0191750223a29bf87167330cce715221f5b20e1ac61040e5
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [None]:
import os
import numpy as np
import pandas as pd
from google.colab import files

In [None]:
# Load the movie catalogue and the ratings table from the Colab working directory.
df_movies = pd.read_parquet(path='/content/movies.parquet')
df_ratings = pd.read_parquet(path='/content/ratings.parquet')

In [None]:
# Index the movie catalogue by item_id so rows can be looked up by movie ID.
# The guard keeps the cell idempotent: once 'item_id' has been moved into the
# index, setting it again would raise a KeyError on re-run.
if df_movies.index.name != 'item_id':
  df_movies = df_movies.set_index('item_id')


In [None]:
# Preview the first five movies.
df_movies.head(5)


Unnamed: 0_level_0,title,genres
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


In [None]:
# Fraction of the ratings used for training; the remainder is the test split.
train_size = 0.75

In [None]:
# Order ratings from oldest to newest (ascending is the sort_values default).
df_ratings = df_ratings.sort_values('timestamp')

In [None]:
# Preview the five oldest ratings.
df_ratings.head(5)

Unnamed: 0,userID,itemID,rating,timestamp
1000138,6040,858,4,956703932
1000153,6040,2384,4,956703954
999873,6040,593,5,956703954
1000007,6040,1961,4,956703977
1000192,6040,2019,5,956703977


In [None]:
# Normalise the id column names to the 'userID' / 'itemID' spelling used below.
# rename() ignores keys that are not present, so this is a no-op when the
# columns already carry the target names.
column_aliases = {'userid': 'userID', 'itemid': 'itemID'}
df_ratings = df_ratings.rename(columns=column_aliases)

In [None]:
# Hold-out split by row position: the first `train_size` fraction of the rows
# goes to training and the rest to test (df_ratings is sorted by timestamp in
# an earlier cell, so this is a chronological split).
# Positional slicing replaces np.split(DataFrame, ...), which goes through a
# pandas code path that newer pandas versions deprecate; .copy() makes both
# halves independent frames so adding columns later does not write to a slice.
split_at = int(train_size * len(df_ratings))
df_train = df_ratings.iloc[:split_at].copy()
df_test = df_ratings.iloc[split_at:].copy()

In [None]:
# Report (rows, columns) for the training split, then the test split.
for split in (df_train, df_test):
  print(split.shape)

(750156, 4)
(250053, 4)


In [None]:
from surprise import Reader,Dataset

In [None]:
def convert_train_valid_sets(df_train:pd.DataFrame,df_test:pd.DataFrame,rating_scale=(1,5)):
  """Convert rating DataFrames into Surprise train/test structures.

  Args:
    df_train: Training ratings; must contain 'userID', 'itemID' and 'rating'.
    df_test: Test ratings with the same three columns.
    rating_scale: (min, max) rating bounds handed to the Surprise Reader.
      Defaults to (1, 5), the scale previously hard-coded here.

  Returns:
    A (train_set, test_set) tuple: train_set is the result of
    build_full_trainset() on the training ratings, and test_set is the
    result of build_testset() on the test ratings.

  Raises:
    KeyError: if either frame lacks one of the required columns.
  """
  reader = Reader(rating_scale=rating_scale)
  # Column order matters: load_from_df is given user, item, rating in that order.
  rating_columns = ['userID', 'itemID', 'rating']

  train_set = (
      Dataset
      .load_from_df(df_train[rating_columns], reader)
      .build_full_trainset()
  )
  # The test ratings are first wrapped in a trainset so that build_testset()
  # can turn them back into the structure expected for evaluation.
  test_set = (
      Dataset
      .load_from_df(df_test[rating_columns], reader)
      .build_full_trainset()
      .build_testset()
  )
  return train_set, test_set

In [None]:
# Build the Surprise trainset / testset pair from the two rating splits.
train_set, test_set = convert_train_valid_sets(df_train=df_train, df_test=df_test)

In [None]:
# Display the trainset object returned by convert_train_valid_sets (repr only).
train_set

<surprise.trainset.Trainset at 0x799481584b80>

In [None]:
from surprise import KNNWithMeans

In [None]:
# Similarity configuration for the KNN model: Pearson-baseline similarity
# computed between users (user_based=True).
sim_options = dict(
    name='pearson_baseline',
    user_based=True,
)

In [None]:
# KNN-with-means recommender using up to 40 neighbors and the similarity
# settings above; verbose=True makes fit() print its progress.
model = KNNWithMeans(
    k=40,
    sim_options=sim_options,
    verbose=True,
)


In [None]:
# Display the (not yet fitted) model object (repr only).
model

<surprise.prediction_algorithms.knns.KNNWithMeans at 0x79947f1fb6d0>

In [None]:
%%time
# Fit the KNN model on the training set; %%time reports the cost of the cell
# (about 48 s wall time in the recorded run).
model.fit(train_set)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
CPU times: user 46 s, sys: 2.16 s, total: 48.1 s
Wall time: 48.1 s


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x79947f0a98d0>

In [None]:
# Draw one user and one item at random from the test split (independent draws,
# so the pair is not necessarily a rating that exists in df_test).
# Fixed seeds keep the example reproducible across kernel restarts.
user_id = df_test['userID'].sample(n=1, random_state=42).item()
item_id = df_test['itemID'].sample(n=1, random_state=43).item()

In [None]:
# Display the user id sampled from the test split.
user_id

1285

In [None]:
# Predict the rating the sampled user would give the sampled item.
model.predict(user_id, item_id)

Prediction(uid=1285, iid=913, r_ui=None, est=4.689505934936586, details={'actual_k': 40, 'was_impossible': False})

In [None]:
# Predicted rating for every (user, item) pair in the test split.
# Iterating the two id columns directly avoids building a Series per row
# (as apply(axis=1) does) and keeps the ids in their original integer dtype.
df_test['pred'] = [
    model.predict(uid=uid, iid=iid).est
    for uid, iid in zip(df_test['userID'], df_test['itemID'])
]

In [None]:
def recommend_n_items(model, user_id, item_ids:np.array, n=20):
  """Return the n items with the highest predicted score for a user.

  Args:
    model: Object exposing predict(uid=..., iid=...) whose result has an
      `est` attribute holding the predicted score.
    user_id: Raw id of the user to recommend for.
    item_ids: Iterable of candidate raw item ids.
    n: Maximum number of recommendations to return.

  Returns:
    DataFrame indexed by 'item_id' with a single 'score' column, sorted by
    score in descending order and truncated to n rows. Ties keep the order
    in which the items appear in item_ids.
  """
  # Collect all predictions first and build the frame once: appending rows
  # one at a time is quadratic and coerces the item ids to float.
  records = [
      (item_id, model.predict(uid=user_id, iid=item_id).est)
      for item_id in item_ids
  ]
  df_predictions = pd.DataFrame(records, columns=['item_id', 'score'])

  user_predictions = (
      df_predictions
      .sort_values(by='score', ascending=False, kind='stable')
      .head(n)
      .set_index('item_id')
  )
  return user_predictions


In [None]:
# Top-5 recommendations for one user, scoring every movie in the catalogue.
user_id = 1879
recommendable_items = df_movies.index.to_numpy()
recommend_n_items(model, user_id, recommendable_items, n=5)

Unnamed: 0_level_0,score
item_id,Unnamed: 1_level_1
864.0,5.0
2332.0,5.0
687.0,5.0
318.0,5.0
130.0,5.0


In [None]:
# Movie whose neighbors are inspected below.
# Other ids to try: 1 (Toy Story), 1356 (Star Trek: First Contact), 3578 (Gladiator).
item_id = 260   # Star Wars: Episode IV - A New Hope

def get_item_k_neighbors(model, item_id, k=10):
  """Return the raw ids of the k nearest neighbors of an item.

  Args:
    model: Fitted model exposing `trainset`, `get_neighbors` and
      `sim_options`. It must be item-based (sim_options['user_based'] is
      False) so that its neighbors are items.
    item_id: Raw id of the item whose neighbors are requested.
    k: Number of neighbors to return.

  Returns:
    List of raw item ids of the k nearest neighbors.

  Raises:
    ValueError: if the model is user-based, or if an id lookup on the
      trainset fails.
  """
  # With a user-based model the similarity matrix is indexed by users, so
  # querying it with an item inner id returns user inner ids. Converting those
  # back with to_raw_iid either fails ("not a valid inner id") or silently
  # yields unrelated items. Fail early with an explicit message instead.
  sim_options = getattr(model, 'sim_options', None) or {}
  if sim_options.get('user_based', True):
    raise ValueError(
        'get_item_k_neighbors requires an item-based model '
        "(sim_options['user_based'] = False); this model computes "
        'user-user similarities, so its neighbors are users, not items.'
    )

  inner_iid = model.trainset.to_inner_iid(item_id)
  neighbor_iids = model.get_neighbors(inner_iid, k)
  return [model.trainset.to_raw_iid(neighbor) for neighbor in neighbor_iids]

# Look up the selected movie's title, announce it, then list its k nearest
# neighbors as rows of the movie catalogue.
k = 10
title = df_movies.loc[item_id, 'title']
print(f'{k} vizinhos mais próximos de "{title}" (ID = {item_id})')
item_ids = get_item_k_neighbors(model, item_id, k)
is_neighbor = df_movies.index.isin(item_ids)
df_movies.loc[is_neighbor]

10 vizinhos mais próximos de "Star Wars: Episode IV - A New Hope (1977)" (ID = 260)


ValueError: 3947 is not a valid inner id.

In [None]:
# NOTE(review): in the recorded run this install fails — pip reports that
# funk-svd requires Python '<3.10,>=3.6.5' while the runtime is 3.10.12.
# None of the visible cells import funk-svd; confirm whether it is still needed.
!pip install git+https://github.com/gbolmier/funk-svd

Collecting git+https://github.com/gbolmier/funk-svd
  Cloning https://github.com/gbolmier/funk-svd to /tmp/pip-req-build-p39ryrbc
  Running command git clone --filter=blob:none --quiet https://github.com/gbolmier/funk-svd /tmp/pip-req-build-p39ryrbc
  Resolved https://github.com/gbolmier/funk-svd to commit fecc38ea1c2859ef6a6d9af0b7f953e1b693764e
  Preparing metadata (setup.py) ... [?25l[?25hdone
INFO: pip is looking at multiple versions of funk-svd to determine which version is compatible with other requirements. This could take a while.
[31mERROR: Package 'funk-svd' requires a different Python: 3.10.12 not in '<3.10,>=3.6.5'[0m[31m
[0m

In [None]:
# Switch the inspected movie to id 2 (Jumanji (1995) in the catalogue preview).
item_id = 2

In [None]:
# Six nearest neighbors of movie id 2.
get_item_k_neighbors(model=model, item_id=2, k=6)

ValueError: 3973 is not a valid inner id.

In [None]:
# Announce the currently selected movie and list its 5 nearest neighbors as
# rows of the movie catalogue.
# Uses df_movies (indexed by item_id earlier in the notebook): the name
# `df_items` is never defined in this notebook and only worked through
# leftover kernel state, so it fails on a fresh Restart & Run All.
k = 5
title = df_movies.loc[item_id, 'title']
print (f'{k} vizinhos mais próximos de "{title}" (ID = {item_id})')
item_ids = get_item_k_neighbors(model, item_id, k)
df_movies[df_movies.index.isin(item_ids)]

5 vizinhos mais próximos de "Jumanji (1995)" (ID = 2)


ValueError: 3973 is not a valid inner id.