# Aula 07 - Filtragem Baseada em Conhecimento - Exemplos

## Importação dos dados (MovieLens 100k)

In [1]:
import pandas as pd
import numpy as np

In [2]:
import wget
!python3 -m wget https://files.grouplens.org/datasets/movielens/ml-100k.zip
!tar -xvzf ml-100k.zip


100% [......................................................] 4924029 / 4924029
Saved under ml-100k.zip
x ml-100k/
x ml-100k/allbut.pl
x ml-100k/mku.sh
x ml-100k/README
x ml-100k/u.data
x ml-100k/u.genre
x ml-100k/u.info
x ml-100k/u.item
x ml-100k/u.occupation
x ml-100k/u.user
x ml-100k/u1.base
x ml-100k/u1.test
x ml-100k/u2.base
x ml-100k/u2.test
x ml-100k/u3.base
x ml-100k/u3.test
x ml-100k/u4.base
x ml-100k/u4.test
x ml-100k/u5.base
x ml-100k/u5.test
x ml-100k/ua.base
x ml-100k/ua.test
x ml-100k/ub.base
x ml-100k/ub.test


In [2]:
#Types of genres
genre = pd.read_csv('./ml-100k/u.genre', sep="|", encoding='latin-1', header=None)
genre.drop(genre.columns[1], axis=1, inplace=True)
genre.columns = ['Genres']
genre_list = list(genre['Genres'])
genre_list

['unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [3]:
#Load the Movies data
item = pd.read_csv('./ml-100k/u.item', sep="|", encoding='latin-1', header=None)
item.columns = ['movieId', 'title' ,'release','video release date', 'IMDb URL', 'unknown', 'Action', 
                'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
                'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
item['release'] = pd.to_datetime(item['release'])
item = item[pd.notnull(item['release'])]
item['year'] = item['release'].dt.year.astype(int)
item.drop(columns=['release', 'video release date', 'IMDb URL'], inplace=True)
item.head()

Unnamed: 0,movieId,title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,1,Toy Story (1995),0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1995
1,2,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
3,4,Get Shorty (1995),0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,5,Copycat (1995),0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1995


In [4]:
df_meta = item.melt(id_vars=['movieId', 'title'], var_name='genre')
df_meta = df_meta[df_meta.value == 1]
df_meta.drop(columns=['value'], inplace=True)
df_meta[df_meta['movieId']==1]

Unnamed: 0,movieId,title,genre
5043,1,Toy Story (1995),Animation
6724,1,Toy Story (1995),Children's
8405,1,Toy Story (1995),Comedy


In [5]:
# Obter a lista de gêneros de um item
def get_genres(df, movieId):
    if movieId not in df['movieId'].values:
        return []
    return df.loc[(df.movieId==movieId),'genre'].tolist()

get_genres(df_meta, 3)

['Thriller']

# Filtragem Baseada em Casos

In [8]:
# Requisitos
ano_lancamento = 1998
generos_pref = ['Drama', 'War']
pesos = [0.5, 0.5]

# Similaridade de ano (próximo ao especificado pelo usuário)
min_ano = item['year'].min()
max_ano = item['year'].max()    
item['w_year'] = 1-abs(item['year']-ano_lancamento)/(max_ano-min_ano)

# Similaridade de gênero (Jaccard)
for i, row in item.iterrows():
    genres = get_genres(df_meta, item.loc[i, 'movieId'])
    item.loc[i,'w_genre'] = len(list(set(genres) & set(generos_pref))) / len(set(genres + generos_pref)) 

# Média das similaridades 
item['w'] = pesos[0]*item['w_year'] + pesos[1]*item['w_genre']
    
# Recomendação dos top 10
result = item.sort_values(by='w', ascending=False)
result.head(10)

Unnamed: 0,movieId,title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,year,w_year,w_genre,w
1662,1663,Nothing Personal (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1997,0.986842,1.0,0.993421
934,935,Paradise Road (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1997,0.986842,1.0,0.993421
689,690,Seven Years in Tibet (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1997,0.986842,1.0,0.993421
1175,1176,Welcome To Sarajevo (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1997,0.986842,1.0,0.993421
890,891,Bent (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1997,0.986842,1.0,0.993421
743,744,Michael Collins (1996),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1996,0.973684,1.0,0.986842
470,471,Courage Under Fire (1996),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1996,0.973684,1.0,0.986842
9,10,Richard III (1995),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1996,0.973684,1.0,0.986842
1422,1423,"Walking Dead, The (1995)",0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1995,0.960526,1.0,0.980263
317,318,Schindler's List (1993),0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1993,0.934211,1.0,0.967105
