In [2]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
import collections
from pprint import pprint

# Análise das tags

Utilizando a relação da frequência de cada uma das palavras, pude utilizar como entrada como a distancia entre cada uma das tags para um algoritmo de clusterização. o algoritmo utilizado foi o DBSCAN, principalmente por não precisar setar o numero de clusters. agora, temos uma repação de tags relacionadas, que serão utlizadas em meu modelo de recomendação.

## Primeiras análises dos clusters

Analisando superficialmente os grupos obtidos, podemos ver diversas razões para agrupamento: como nome do ator, tipo do filme, trilha sonora... com essa análise naao é possível demonstrar se a tag está recomendando ou apontando falhas do filme. 

Alguns dos grupos de tags que nao esperava achar: 

- [Bechdel Test:Pass](https://en.wikipedia.org/wiki/Bechdel_test)
- Better than the first
- true to the book


## Proximos passos

o rating vai ser um crivo final nessa etapa: explorarei filtrar tags com média muito baixa, ou, o que considero ideal, vincular de alguma forma a nota do usuario a sua tag.

Essa será a estratégia para a fase final do projeto.

In [51]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel

def tag_tokenize(tag):
    tokens = nltk.word_tokenize(tag)
    stemmer = SnowballStemmer('english')
    sw = stopwords.words('english')
    stems = [stemmer.stem(t) for t in tokens if t not in sw and len(t) > 2]
    return stems
    

In [161]:
def cluster_tags(texts):
    text_frequencies = TfidfVectorizer(tokenizer=tag_tokenize,
                                       lowercase=True,
                                       ngram_range=(1, 2), min_df=0).fit_transform(texts)
    
    clustering_model = DBSCAN(eps=0.001, min_samples=5, metric='cosine').fit(text_frequencies)
    substitutive_class = {}
    labels = {-1: 'indeterminado'}
 
    for idx, label in enumerate(clustering_model.labels_):
        substitutive_class[texts[idx]] = label
 
    return substitutive_class

In [18]:
tags = pd.read_csv("ml-latest-small/tags.csv")
tags = tags[tags.tag.isna() == False]

(3683, 4)

{'"artsy"': -1,
 '06 Oscar Nominated Best Movie - Animation': -1,
 '1900s': -1,
 '1920s': -1,
 '1950s': -1,
 '1960s': -1,
 '1970s': -1,
 '1980s': -1,
 '1990s': -1,
 '2001-like': -1,
 '2D animation': 23,
 '70mm': -1,
 "80's": -1,
 'AIDs': -1,
 'AS Byatt': -1,
 'AWESOME': -1,
 'Aardman': 148,
 'Academy award (Best Supporting Actress)': -1,
 'Action': 55,
 'Adam Sandler': 36,
 'Adrien Brody': -1,
 'Adventure': 32,
 'Afghanistan': -1,
 'Africa': 49,
 'Agatha Christie': -1,
 'Al Pacino': 4,
 'Alcatraz': -1,
 'Alfred Hitchcock': -1,
 'Alicia Vikander': -1,
 'Amazing Cinematography': -1,
 'American Indians': -1,
 'American propaganda': -1,
 'Amish': -1,
 'Amtrak': -1,
 'Amy Adams': -1,
 'Andrew Lloyd Weber': -1,
 'Andy Garcia': -1,
 'Andy Kaufman': -1,
 'Andy Samberg': -1,
 'Angelina Jolie': -1,
 'Animal movie': 120,
 'Animation': 23,
 'Anne Boleyn': -1,
 'Anne Hathaway': -1,
 'Anthony Hopkins': -1,
 'Arnold Schwarzenegger': -1,
 'Arthur C. Clarke': -1,
 'Arthur Miller': -1,
 'Astaire and Rog

In [177]:
clusters = cluster_tags(tags.tag)
tag_rework = tags.replace({'tag': clusters}).groupby(['movieId', 'tag']).size().reset_index()
xxx = tag_rework.pivot(index='movieId', columns='tag')
xxx

Unnamed: 0_level_0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
tag,-1,0,1,2,3,4,5,6,7,8,...,177,178,179,180,181,182,183,184,185,186
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,,,,,,,...,,1.0,,,,,,,,
2,3.0,,,,,,,,,,...,,,,,,,,,,
3,2.0,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,
16,,,,,,,,1.0,,,...,,,,,,,,,,
17,1.0,,,,,,,,,,...,,,,,,,,,,
21,1.0,,,,,,,,,,...,,,,,,,,,,


In [169]:
movies = pd.read_csv("ml-latest-small/movies.csv")

In [170]:
newmovies = movies.copy()
newmovies.genres = newmovies.genres.apply(lambda x: x.split('|'))
newmovies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [172]:
lol = newmovies.genres.apply(pd.Series) \
    .merge(newmovies, right_index = True, left_index = True) \
    .drop(["genres"], axis = 1) \
    .melt(id_vars = ['movieId', 'title'], value_name = "genre") \
    .drop("variable", axis = 1) \
    .dropna()
lol['n']=1

In [178]:
wtf = lol.pivot(index='movieId', columns='genre', values='n')
pd.concat([xxx, wtf], axis=1).fillna(0)

Unnamed: 0_level_0,"(0, -1)","(0, 0)","(0, 1)","(0, 2)","(0, 3)","(0, 4)","(0, 5)","(0, 6)","(0, 7)","(0, 8)",...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
