# Goal: 
Combine ratings and tags to help data establish additional value

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
# nltk.download('stopwords')
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import re
import string
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split 
from sklearn.utils import shuffle
from sklearn import tree
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn import metrics
from sklearn.cluster import KMeans

In [2]:
## links = pd.read_csv('~/Documents/EECS/EECS_731/HW/EECS731_3/data/ml-latest-small/links.csv')
movies = pd.read_csv('~/Documents/EECS/EECS_731/HW/EECS731_3/data/ml-latest-small/movies.csv')
ratings = pd.read_csv('~/Documents/EECS/EECS_731/HW/EECS731_3/data/ml-latest-small/ratings.csv')
tags = pd.read_csv('~/Documents/EECS/EECS_731/HW/EECS731_3/data/ml-latest-small/tags.csv')

In [3]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [4]:
ratings = ratings.drop(columns='timestamp')
ratings = ratings.drop(columns='userId')
ratings

Unnamed: 0,movieId,rating
0,1,4.0
1,3,4.0
2,6,4.0
3,47,5.0
4,50,5.0
...,...,...
100831,166534,4.0
100832,168248,5.0
100833,168250,5.0
100834,168252,5.0


Here, I decided to take the average rating for each movie. I combined the columns that have the same movieId to get an average rating for the movie. 

In [5]:
aggregation_functions = {'movieId': 'first', 'rating': 'mean'}
rating_avg = ratings.groupby(ratings['movieId']).aggregate(aggregation_functions)
rating_avg.index.name = None
rating_avg

Unnamed: 0,movieId,rating
1,1,3.920930
2,2,3.431818
3,3,3.259615
4,4,2.357143
5,5,3.071429
...,...,...
193581,193581,4.000000
193583,193583,3.500000
193585,193585,3.500000
193587,193587,3.500000


In [6]:
tags = tags.drop(columns='timestamp')
tags = tags.drop(columns='userId')

In [7]:
tags.sort_values(['movieId'])

Unnamed: 0,movieId,tag
2886,1,fun
981,1,pixar
629,1,pixar
35,2,Robin Williams
34,2,magic board game
...,...,...
402,187595,star wars
528,193565,comedy
527,193565,anime
530,193565,remaster


I saw that some movies have multiple tags. We need to combine those tags in a single line. 

In [8]:
tags = tags.groupby(['movieId'])['tag'].apply(lambda x: ','.join(x)).reset_index()

Merge the data for rating and tags. 

In [9]:
data_rt = pd.merge(rating_avg, tags)
data_rt = pd.merge(movies, data_rt)
data_rt

Unnamed: 0,movieId,title,genres,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,"pixar,pixar,fun"
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,"fantasy,magic board game,Robin Williams,game"
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,"moldy,old"
3,5,Father of the Bride Part II (1995),Comedy,3.071429,"pregnancy,remake"
4,7,Sabrina (1995),Comedy|Romance,3.185185,remake
...,...,...,...,...,...
1549,183611,Game Night (2018),Action|Comedy|Crime|Horror,4.000000,"Comedy,funny,Rachel McAdams"
1550,184471,Tomb Raider (2018),Action|Adventure|Fantasy,2.500000,"adventure,Alicia Vikander,video game adaptation"
1551,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,3.875000,"Josh Brolin,Ryan Reynolds,sarcasm"
1552,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,3.900000,"Emilia Clarke,star wars"


### Assign meaning to the text data 

The before we can create a cluster, I needed to assign meaning to the `tag` category. I can transorm the text into a vector of numbers. 

I learned how to do that here: https://towardsdatascience.com/k-means-clustering-8e1e64c1561c

In [10]:
data_rt['tag'] = data_rt['tag'].apply(lambda x: x.lower())
#data_rt['tag'] = data_rt['tag'].apply(lambda x: re.split( '\s+', x))
data_rt['tag'] = data_rt['tag'].apply(lambda x: re.split( '\s*,\s*', x))
data_rt

Unnamed: 0,movieId,title,genres,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,"[pixar, pixar, fun]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,"[fantasy, magic board game, robin williams, game]"
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,"[moldy, old]"
3,5,Father of the Bride Part II (1995),Comedy,3.071429,"[pregnancy, remake]"
4,7,Sabrina (1995),Comedy|Romance,3.185185,[remake]
...,...,...,...,...,...
1549,183611,Game Night (2018),Action|Comedy|Crime|Horror,4.000000,"[comedy, funny, rachel mcadams]"
1550,184471,Tomb Raider (2018),Action|Adventure|Fantasy,2.500000,"[adventure, alicia vikander, video game adapta..."
1551,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,3.875000,"[josh brolin, ryan reynolds, sarcasm]"
1552,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,3.900000,"[emilia clarke, star wars]"


In [11]:
data_rt['tag'].value_counts()

[in netflix queue]                                                                                                               98
[disney]                                                                                                                         15
[shakespeare]                                                                                                                    10
[aliens]                                                                                                                         10
[christmas]                                                                                                                       9
                                                                                                                                 ..
[irreverent]                                                                                                                      1
[genocide]                                                                  

In [12]:
tags_one_hot = pd.get_dummies(data_rt['tag'].apply(pd.Series).stack()).sum(level=0)

In [13]:
data_rt

Unnamed: 0,movieId,title,genres,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,"[pixar, pixar, fun]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,"[fantasy, magic board game, robin williams, game]"
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,"[moldy, old]"
3,5,Father of the Bride Part II (1995),Comedy,3.071429,"[pregnancy, remake]"
4,7,Sabrina (1995),Comedy|Romance,3.185185,[remake]
...,...,...,...,...,...
1549,183611,Game Night (2018),Action|Comedy|Crime|Horror,4.000000,"[comedy, funny, rachel mcadams]"
1550,184471,Tomb Raider (2018),Action|Adventure|Fantasy,2.500000,"[adventure, alicia vikander, video game adapta..."
1551,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,3.875000,"[josh brolin, ryan reynolds, sarcasm]"
1552,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,3.900000,"[emilia clarke, star wars]"


In [14]:
data_rt_OH = pd.merge(data_rt, tags_one_hot, left_index=True, right_index=True)

In [15]:
data_rt

Unnamed: 0,movieId,title,genres,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,"[pixar, pixar, fun]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,"[fantasy, magic board game, robin williams, game]"
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,"[moldy, old]"
3,5,Father of the Bride Part II (1995),Comedy,3.071429,"[pregnancy, remake]"
4,7,Sabrina (1995),Comedy|Romance,3.185185,[remake]
...,...,...,...,...,...
1549,183611,Game Night (2018),Action|Comedy|Crime|Horror,4.000000,"[comedy, funny, rachel mcadams]"
1550,184471,Tomb Raider (2018),Action|Adventure|Fantasy,2.500000,"[adventure, alicia vikander, video game adapta..."
1551,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,3.875000,"[josh brolin, ryan reynolds, sarcasm]"
1552,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,3.900000,"[emilia clarke, star wars]"


In [16]:
tags_one_hot['pixar']

0       2
1       0
2       0
3       0
4       0
       ..
1549    0
1550    0
1551    0
1552    0
1553    0
Name: pixar, Length: 1554, dtype: uint8

In [17]:
from nltk import NaiveBayesClassifier
from textblob import TextBlob
data_rt['tag'].apply(lambda x:(' '.join(x)))

0                                       pixar pixar fun
1          fantasy magic board game robin williams game
2                                             moldy old
3                                      pregnancy remake
4                                                remake
                             ...                       
1549                        comedy funny rachel mcadams
1550    adventure alicia vikander video game adaptation
1551                  josh brolin ryan reynolds sarcasm
1552                            emilia clarke star wars
1553                      anime comedy gintama remaster
Name: tag, Length: 1554, dtype: object

In [18]:
data_rt['tag'].apply(lambda x:(' '.join(x)))

0                                       pixar pixar fun
1          fantasy magic board game robin williams game
2                                             moldy old
3                                      pregnancy remake
4                                                remake
                             ...                       
1549                        comedy funny rachel mcadams
1550    adventure alicia vikander video game adaptation
1551                  josh brolin ryan reynolds sarcasm
1552                            emilia clarke star wars
1553                      anime comedy gintama remaster
Name: tag, Length: 1554, dtype: object

In [19]:
stemmer = PorterStemmer()

def word_stemmer(text): 
        stem_text = " ".join([stemmer.stem(i) for i in text])
        return stem_text

In [20]:
data_rt['tag'].apply(lambda x : word_stemmer(x))
data_rt['tag'].head(5)

0                                  [pixar, pixar, fun]
1    [fantasy, magic board game, robin williams, game]
2                                         [moldy, old]
3                                  [pregnancy, remake]
4                                             [remake]
Name: tag, dtype: object

In [22]:
data_rt.to_csv('tag_trimmed_2.csv')

### Findings 


For some reason, this line: 

`data_rt[['polarity', 'subjectivity']] = data_rt['tag'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))` 

doesn't work unless I export to a new file and then import in another notebook. So, I did the actual clustering work in the notebook titled `clustering`