# Goal: 
Combine ratings and tags to help data establish additional value

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import re
import string
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split 
from sklearn.utils import shuffle
from sklearn import tree
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn import metrics

In [2]:
## links = pd.read_csv('~/Documents/EECS/EECS_731/HW/EECS731_3/data/ml-latest-small/links.csv')
movies = pd.read_csv('~/Documents/EECS/EECS_731/HW/EECS731_3/data/ml-latest-small/movies.csv')
ratings = pd.read_csv('~/Documents/EECS/EECS_731/HW/EECS731_3/data/ml-latest-small/ratings.csv')
tags = pd.read_csv('~/Documents/EECS/EECS_731/HW/EECS731_3/data/ml-latest-small/tags.csv')

In [3]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [5]:
ratings = ratings.drop(columns='timestamp')

Here, I decided to take the average rating for each movie. 

In [6]:
aggregation_functions = {'userId': 'first', 'movieId': 'first', 'rating': 'mean'}
rating_avg = ratings.groupby(ratings['movieId']).aggregate(aggregation_functions)
rating_avg.index.name = None
rating_avg

Unnamed: 0,userId,movieId,rating
1,1,1,3.920930
2,6,2,3.431818
3,1,3,3.259615
4,6,4,2.357143
5,6,5,3.071429
...,...,...,...
193581,184,193581,4.000000
193583,184,193583,3.500000
193585,184,193585,3.500000
193587,184,193587,3.500000


In [7]:
tags = tags.drop(columns='timestamp')
tags = tags.drop(columns='userId')
tags

Unnamed: 0,movieId,tag
0,60756,funny
1,60756,Highly quotable
2,60756,will ferrell
3,89774,Boxing story
4,89774,MMA
...,...,...
3678,7382,for katie
3679,7936,austere
3680,3265,gun fu
3681,3265,heroic bloodshed


Combining the data sets of `movies`, `tags` and `links` is easy using the `merge` operation. `merge` merges two structures based on columns with the same name. If the movieId doesn't have a tag, I still want to know the user rating. 

In [8]:
data_1 = pd.merge(movies, rating_avg)
data = pd.merge(data_1, tags)
data

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,3.920930,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,3.920930,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,3.920930,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,6,3.431818,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,6,3.431818,magic board game
...,...,...,...,...,...,...
3657,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,3.900000,star wars
3658,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.500000,anime
3659,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.500000,comedy
3660,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.500000,gintama


The movieID is already a number, so we don't need to do anything with the title. I realized I do need to encode the genres and the tags somehow. 

So, next, I needed to split the genre into a list of words. There is nothing to be done with it as is. 

In [9]:
#d_genres = data['genres'].str.split('|', expand=True)
#d_genres

In [10]:
# label_encoder = LabelEncoder()
# integer_encoded = label_encoder.fit_transform(d_genres)

In [11]:
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation]) 
    return no_punct

In [12]:
data['genres'] = data['genres'].apply(lambda x: remove_punctuation(x)) 
data.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),AdventureAnimationChildrenComedyFantasy,1,3.92093,pixar
1,1,Toy Story (1995),AdventureAnimationChildrenComedyFantasy,1,3.92093,pixar
2,1,Toy Story (1995),AdventureAnimationChildrenComedyFantasy,1,3.92093,fun
3,2,Jumanji (1995),AdventureChildrenFantasy,6,3.431818,fantasy
4,2,Jumanji (1995),AdventureChildrenFantasy,6,3.431818,magic board game


In [13]:
data['genres'] = data['genres'].apply(lambda x: re.sub( r"([A-Z])", r" \1", x).split())

In [14]:
data['genres'].head(5)

0    [Adventure, Animation, Children, Comedy, Fantasy]
1    [Adventure, Animation, Children, Comedy, Fantasy]
2    [Adventure, Animation, Children, Comedy, Fantasy]
3                       [Adventure, Children, Fantasy]
4                       [Adventure, Children, Fantasy]
Name: genres, dtype: object

In [15]:
stemmer = PorterStemmer()
def word_stemmer(text):
    stem_text = " ".join([stemmer.stem(i) for i in text]) 
    return stem_text

In [16]:
data['genres'].apply(lambda x :word_stemmer(x))
data['genres'].head(5)

0    [Adventure, Animation, Children, Comedy, Fantasy]
1    [Adventure, Animation, Children, Comedy, Fantasy]
2    [Adventure, Animation, Children, Comedy, Fantasy]
3                       [Adventure, Children, Fantasy]
4                       [Adventure, Children, Fantasy]
Name: genres, dtype: object

In [17]:
data['genres'].value_counts()

[Drama]                                            354
[Comedy, Crime, Drama, Thriller]                   201
[Comedy]                                           162
[Drama, Romance]                                   129
[Action, Adventure, Sci, Fi]                       106
                                                  ... 
[Adventure, Children, Comedy, Fantasy, Sci, Fi]      1
[Adventure, Crime, Drama, Thriller]                  1
[Crime, Horror, Mystery, Thriller]                   1
[Mystery]                                            1
[Adventure, Children, Comedy, Musical]               1
Name: genres, Length: 370, dtype: int64

In [18]:
genres_one_hot = pd.get_dummies(data['genres'].apply(pd.Series).stack()).sum(level=0)
genres_one_hot

Unnamed: 0,A,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,Noir,Romance,Sci,Thriller,War,Western,X,genres,listed,no
0,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3657,0,1,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3658,0,1,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3659,0,1,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3660,0,1,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [19]:
genres_one_hot.columns

Index(['A', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Fi', 'Film', 'Horror', 'I', 'M',
       'Musical', 'Mystery', 'Noir', 'Romance', 'Sci', 'Thriller', 'War',
       'Western', 'X', 'genres', 'listed', 'no'],
      dtype='object')

In [20]:
genres_one_hot = genres_one_hot.drop(columns='A')
genres_one_hot = genres_one_hot.drop(columns='I')
genres_one_hot = genres_one_hot.drop(columns='M')
genres_one_hot = genres_one_hot.drop(columns='X')
genres_one_hot = genres_one_hot.drop(columns='Fi')
genres_one_hot = genres_one_hot.drop(columns='genres')
genres_one_hot = genres_one_hot.drop(columns='listed')

In [21]:
genres_one_hot = genres_one_hot.rename(columns={'Sci': 'SciFi', 'no': 'GenresNotListed'})

In [22]:
dataOHG= pd.concat([data, genres_one_hot], axis=1)
dataOHG = dataOHG.drop(columns='genres')
dataOHG

Unnamed: 0,movieId,title,userId,rating,tag,Action,Adventure,Animation,Children,Comedy,...,Horror,Musical,Mystery,Noir,Romance,SciFi,Thriller,War,Western,GenresNotListed
0,1,Toy Story (1995),1,3.920930,pixar,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,Toy Story (1995),1,3.920930,pixar,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,Toy Story (1995),1,3.920930,fun,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,2,Jumanji (1995),6,3.431818,fantasy,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,Jumanji (1995),6,3.431818,magic board game,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3657,187595,Solo: A Star Wars Story (2018),62,3.900000,star wars,1,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3658,193565,Gintama: The Movie (2010),184,3.500000,anime,1,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
3659,193565,Gintama: The Movie (2010),184,3.500000,comedy,1,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
3660,193565,Gintama: The Movie (2010),184,3.500000,gintama,1,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0


In [23]:
dataOHG['tag'].value_counts()

In Netflix queue     119
atmospheric           36
superhero             24
thought-provoking     24
surreal               23
                    ... 
Savannah               1
Harvey Keitel          1
Truman Capote          1
younger men            1
gintama                1
Name: tag, Length: 1584, dtype: int64

In [24]:
dataOHG.columns

Index(['movieId', 'title', 'userId', 'rating', 'tag', 'Action', 'Adventure',
       'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film', 'Horror', 'Musical', 'Mystery', 'Noir', 'Romance',
       'SciFi', 'Thriller', 'War', 'Western', 'GenresNotListed'],
      dtype='object')

In [25]:
X = np.array(dataOHG['movieId'],dataOHG['rating'])
X

array([1.00000e+00, 1.00000e+00, 1.00000e+00, ..., 1.93565e+05,
       1.93565e+05, 1.93565e+05])

In [26]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)

ValueError: Expected 2D array, got 1D array instead:
array=[1.00000e+00 1.00000e+00 1.00000e+00 ... 1.93565e+05 1.93565e+05
 1.93565e+05].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.