## SSAC - Exploration 8 - ALS recommendation : Movie

### 1) load and preprocess data

In [1]:
import pandas as pd
import numpy as np
import os, copy
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

In [2]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# remove timstap column
ratings = ratings.iloc[:,0:3]
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [4]:
# remove rows with less than 3 rating scores
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [5]:
# change rating column name to count
ratings.rename(columns={'rating':'count'}, inplace=True)

In [6]:
# load meta data containing movie title and movie id data
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
sum(movies['title'].isna())

0

In [8]:
# create column of movie title with all lower character 
movies['title_lower'] = movies['title'].str.lower()

In [10]:
movies.head(5)

Unnamed: 0,movie_id,title,genre,title_lower
0,1,Toy Story (1995),Animation|Children's|Comedy,toy story (1995)
1,2,Jumanji (1995),Adventure|Children's|Fantasy,jumanji (1995)
2,3,Grumpier Old Men (1995),Comedy|Romance,grumpier old men (1995)
3,4,Waiting to Exhale (1995),Comedy|Drama,waiting to exhale (1995)
4,5,Father of the Bride Part II (1995),Comedy,father of the bride part ii (1995)


### 2) Analyze data 
- number of unique movies in dataset
- number of unique users in dataset
- top 30 popualar movies

In [11]:
# number of unique movies in dataset
ratings['movie_id'].nunique()

3628

In [12]:
# number of unique users in dataset
ratings['user_id'].nunique()

6039

In [13]:
# top 30 popular movies
# assuming having them scored or rated means users had watched it ; gains popularity
print(ratings.groupby(['movie_id']).count().sort_values(['count'], ascending=False)['count'].head(30))
# gather movie id data sorted by popularity
pop_movie_id = ratings.groupby('movie_id').count().sort_values(['count'], ascending=False).head(30).index

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: count, dtype: int64


In [14]:
# fetch movie title data from meta dataset with gathered movie id list
movies['title'][movies['movie_id'].isin(pop_movie_id.tolist())]

0                                        Toy Story (1995)
108                                     Braveheart (1995)
257             Star Wars: Episode IV - A New Hope (1977)
293                                   Pulp Fiction (1994)
315                      Shawshank Redemption, The (1994)
352                                   Forrest Gump (1994)
453                                  Fugitive, The (1993)
476                                  Jurassic Park (1993)
523                               Schindler's List (1993)
585                     Terminator 2: Judgment Day (1991)
589                      Silence of the Lambs, The (1991)
604                                          Fargo (1996)
847                                 Godfather, The (1972)
1081                    E.T. the Extra-Terrestrial (1982)
1178    Star Wars: Episode V - The Empire Strikes Back...
1179                           Princess Bride, The (1987)
1180                       Raiders of the Lost Ark (1981)
1192    Star W

### 3) add rows of my favorite 5 movies to ratings dataset

In [16]:
def find_movie(string):
    # wanted_list = []
    # id_list = []
    for i in range(len(movies['title_lower'])):
        if movies['title_lower'][i].find(string) != -1:
            # wanted_list.append(movies['title'][i])
            # id_list.append(movies['movie_id'][i])
            print(f"movie: {movies['title'][i]}, id: {movies['movie_id'][i]}")
    # return wanted_list, id_list

In [17]:
find_movie('(2001)') # movies released in 2001 are not included in ratings dataset

In [18]:
find_movie('harry') # 1307 when harry met sally

movie: When Harry Met Sally... (1989), id: 1307
movie: Deconstructing Harry (1997), id: 1701
movie: Trouble with Harry, The (1955), id: 2184
movie: Who's Harry Crumb? (1989), id: 3387
movie: Harry and the Hendersons (1987), id: 3388
movie: Let's Get Harry (1986), id: 3389


In [19]:
find_movie('sunrise') # 215 before sunrise

movie: Before Sunrise (1995), id: 215
movie: Tequila Sunrise (1988), id: 2802


In [20]:
find_movie('groundhog') # 1265 groundhog day

movie: Groundhog Day (1993), id: 1265


In [21]:
find_movie('matrix') # 2571 matrix

movie: Matrix, The (1999), id: 2571


In [22]:
find_movie('mulan') # 1907 mulan

movie: Mulan (1998), id: 1907


In [24]:
my_movie = [1307, 215, 1265, 2571, 1907] # when harry met sally, before sunrise, groundhog day, matrix, mulan

In [25]:
ratings['user_id'].max() # use max method to find the end of user_id as it is integer type.

6040

In [26]:
ratings.tail() # check id I got using max method really is at the end of dataset

Unnamed: 0,user_id,movie_id,count
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4


In [28]:
# make dataframe out of my favorite movies giving 5 count scores
myfav = {'user_id': [6041]*5, 'movie_id': my_movie, 'count':[5]*5}
myfav = pd.DataFrame(myfav)
myfav

Unnamed: 0,user_id,movie_id,count
0,6041,1307,5
1,6041,215,5
2,6041,1265,5
3,6041,2571,5
4,6041,1907,5


In [29]:
# add my favorites to dataset
data = copy.deepcopy(ratings) # copy ratings dataset to object data
if not ratings.isin({'user_id':['6041']})['user_id'].any():  
    data = data.append(myfav) # add my favorites to dataset                          
data.tail(7)

Unnamed: 0,user_id,movie_id,count
1000207,6040,1096,4
1000208,6040,1097,4
0,6041,1307,5
1,6041,215,5
2,6041,1265,5
3,6041,2571,5
4,6041,1907,5


In [30]:
data['user_id'].nunique()

6040

In [31]:
data['user_id'].max() # max method returns 6041 as my choice is added under id 6041

6041

In [32]:
data['movie_id'].nunique()

3628

In [33]:
data['movie_id'].max() # max and nunique return different numbers.

3952

### 4) create CSR matrix

In [34]:
csr_data = csr_matrix((data['count'], (data.user_id, data.movie_id)))
csr_data 

<6042x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

### 5) build and train als_model = AlternatingLeastSquares

In [35]:
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [39]:
als_model = AlternatingLeastSquares(factors=128, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [40]:
# als model takes in item x user shaped matrix as input; as csr_data is created as user x item it needs to be transposed
csr_data_transpose = csr_data.T
csr_data_transpose

<3953x6042 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [41]:
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

### 6) get favor scores of one of my favorites and one of the others from traind als model

In [42]:
# get my user vector from trained model
als_model.user_factors[6041]

array([-0.03671481, -0.21935   ,  0.12754501,  0.9945725 ,  0.13789749,
        0.09895207,  0.5804605 , -0.30127826,  0.035249  , -0.49202418,
        0.53168803,  0.22564851, -0.14183515, -0.0167791 , -0.3888874 ,
        0.05825231,  0.05250799, -0.58470434,  0.98937047, -0.77979565,
       -0.3014502 , -0.03688507,  0.08277088, -0.3173057 , -0.07260982,
        0.67299825,  0.33262837,  0.9251099 ,  0.562729  , -0.18661322,
       -0.23814096, -0.04420493,  0.8505713 , -0.56173795, -0.17071465,
       -0.46091908,  0.91928166, -0.01246936, -0.20095077, -0.1994051 ,
        0.24364899, -0.5307997 , -0.11383876,  0.34062433, -0.71368897,
       -0.3146015 ,  0.16159455, -0.44594288, -0.11182477,  0.4251395 ,
        0.33421108,  0.7347167 ,  0.26137942, -0.23547493,  0.39491645,
        0.09500978,  0.28481376,  0.1431498 ,  0.61266565, -0.07440323,
       -0.06925932,  0.21025914,  0.06153481, -0.09653218, -0.17510201,
       -0.36498344, -0.03465715, -0.1238426 , -0.45321837, -0.25

In [43]:
movies[movies['movie_id'].isin(myfav['movie_id'])]

Unnamed: 0,movie_id,title,genre,title_lower
213,215,Before Sunrise (1995),Drama|Romance,before sunrise (1995)
1245,1265,Groundhog Day (1993),Comedy|Romance,groundhog day (1993)
1287,1307,When Harry Met Sally... (1989),Comedy|Romance,when harry met sally... (1989)
1838,1907,Mulan (1998),Animation|Children's,mulan (1998)
2502,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,"matrix, the (1999)"


In [44]:
# pick random movie
random_movie = np.random.choice(movies['movie_id'])
movies[movies['movie_id']==random_movie]

Unnamed: 0,movie_id,title,genre,title_lower
2651,2720,Inspector Gadget (1999),Action|Adventure|Children's|Comedy,inspector gadget (1999)


In [56]:
# get favor score of inspector gadget and before sunrise
r_mov_vec, fav_mov_vec =  als_model.item_factors[2720], als_model.item_factors[215]
my_vec = als_model.user_factors[6041]
print(f' inspector gadget favor score : {np.dot(my_vec, r_mov_vec)} , before sunrise favor score : {np.dot(my_vec, fav_mov_vec)}') 

 inspector gadget favor score : 0.010590699501335621 , before sunrise favor score : 0.17019209265708923


### 7) get similar movies recommendation from trained model

In [51]:
# movies similar to before sunrise
similar_movie = als_model.similar_items(215, N=10)
for x in range(len(similar_movie)):
    print(movies['title'][movies['movie_id'].isin(similar_movie[x])])

0           Toy Story (1995)
213    Before Sunrise (1995)
Name: title, dtype: object
754    Heavy (1995)
Name: title, dtype: object
71    Kicking and Screaming (1995)
Name: title, dtype: object
413    Barcelona (1994)
Name: title, dtype: object
1344    Ridicule (1996)
Name: title, dtype: object
2816    Guinevere (1999)
Name: title, dtype: object
835    Flirt (1995)
Name: title, dtype: object
147    Amateur (1994)
Name: title, dtype: object
1428    Hotel de Love (1996)
Name: title, dtype: object
1853    Whatever (1998)
Name: title, dtype: object


### 8) movies recommendations based on my taste

In [52]:
movies_recommended = als_model.recommend(6041, csr_data, N=20, filter_already_liked_items=True)
movies_recommended

[(589, 0.38454714),
 (1270, 0.34660178),
 (457, 0.28571454),
 (1, 0.26723158),
 (1240, 0.2566485),
 (1197, 0.25634918),
 (2918, 0.25597346),
 (2916, 0.24381313),
 (1653, 0.23401353),
 (2762, 0.22473969),
 (2716, 0.22223154),
 (357, 0.2170414),
 (377, 0.21523449),
 (1527, 0.21232672),
 (2081, 0.2101368),
 (2248, 0.20873171),
 (1923, 0.20525223),
 (364, 0.1937974),
 (595, 0.18965554),
 (2028, 0.18050879)]

In [53]:
movies_recommended[0][0]

589

In [54]:
len(movies_recommended)

20

In [55]:
for x, y in movies_recommended:
    title = movies['title'][movies['movie_id'].isin([x])]
    print(f'추천영화 : {title}', end = ' ')
    print(f'선호도 : {y:.3f}')

추천영화 : 585    Terminator 2: Judgment Day (1991)
Name: title, dtype: object 선호도 : 0.385
추천영화 : 1250    Back to the Future (1985)
Name: title, dtype: object 선호도 : 0.347
추천영화 : 453    Fugitive, The (1993)
Name: title, dtype: object 선호도 : 0.286
추천영화 : 0    Toy Story (1995)
Name: title, dtype: object 선호도 : 0.267
추천영화 : 1220    Terminator, The (1984)
Name: title, dtype: object 선호도 : 0.257
추천영화 : 1179    Princess Bride, The (1987)
Name: title, dtype: object 선호도 : 0.256
추천영화 : 2849    Ferris Bueller's Day Off (1986)
Name: title, dtype: object 선호도 : 0.256
추천영화 : 2847    Total Recall (1990)
Name: title, dtype: object 선호도 : 0.244
추천영화 : 1607    Gattaca (1997)
Name: title, dtype: object 선호도 : 0.234
추천영화 : 2693    Sixth Sense, The (1999)
Name: title, dtype: object 선호도 : 0.225
추천영화 : 2647    Ghostbusters (1984)
Name: title, dtype: object 선호도 : 0.222
추천영화 : 353    Four Weddings and a Funeral (1994)
Name: title, dtype: object 선호도 : 0.217
추천영화 : 373    Speed (1994)
Name: title, dtype: object 선호도 : 0.21