### E7 Project Recommend Movie

In [123]:
import pandas as pd
from scipy.sparse import csr_matrix

#### 01 데이터 준비와 전처리

In [3]:
import os
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [12]:
ratings.rename(columns={'rating':'count'}, inplace=True)
select_col = ['user_id', 'movie_id', 'count']
ratings_df = ratings[select_col]
ratings_df.tail()

Unnamed: 0,user_id,movie_id,count
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4


In [6]:
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [99]:
merge_df = pd.merge(ratings_df, movies)
merge_df.tail()

Unnamed: 0,user_id,movie_id,count,title,genre
836473,5851,3607,5,One Little Indian (1973),Comedy|Drama|Western
836474,5854,3026,4,Slaughterhouse (1987),Horror
836475,5854,690,3,"Promise, The (Versprechen, Das) (1994)",Romance
836476,5938,2909,4,"Five Wives, Three Secretaries and Me (1998)",Documentary
836477,5948,1360,5,Identification of a Woman (Identificazione di ...,Drama


#### 02 분석

- ratings에 있는 유니크한 영화 개수

In [171]:
movie_unique = merge_df['title'].unique()
merge_df['title'].nunique()


3628

- ratings에 있는 유니크한 사용자 수 

In [101]:
user_unique = merge_df['user_id'].unique()
merge_df['user_id'].nunique()

6039

- 가장 인기있는 영화 30개(인기순)

In [102]:
count_mean_df=pd.DataFrame(merge_df.groupby('title').mean()).sort_values(by = 'count', ascending=False)
count_mean_df.reset_index(level=['title'], inplace=True)

In [103]:
popular_ls = count_mean_df['title'][0:30]
pd.DataFrame(popular_ls)


Unnamed: 0,title
0,Ulysses (Ulisse) (1954)
1,Country Life (1994)
2,Schlafes Bruder (Brother of Sleep) (1995)
3,Foreign Student (1994)
4,Follow the Bitch (1998)
5,One Little Indian (1973)
6,Criminal Lovers (Les Amants Criminels) (1999)
7,Message to Love: The Isle of Wight Festival (1...
8,Identification of a Woman (Identificazione di ...
9,Late Bloomers (1996)


In [172]:
my_movies = ['Titanic (1953)', 'Small Soldiers (1998)', 'Toy Story 2 (1999)', 'Miss Julie (1999)', 'Terminator 2: Judgment Day (1991)']
counts = [5, 4, 4, 5, 4]
genre = ['Drama|Romance', "Animation|Children's|Fantasy|War", "Animation|Children's|Comedy", 'Drama', 'Action|Sci-Fi|Thriller']
my_movies_df = pd.DataFrame({'user_id':[6041]*5, 'title':my_movies, 'count':counts,'genre':genre})
my_movies_df

Unnamed: 0,user_id,title,count,genre
0,6041,Titanic (1953),5,Drama|Romance
1,6041,Small Soldiers (1998),4,Animation|Children's|Fantasy|War
2,6041,Toy Story 2 (1999),4,Animation|Children's|Comedy
3,6041,Miss Julie (1999),5,Drama
4,6041,Terminator 2: Judgment Day (1991),4,Action|Sci-Fi|Thriller


In [173]:
columns = ['user_id', 'title', 'count', 'genre']
total_df = merge_df[columns]
total_df = pd.concat([total_df, my_movies_df], ignore_index=True)
total_df.tail(10)

Unnamed: 0,user_id,title,count,genre
836473,5851,One Little Indian (1973),5,Comedy|Drama|Western
836474,5854,Slaughterhouse (1987),4,Horror
836475,5854,"Promise, The (Versprechen, Das) (1994)",3,Romance
836476,5938,"Five Wives, Three Secretaries and Me (1998)",4,Documentary
836477,5948,Identification of a Woman (Identificazione di ...,5,Drama
836478,6041,Titanic (1953),5,Drama|Romance
836479,6041,Small Soldiers (1998),4,Animation|Children's|Fantasy|War
836480,6041,Toy Story 2 (1999),4,Animation|Children's|Comedy
836481,6041,Miss Julie (1999),5,Drama
836482,6041,Terminator 2: Judgment Day (1991),4,Action|Sci-Fi|Thriller


In [174]:
genre_unique = total_df['genre'].unique()
user_unique = total_df['user_id'].unique()
movie_unique = total_df['title'].unique()
genre_to_idx = {v:k for k, v in enumerate(genre_unique)}
user_to_idx = {v:k for k, v in enumerate(user_unique)}
movie_to_idx = {v:k for k, v in enumerate(movie_unique)}

In [175]:
temp_genre_data = total_df['genre'].map(genre_to_idx).dropna()
temp_user_data = total_df['user_id'].map(user_to_idx).dropna()
temp_movie_data = total_df['title'].map(movie_to_idx).dropna()
if len(temp_genre_data) == len(total_df):
    print('genre column indexing OK!!')
    total_df['genre'] = temp_genre_data
else:
    print('genre column indexing Fail!!')

if len(temp_user_data) == len(total_df):
    print('user_id column indexing OK!!')
    total_df['user_id'] = temp_user_data
else:
    print('user_id column indexing Fail!!')

if len(temp_movie_data) == len(total_df):
    print('movie_id column indexing OK!!')
    total_df['movie_id'] = temp_movie_data
else:
    print('movie_id column indexing Fail!!')

total_df

genre column indexing OK!!
user_id column indexing OK!!
movie_id column indexing OK!!


Unnamed: 0,user_id,title,count,genre,movie_id
0,0,One Flew Over the Cuckoo's Nest (1975),5,0,0
1,1,One Flew Over the Cuckoo's Nest (1975),5,0,0
2,2,One Flew Over the Cuckoo's Nest (1975),4,0,0
3,3,One Flew Over the Cuckoo's Nest (1975),4,0,0
4,4,One Flew Over the Cuckoo's Nest (1975),5,0,0
...,...,...,...,...,...
836478,6039,Titanic (1953),5,18,1626
836479,6039,Small Soldiers (1998),4,271,1736
836480,6039,Toy Story 2 (1999),4,3,50
836481,6039,Miss Julie (1999),5,0,3319


In [176]:
num_user = total_df['user_id'].nunique()
num_movie = total_df['title'].nunique()

In [188]:
csr_data = csr_matrix((total_df['genre'], (total_df.user_id, total_df.movie_id)), shape=(num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [189]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [190]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [191]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [192]:
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [193]:
star_wars =movie_to_idx['Star Wars: Episode I - The Phantom Menace (1999)']

In [194]:
star_wars_vector = als_model.item_factors[star_wars]
star_wars_vector

array([ 8.22700094e-03,  2.21009110e-03, -1.32231251e-03,  1.15244202e-02,
        4.99342103e-03,  2.01749918e-03,  4.97526024e-03, -2.26159254e-03,
        1.94713823e-03,  2.86967633e-03,  9.92947910e-03,  6.05281151e-04,
        6.22677244e-03,  1.47793395e-03,  2.77676387e-03,  8.42647348e-03,
        8.24027788e-03,  2.96880445e-03,  8.80008750e-03,  1.83096586e-03,
       -6.90125860e-04,  1.70038303e-03,  1.07566901e-02,  1.59442006e-03,
        1.48214831e-03,  3.55865154e-03, -4.07487946e-03,  3.75165674e-03,
        5.14009967e-03,  6.93062553e-03,  9.40079393e-04,  5.01732901e-03,
        7.12601002e-03,  6.70091202e-03,  8.47730413e-03,  9.12980177e-03,
        4.55387728e-03,  7.31635140e-03,  1.25736995e-02,  1.96833583e-03,
        4.85450914e-03,  1.14810076e-02, -1.14919350e-03,  6.79111909e-05,
        4.27782536e-03,  7.23167742e-03,  3.32280644e-03, -3.04197380e-03,
        3.25034116e-03,  1.13561191e-02,  1.20474258e-02,  5.66515652e-03,
        1.05538068e-03,  

In [195]:
similar_movies = als_model.similar_items(star_wars, N = 15)
similar_movies

[(60, 0.06645686),
 (64, 0.056168225),
 (44, 0.055971358),
 (117, 0.05544935),
 (120, 0.054046683),
 (38, 0.05355615),
 (26, 0.05275984),
 (48, 0.05252306),
 (124, 0.051714),
 (107, 0.051464982),
 (121, 0.0514529),
 (22, 0.051408287),
 (175, 0.05091595),
 (87, 0.05082306),
 (160, 0.05049911)]

In [196]:
idx_to_artist = {v:k for k, v in movie_to_idx.items()}
[idx_to_artist[i[0]] for i in similar_movies]

['Star Wars: Episode I - The Phantom Menace (1999)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Raiders of the Lost Ark (1981)',
 'Sixth Sense, The (1999)',
 'E.T. the Extra-Terrestrial (1982)',
 'Saving Private Ryan (1998)',
 'Matrix, The (1999)',
 'Jurassic Park (1993)',
 'Silence of the Lambs, The (1991)',
 'Back to the Future (1985)',
 'Men in Black (1997)',
 'Braveheart (1995)',
 'Forrest Gump (1994)']