## Simple recommender system with the MovieLens 100k dataset
- reference: https://acodeforthought.wordpress.com/2016/12/26/building-a-simple-recommender-system-with-movie-lens-data-set/

In [1]:
import pandas as pd
import numpy as np

### Data import

In [2]:
# Column names for each dataset (user, item, rating); they are passed to
# read_csv() via `names`, so every entry becomes a DataFrame column label.
user_cols = ['user id', 'age', 'gender', 'occupation', 'zip code']

# Item columns: five descriptive fields followed by one flag column per genre.
# NOTE: the original list spelled the label 'Romance ' with a trailing space,
# which would have produced a column that cannot be selected as 'Romance'.
item_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens',
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

rating_cols = ['user id', 'movie id', 'rating', 'timestamp']

In [3]:
# Load each dataset with pandas' read_csv(), labelling the columns with the
# user_cols / item_cols / rating_cols lists. u.user and u.item are
# pipe-separated; u.data is tab-separated.
users = pd.read_csv(
    'ml-100k/u.user', names=user_cols, sep='|', encoding='latin-1'
)
items = pd.read_csv(
    'ml-100k/u.item', names=item_cols, sep='|', encoding='latin-1'
)
ratings = pd.read_csv(
    'ml-100k/u.data', names=rating_cols, sep='\t', encoding='latin-1'
)

## Data Preprocessing

### Merge datasets
- Merge the items, ratings, and users datasets into a single combined dataset

In [4]:
# Attach every rating to its movie's metadata, then attach the rater's
# user attributes. The join keys are spelled out so that an accidental
# column-name collision cannot silently change what the merge matches on.
whole_data = (
    items
    .merge(ratings, on='movie id')
    .merge(users, on='user id')
)

In [5]:
# Preview the merged frame. A bare last expression uses the notebook's rich
# table display instead of the wrapped plain-text output print() produces.
whole_data.head()

   movie id            movie title release date  video release date  \
0         1       Toy Story (1995)  01-Jan-1995                 NaN   
1         4      Get Shorty (1995)  01-Jan-1995                 NaN   
2         5         Copycat (1995)  01-Jan-1995                 NaN   
3         7  Twelve Monkeys (1995)  01-Jan-1995                 NaN   
4         8            Babe (1995)  01-Jan-1995                 NaN   

                                            IMDb URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
2  http://us.imdb.com/M/title-exact?Copycat%20(1995)        0       0   
3  http://us.imdb.com/M/title-exact?Twelve%20Monk...        0       0   
4     http://us.imdb.com/M/title-exact?Babe%20(1995)        0       0   

   Adventure  Animation  Childrens    ...     Thriller  War  Western  user id  \
0          0          1          1    ...            

### Group & average data
- 데이터 인스턴스(관측치)들을 영화의 제목(movie title)을 기준으로 그루핑한다
- 각 영화에 해당하는 rating들을 평균 내 average ratings를 계산한다

In [6]:
# Number of rating rows per movie title (a Series indexed by title).
ratings_total = (
    whole_data
    .groupby(by='movie title')
    .size()
)

In [7]:
# Preview the per-title rating counts; the last expression is the cell output.
ratings_total.head()

movie title
'Til There Was You (1997)      9
1-900 (1994)                   5
101 Dalmatians (1996)        109
12 Angry Men (1957)          125
187 (1997)                    41
dtype: int64


In [8]:
# Mean rating of each movie title, as a one-column DataFrame indexed by title.
# The column is selected with a list (`[['rating']]`): indexing a GroupBy with
# a bare tuple of labels is rejected by pandas >= 2.0, and the grouping key
# itself ('movie title') is text and must not be averaged.
ratings_mean = whole_data.groupby('movie title')[['rating']].mean()

In [9]:
# Preview the per-title mean ratings using the notebook's rich table display.
ratings_mean.head()

                             rating
movie title                        
'Til There Was You (1997)  2.333333
1-900 (1994)               2.600000
101 Dalmatians (1996)      2.908257
12 Angry Men (1957)        4.344000
187 (1997)                 3.024390


In [10]:
# Convert the per-title counts (a Series indexed by 'movie title') into a
# two-column frame: the index becomes the 'movie title' column and the counts
# are named 'total ratings'.
ratings_total = ratings_total.reset_index(name='total ratings')

# Show only the first rows instead of dumping the whole frame.
ratings_total.head(10)

Unnamed: 0,movie title,total ratings
0,'Til There Was You (1997),9
1,1-900 (1994),5
2,101 Dalmatians (1996),109
3,12 Angry Men (1957),125
4,187 (1997),41
5,2 Days in the Valley (1996),93
6,"20,000 Leagues Under the Sea (1954)",72
7,2001: A Space Odyssey (1968),259
8,3 Ninjas: High Noon At Mega Mountain (1998),5
9,"39 Steps, The (1935)",59


In [11]:
# Expose the title (so far only the index) as a regular column so it can be
# used as a merge key. The index name is cleared afterwards because pandas
# refuses to merge on a label that is both a column and an index level.
# Built with assign() rather than in-place mutation so re-running the cell
# gives the same result.
ratings_mean = (
    ratings_mean
    .assign(**{'movie title': ratings_mean.index})
    .rename_axis(index=None)
)

# Show only the first rows instead of dumping the whole frame.
ratings_mean.head(10)

Unnamed: 0_level_0,rating,movie title
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.333333,'Til There Was You (1997)
1-900 (1994),2.600000,1-900 (1994)
101 Dalmatians (1996),2.908257,101 Dalmatians (1996)
12 Angry Men (1957),4.344000,12 Angry Men (1957)
187 (1997),3.024390,187 (1997)
2 Days in the Valley (1996),3.225806,2 Days in the Valley (1996)
"20,000 Leagues Under the Sea (1954)",3.500000,"20,000 Leagues Under the Sea (1954)"
2001: A Space Odyssey (1968),3.969112,2001: A Space Odyssey (1968)
3 Ninjas: High Noon At Mega Mountain (1998),1.000000,3 Ninjas: High Noon At Mega Mountain (1998)
"39 Steps, The (1935)",4.050847,"39 Steps, The (1935)"


## Building a simple recommender system

In [12]:
# Combine the mean rating and the rating count of each title, then order the
# movies from most-rated to least-rated.
# The left frame's index is dropped and the join key is named explicitly:
# 'movie title' may exist on ratings_mean both as a column and as the index
# name, which pandas treats as an ambiguous merge key.
final = (
    ratings_mean
    .reset_index(drop=True)
    .merge(ratings_total, on='movie title')
    .sort_values(by='total ratings', ascending=False)
)

In [13]:
# Ten most frequently rated movies, shown with the notebook's rich display.
final.head(10)

        rating                    movie title  total ratings
1398  4.358491               Star Wars (1977)            583
333   3.803536                 Contact (1997)            509
498   4.155512                   Fargo (1996)            508
1234  4.007890      Return of the Jedi (1983)            507
860   3.156701               Liar Liar (1997)            485
460   3.656965    English Patient, The (1996)            481
1284  3.441423                  Scream (1996)            478
1523  3.878319               Toy Story (1995)            452
32    3.631090           Air Force One (1997)            431
744   3.438228  Independence Day (ID4) (1996)            429


In [14]:
# Keep only the 300 most frequently rated movies (final is ordered by
# 'total ratings' at this point), then rank those by mean rating.
# .iloc makes it explicit that the slice is positional (first 300 rows).
final = final.iloc[:300].sort_values(by='rating', ascending=False)

# Ten best-rated movies among the frequently rated ones.
final.head(10)

        rating                       movie title  total ratings
1281  4.466443           Schindler's List (1993)            298
1652  4.466102        Wrong Trousers, The (1993)            118
273   4.456790                 Casablanca (1942)            243
1317  4.445230  Shawshank Redemption, The (1994)            283
1215  4.387560                Rear Window (1954)            209
1572  4.385768        Usual Suspects, The (1995)            267
1398  4.358491                  Star Wars (1977)            583
3     4.344000               12 Angry Men (1957)            125
303   4.292929               Citizen Kane (1941)            198
1507  4.292237      To Kill a Mockingbird (1962)            219
