In [1]:
import numpy as np
import re
import pandas as pd

## Reading Data

In [2]:
def read_movie(filename):
    """Parse a ``::``-delimited movies file into a DataFrame.

    Each non-blank line is expected to look like
    ``movie_id::name::Category1|Category2|...``.

    Parameters
    ----------
    filename : str
        Path to the movies file.

    Returns
    -------
    pandas.DataFrame
        One row per line with columns ``movie_id`` (str), ``name`` (str) and
        ``category`` (list of str). All values are kept as strings.
    """
    movie_ids = []
    names = []
    categories = []
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm that this matches the encoding of the data file.
    with open(filename) as f:
        for line in f:
            # Strip only the line terminator. Slicing with [:-1] would eat the
            # last real character when the final line has no trailing newline
            # and would leave a stray '\r' behind on CRLF files.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue  # tolerate blank lines (e.g. at the end of the file)
            fields = line.split('::')
            movie_ids.append(fields[0])
            names.append(fields[1])
            categories.append(fields[2].split('|'))

    movie_df = pd.DataFrame({'movie_id': movie_ids, 'name': names, 'category': categories})
    return movie_df
# Load movies.dat from the ml-1m data directory and preview the first rows.
movie_df = read_movie('../../data/ml-1m/movies.dat')
movie_df.head()

Unnamed: 0,category,movie_id,name
0,"[Animation, Children's, Comedy]",1,Toy Story (1995)
1,"[Adventure, Children's, Fantasy]",2,Jumanji (1995)
2,"[Comedy, Romance]",3,Grumpier Old Men (1995)
3,"[Comedy, Drama]",4,Waiting to Exhale (1995)
4,[Comedy],5,Father of the Bride Part II (1995)


In [3]:
def read_user(filename):
    """Parse a ``::``-delimited users file into a DataFrame.

    Each non-blank line is expected to look like
    ``user_id::gender::age::occupation::zipcode``.

    Parameters
    ----------
    filename : str
        Path to the users file.

    Returns
    -------
    pandas.DataFrame
        One row per line with columns ``user_id``, ``gender``, ``age``,
        ``occupation`` and ``zipcode``. All values are kept as strings.
    """
    user_id = []
    gender = []
    age = []
    occu = []
    zip_code = []
    with open(filename) as f:
        for line in f:
            # Strip only the line terminator. Slicing with [:-1] would eat the
            # last zipcode character when the final line has no trailing
            # newline and would leave a stray '\r' behind on CRLF files.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue  # tolerate blank lines (e.g. at the end of the file)
            fields = line.split('::')
            user_id.append(fields[0])
            gender.append(fields[1])
            age.append(fields[2])
            occu.append(fields[3])
            zip_code.append(fields[4])
    user_df = pd.DataFrame({'user_id': user_id, 'gender': gender, 'age': age,
                            'occupation': occu, 'zipcode': zip_code})
    return user_df
            
# Load users.dat from the ml-1m data directory and preview the first rows.
user_df = read_user('../../data/ml-1m/users.dat')
user_df.head()

Unnamed: 0,age,gender,occupation,user_id,zipcode
0,1,F,10,1,48067
1,56,M,16,2,70072
2,25,M,15,3,55117
3,45,M,7,4,2460
4,25,M,20,5,55455


In [4]:
def read_rating(filename):
    """Parse a ``::``-delimited ratings file into a DataFrame.

    Each non-blank line is expected to look like
    ``user_id::movie_id::rating::timestamp``.

    Parameters
    ----------
    filename : str
        Path to the ratings file.

    Returns
    -------
    pandas.DataFrame
        One row per line with columns ``user_id``, ``movie_id``, ``rating``
        and ``timestamp``. All values are kept as strings.
    """
    user_id = []
    movie_id = []
    rating = []
    timestamp = []
    with open(filename) as f:
        for line in f:
            # Strip only the line terminator. Slicing with [:-1] would eat the
            # last timestamp digit when the final line has no trailing newline
            # and would leave a stray '\r' behind on CRLF files.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue  # tolerate blank lines (e.g. at the end of the file)
            fields = line.split('::')
            user_id.append(fields[0])
            movie_id.append(fields[1])
            rating.append(fields[2])
            timestamp.append(fields[3])
    rating_df = pd.DataFrame({'user_id': user_id, 'movie_id': movie_id,
                              'rating': rating, 'timestamp': timestamp})
    return rating_df
# Load ratings.dat from the ml-1m data directory and preview the first rows.
rating_df = read_rating('../../data/ml-1m/ratings.dat')
rating_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,1193,5,978300760,1
1,661,3,978302109,1
2,914,3,978301968,1
3,3408,4,978300275,1
4,2355,5,978824291,1


## Pre-process

Hàm đổi category sang dạng các cột.
df: dataframe
id_cols: danh sách các cột để định danh 1 object
cate_col: cột cần chuyển
multi_cate: bằng True nếu 1 object thuộc nhiều category
merge: bằng True nếu kết quả trả về cần merge với dataframe input

In [5]:
def convert_cate_tocol(df, id_cols, cate_col, multi_cate, merge = True):
    """One-hot encode a categorical column into 0/1 indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    id_cols : list of str
        Columns that identify one object. They are copied into the result and
        used as the join keys when ``merge`` is true.
    cate_col : str
        Name of the column to convert.
    multi_cate : bool
        True when each cell of ``cate_col`` holds a list of categories (one
        object may belong to several); False when it holds a single value.
    merge : bool, optional
        When true (default) the indicator frame is merged back onto ``df`` on
        ``id_cols``; otherwise only the indicators plus ``id_cols`` are
        returned.

    Returns
    -------
    pandas.DataFrame
        One integer ``<cate_col>_<category>`` column per distinct category (in
        sorted order), followed by ``id_cols`` and, when merged, the remaining
        columns of ``df``.

    Notes
    -----
    The sorted array of distinct categories is printed as a side effect.
    """
    # Collect the distinct categories (np.unique also sorts them).
    if multi_cate:
        # Flatten with a comprehension; sum(lists, []) copies the growing
        # list on every step and is quadratic in the number of rows.
        all_categories = np.unique([cat for cats in df[cate_col] for cat in cats])
    else:
        all_categories = np.unique(np.asarray(df[cate_col]))
    # Single-argument call form works as a statement on Python 2 and as a
    # function on Python 3.
    print(all_categories)

    # For every row, set a 1 in the column of each category it belongs to.
    col_index = {cat: j for j, cat in enumerate(all_categories)}
    onehot = np.zeros((len(df), len(all_categories)), dtype=int)
    for i, cats in enumerate(df[cate_col]):
        if not multi_cate:
            cats = [cats]
        for cat in cats:
            j = col_index.get(cat)
            if j is not None:
                onehot[i, j] = 1

    # Prefix the column names with "<cate_col>_" to avoid name clashes.
    col_names = ['%s_%s' % (cate_col, cat) for cat in all_categories]
    res_df = pd.DataFrame(onehot, columns=col_names)

    for col in id_cols:
        # Copy positionally: res_df has a fresh 0..n-1 index, so assigning the
        # Series itself would align on index labels and produce NaN whenever
        # df does not carry a default RangeIndex.
        res_df[col] = df[col].values

    if merge:
        res_df = pd.merge(res_df, df, on = id_cols)
    return res_df

### movie

In [6]:
# Preview the movie table before its category lists are one-hot encoded.
movie_df.head()

Unnamed: 0,category,movie_id,name
0,"[Animation, Children's, Comedy]",1,Toy Story (1995)
1,"[Adventure, Children's, Fantasy]",2,Jumanji (1995)
2,"[Comedy, Romance]",3,Grumpier Old Men (1995)
3,"[Comedy, Drama]",4,Waiting to Exhale (1995)
4,[Comedy],5,Father of the Bride Part II (1995)


In [7]:
# One-hot encode the category lists; merge=False keeps only the indicator
# columns plus movie_id and name.
# NOTE: this rebinds movie_df, so the 'category' column is gone afterwards and
# re-running this cell without re-loading movie_df will fail.
movie_df = convert_cate_tocol(movie_df, ['movie_id', 'name'], 'category', multi_cate = True, merge = False)
movie_df.head()

['Action' 'Adventure' 'Animation' "Children's" 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Fantasy' 'Film-Noir' 'Horror' 'Musical' 'Mystery'
 'Romance' 'Sci-Fi' 'Thriller' 'War' 'Western']


Unnamed: 0,category_Action,category_Adventure,category_Animation,category_Children's,category_Comedy,category_Crime,category_Documentary,category_Drama,category_Fantasy,category_Film-Noir,category_Horror,category_Musical,category_Mystery,category_Romance,category_Sci-Fi,category_Thriller,category_War,category_Western,movie_id,name
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,Toy Story (1995)
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,Jumanji (1995)
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,3,Grumpier Old Men (1995)
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,4,Waiting to Exhale (1995)
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5,Father of the Bride Part II (1995)


### rating

In [8]:
# Preview the raw ratings; every column is still a string at this point.
rating_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,1193,5,978300760,1
1,661,3,978302109,1
2,914,3,978301968,1
3,3408,4,978300275,1
4,2355,5,978824291,1


In [9]:
# Ratings were read as strings; cast them to float so numeric statistics can be computed.
rating_df['rating'] = rating_df['rating'].astype(float)

In [10]:
# Per-movie rating summary: count, mean, std, min, max and the 10%..90% percentiles.
rate_dcrb_df = (
    rating_df
    .groupby(['movie_id'])['rating']
    .describe(percentiles=np.arange(1, 10) / 10.0)
)

In [11]:
# Replace missing std values with 0.
# NOTE(review): presumably std is NaN for movies that have a single rating — confirm.
rate_dcrb_df['std'] = rate_dcrb_df['std'].fillna(0)

In [12]:
# Preview the per-movie rating statistics (still indexed by movie_id here).
rate_dcrb_df.head()

Unnamed: 0_level_0,count,mean,std,min,10%,20%,30%,40%,50%,60%,70%,80%,90%,max
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,2077.0,4.146846,0.852349,1.0,3.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,5.0
10,888.0,3.540541,0.891233,1.0,2.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,5.0,5.0
100,128.0,3.0625,0.961872,1.0,2.0,2.0,3.0,3.0,3.0,3.0,3.9,4.0,4.0,5.0
1000,20.0,3.05,1.234376,1.0,1.0,2.0,2.7,3.0,3.0,3.4,4.0,4.0,4.1,5.0
1002,8.0,4.25,0.886405,3.0,3.0,3.4,4.0,4.0,4.5,5.0,5.0,5.0,5.0,5.0


In [13]:
# Turn the movie_id group index back into a regular column; it is the merge key below.
rate_dcrb_df = rate_dcrb_df.reset_index()

#### Merge category and rating

In [14]:
# Check row count and dtypes of the encoded movie table before merging.
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 20 columns):
category_Action         3883 non-null int64
category_Adventure      3883 non-null int64
category_Animation      3883 non-null int64
category_Children's     3883 non-null int64
category_Comedy         3883 non-null int64
category_Crime          3883 non-null int64
category_Documentary    3883 non-null int64
category_Drama          3883 non-null int64
category_Fantasy        3883 non-null int64
category_Film-Noir      3883 non-null int64
category_Horror         3883 non-null int64
category_Musical        3883 non-null int64
category_Mystery        3883 non-null int64
category_Romance        3883 non-null int64
category_Sci-Fi         3883 non-null int64
category_Thriller       3883 non-null int64
category_War            3883 non-null int64
category_Western        3883 non-null int64
movie_id                3883 non-null object
name                    3883 non-null object
dtypes: int

In [15]:
# Check row count and dtypes of the rating statistics before merging.
rate_dcrb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3706 entries, 0 to 3705
Data columns (total 15 columns):
movie_id    3706 non-null object
count       3706 non-null float64
mean        3706 non-null float64
std         3706 non-null float64
min         3706 non-null float64
10%         3706 non-null float64
20%         3706 non-null float64
30%         3706 non-null float64
40%         3706 non-null float64
50%         3706 non-null float64
60%         3706 non-null float64
70%         3706 non-null float64
80%         3706 non-null float64
90%         3706 non-null float64
max         3706 non-null float64
dtypes: float64(14), object(1)
memory usage: 434.4+ KB


In [16]:
# Join the rating statistics with the category indicators on movie_id; the
# default inner join keeps only movies present in both frames.
cate_rate_movie_df = rate_dcrb_df.merge(movie_df, on=['movie_id'])

In [17]:
# Confirm the merged frame's row count, columns and dtypes.
cate_rate_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3706 entries, 0 to 3705
Data columns (total 34 columns):
movie_id                3706 non-null object
count                   3706 non-null float64
mean                    3706 non-null float64
std                     3706 non-null float64
min                     3706 non-null float64
10%                     3706 non-null float64
20%                     3706 non-null float64
30%                     3706 non-null float64
40%                     3706 non-null float64
50%                     3706 non-null float64
60%                     3706 non-null float64
70%                     3706 non-null float64
80%                     3706 non-null float64
90%                     3706 non-null float64
max                     3706 non-null float64
category_Action         3706 non-null int64
category_Adventure      3706 non-null int64
category_Animation      3706 non-null int64
category_Children's     3706 non-null int64
category_Comedy         3706

Total movies: 3900
- number of movies with categories (length of movie_df): 3883
- number of movies with ratings (length of rate_dcrb_df): 3706

In [18]:
# Preview the merged per-movie table.
cate_rate_movie_df.head()

Unnamed: 0,movie_id,count,mean,std,min,10%,20%,30%,40%,50%,...,category_Film-Noir,category_Horror,category_Musical,category_Mystery,category_Romance,category_Sci-Fi,category_Thriller,category_War,category_Western,name
0,1,2077.0,4.146846,0.852349,1.0,3.0,3.0,4.0,4.0,4.0,...,0,0,0,0,0,0,0,0,0,Toy Story (1995)
1,10,888.0,3.540541,0.891233,1.0,2.0,3.0,3.0,3.0,4.0,...,0,0,0,0,0,0,1,0,0,GoldenEye (1995)
2,100,128.0,3.0625,0.961872,1.0,2.0,2.0,3.0,3.0,3.0,...,0,0,0,0,0,0,1,0,0,City Hall (1996)
3,1000,20.0,3.05,1.234376,1.0,1.0,2.0,2.7,3.0,3.0,...,0,0,0,0,0,0,0,0,0,Curdled (1996)
4,1002,8.0,4.25,0.886405,3.0,3.0,3.4,4.0,4.0,4.5,...,0,0,0,0,0,0,0,0,0,Ed's Next Move (1996)


In [19]:
# Persist the merged per-movie table. By default to_csv also writes the row
# index as an unnamed first column.
cate_rate_movie_df.to_csv('../../data/pre-processed/movie_all.csv')

### user info

In [20]:
# Save the user table exactly as read (no encoding applied yet). By default
# to_csv also writes the row index as an unnamed first column.
user_df.to_csv('../../data/pre-processed/user_all.csv')

In [21]:
# Preview the user table before its categorical columns are one-hot encoded.
user_df.head()

Unnamed: 0,age,gender,occupation,user_id,zipcode
0,1,F,10,1,48067
1,56,M,16,2,70072
2,25,M,15,3,55117
3,45,M,7,4,2460
4,25,M,20,5,55455


In [22]:
# One-hot encode the age column (one value per user); merge=False returns
# only the age_* indicators plus user_id.
cvage_user_df = convert_cate_tocol(user_df, id_cols=['user_id'], cate_col='age', multi_cate=False, merge=False)
cvage_user_df.head()

['1' '18' '25' '35' '45' '50' '56']


Unnamed: 0,age_1,age_18,age_25,age_35,age_45,age_50,age_56,user_id
0,1,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1,2
2,0,0,1,0,0,0,0,3
3,0,0,0,0,1,0,0,4
4,0,0,1,0,0,0,0,5


In [23]:
# One-hot encode the gender column; merge=False returns only the gender_*
# indicators plus user_id.
cvgender_user_df = convert_cate_tocol(user_df, id_cols=['user_id'], cate_col='gender', multi_cate=False, merge=False)
cvgender_user_df.head()

['F' 'M']


Unnamed: 0,gender_F,gender_M,user_id
0,1,0,1
1,0,1,2
2,0,1,3
3,0,1,4
4,0,1,5


In [24]:
# One-hot encode the occupation column; merge=False returns only the
# occupation_* indicators plus user_id. The codes are strings, so the
# generated columns are ordered lexicographically ('1', '10', '11', ...).
cvoccu_user_df = convert_cate_tocol(user_df, id_cols=['user_id'], cate_col='occupation', multi_cate=False, merge=False)
cvoccu_user_df.head()

['0' '1' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '2' '20' '3' '4'
 '5' '6' '7' '8' '9']


Unnamed: 0,occupation_0,occupation_1,occupation_10,occupation_11,occupation_12,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,...,occupation_2,occupation_20,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9,user_id
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,4
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,5


In [25]:
# All user columns are still stored as strings (object dtype), as read from the file.
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
age           6040 non-null object
gender        6040 non-null object
occupation    6040 non-null object
user_id       6040 non-null object
zipcode       6040 non-null object
dtypes: object(5)
memory usage: 236.0+ KB


In [26]:
# Drop one indicator column from each encoded frame.
# NOTE(review): presumably this removes a reference level so the remaining
# dummy columns are not perfectly collinear — confirm.
# `del` mutates the frames in place, so re-running this cell raises KeyError.
del cvage_user_df['age_56']
del cvgender_user_df['gender_F']
del cvoccu_user_df['occupation_20']

In [28]:
# Combine the age and gender indicator columns, then attach each user's zipcode.
# NOTE(review): cvoccu_user_df (the occupation indicators) is built above but
# never merged in here, so occupation is absent from cvt_user_df — confirm
# that this is intended.
tmp1 = pd.merge(cvage_user_df, cvgender_user_df, on = ['user_id'])
cvt_user_df = pd.merge(tmp1, user_df[['user_id', 'zipcode']], on = ['user_id'])

In [30]:
# Preview the encoded user table before saving it.
cvt_user_df.head()

Unnamed: 0,age_1,age_18,age_25,age_35,age_45,age_50,user_id,gender_M,zipcode
0,1,0,0,0,0,0,1,0,48067
1,0,0,0,0,0,0,2,1,70072
2,0,0,1,0,0,0,3,1,55117
3,0,0,0,0,1,0,4,1,2460
4,0,0,1,0,0,0,5,1,55455


In [31]:
# Save the encoded user table. By default to_csv also writes the row index as
# an unnamed first column.
cvt_user_df.to_csv('../../data/pre-processed/cvt_user.csv')