In [1]:
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import torch.nn as nn
import torch.nn.functional as functional
from torch.autograd import Variable

In [3]:
def convert_cate_tocol(df, id_cols, cate_col, multi_cate, merge = True):
    """One-hot encode a categorical column into 0/1 indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame holding `cate_col` and every column named in `id_cols`.
    id_cols : list of str
        Columns copied into the result; also the join keys when `merge` is true.
    cate_col : str
        Column to encode. Each cell holds a single value when `multi_cate` is
        false, or a list of values when it is true.
    multi_cate : bool
        Whether each cell of `cate_col` holds a list of categories.
    merge : bool, default True
        If true, the indicator frame is joined back onto `df` with
        `pd.merge(..., on=id_cols)`; otherwise only the indicator columns
        followed by the id columns are returned.

    Returns
    -------
    pandas.DataFrame
        One integer column named '<cate_col>_<category>' per distinct
        category (in `np.unique` order), plus the id columns and, when
        `merge` is true, the remaining columns of `df`.
    """
    def mapping(cat_list, all_categories):
        # 0/1 membership vector of one row's categories over all categories.
        members = set(cat_list) if multi_cate else set([cat_list])
        return np.array([1 if cat in members else 0 for cat in all_categories], dtype=int)

    if multi_cate:
        # Flatten in one pass; sum(lists, []) re-copies the accumulator on every row.
        all_categories = np.unique([cat for cats in df[cate_col] for cat in cats])
    else:
        all_categories = np.unique(df[cate_col])

    # Iterate the column directly instead of building a row Series per df.iloc[i].
    res_df = pd.DataFrame(
        [mapping(value, all_categories) for value in df[cate_col]],
        columns=all_categories,
    )
    res_df.columns = ['%s_%s' % (cate_col, col) for col in res_df.columns]
    for col in id_cols:
        # res_df has a fresh 0..n-1 index; assign positionally so a df with a
        # different index does not produce NaN ids through index alignment.
        res_df[col] = df[col].values

    if merge:
        res_df = pd.merge(res_df, df, on = id_cols)
    return res_df

# Read data

## movie

In [4]:
# Keep column 1 plus columns 16-34 of the pre-processed movie table.
# list(...) keeps the concatenation valid on Python 3, where range() is no
# longer a list; on Python 2 the result is unchanged.
movie_all = pd.read_csv('../../../data/pre-processed/movie_all.csv').iloc[:, [1] + list(range(16, 35))]

In [5]:
# Summarize the column names, dtypes and non-null counts of the selected movie columns.
movie_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3706 entries, 0 to 3705
Data columns (total 20 columns):
movie_id                3706 non-null int64
category_Action         3706 non-null int64
category_Adventure      3706 non-null int64
category_Animation      3706 non-null int64
category_Children's     3706 non-null int64
category_Comedy         3706 non-null int64
category_Crime          3706 non-null int64
category_Documentary    3706 non-null int64
category_Drama          3706 non-null int64
category_Fantasy        3706 non-null int64
category_Film-Noir      3706 non-null int64
category_Horror         3706 non-null int64
category_Musical        3706 non-null int64
category_Mystery        3706 non-null int64
category_Romance        3706 non-null int64
category_Sci-Fi         3706 non-null int64
category_Thriller       3706 non-null int64
category_War            3706 non-null int64
category_Western        3706 non-null int64
name                    3706 non-null object
dtypes: int6

## user

In [6]:
# Load the pre-processed user table, dropping its first CSV column
# (presumably a saved index -- TODO confirm).
user_info = pd.read_csv('../../../data/pre-processed/user_all.csv').iloc[:, 1:]
# Parenthesized form prints the same text on Python 2 and is valid on Python 3.
print(user_info.head())

   age gender  occupation  user_id zipcode
0    1      F          10        1   48067
1   56      M          16        2   70072
2   25      M          15        3   55117
3   45      M           7        4   02460
4   25      M          20        5   55455


In [7]:
# Cast the zip codes to integers. This drops leading zeros: the value shown
# as 02460 before this cell is displayed as 2460 afterwards.
user_info['zipcode'] = np.array(user_info['zipcode']).astype(int)

In [8]:
# Preview the user table after the zipcode cast.
user_info.head()

Unnamed: 0,age,gender,occupation,user_id,zipcode
0,1,F,10,1,48067
1,56,M,16,2,70072
2,25,M,15,3,55117
3,45,M,7,4,2460
4,25,M,20,5,55455


In [9]:
# List the columns available in the user table.
user_info.columns

Index([u'age', u'gender', u'occupation', u'user_id', u'zipcode'], dtype='object')

In [10]:
# One-hot encode the single-valued occupation column and join the indicator
# columns back onto the user table by user_id.
user_info_final = convert_cate_tocol(
    user_info,
    id_cols=['user_id'],
    cate_col='occupation',
    multi_cate=False,
    merge=True,
)

In [11]:
# Check the scalar type stored in the age column before it is used as a lookup key.
type(user_info_final['age'][0])

numpy.int64

In [12]:
# Map each raw age code to a consecutive ordinal bucket 1..7.
lookup_age = dict(zip([1, 18, 25, 35, 45, 50, 56], range(1, 8)))

In [13]:
# Replace every raw age code by its bucket; an unknown code raises KeyError.
# Re-running this cell on already-mapped values is therefore not safe.
user_info_final['age'] = list(map(lookup_age.__getitem__, user_info_final['age']))

## rating

In [14]:
import re
def read_rating(filename):
    """Parse a '::'-delimited ratings file into a DataFrame of strings.

    Every non-blank line is expected to look like
    ``user_id::movie_id::rating::timestamp``.

    Parameters
    ----------
    filename : str
        Path of the ratings file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id``, ``rating``, ``timestamp``, ``user_id`` in that
        order, all holding the raw strings read from the file.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than four '::'-separated fields.
    """
    user_id = []
    movie_id = []
    rating = []
    timestamp = []
    with open(filename) as f:
        for line in f:
            # Remove only the line terminator. Slicing off the last character
            # would eat a digit when the final line has no trailing newline
            # and would leave a stray '\r' on CRLF files.
            line = line.rstrip('\r\n')
            if not line:
                # Tolerate blank lines (e.g. an empty line at end of file).
                continue
            fields = line.split('::')
            user_id.append(fields[0])
            movie_id.append(fields[1])
            rating.append(fields[2])
            timestamp.append(fields[3])
    # Pin the column order explicitly: callers select columns by position, and
    # the order pandas derives from a dict differs between versions.
    rating_df = pd.DataFrame(
        {'user_id': user_id, 'movie_id': movie_id, 'rating': rating, 'timestamp': timestamp},
        columns=['movie_id', 'rating', 'timestamp', 'user_id'],
    )
    return rating_df

In [15]:
# Load the raw ratings file; read_rating keeps every field as a string,
# so the numeric casts happen in the following cells.
rating = read_rating('../../../data/ml-1m/ratings.dat')
rating.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,1193,5,978300760,1
1,661,3,978302109,1
2,914,3,978301968,1
3,3408,4,978300275,1
4,2355,5,978824291,1


In [16]:
# Convert the movie ids from strings to integers so they can be joined on.
rating['movie_id'] = np.asarray(rating['movie_id']).astype(int)

In [17]:
# Convert the user ids from strings to integers so they can be joined on.
rating['user_id'] = np.asarray(rating['user_id']).astype(int)

## merge

In [18]:
# Attach the movie columns to every rating; the left join keeps all rating
# rows even when a movie_id has no match in movie_all.
rating = pd.merge(rating, movie_all, on =['movie_id'], how = 'left')
# Parenthesized form prints the same text on Python 2 and is valid on Python 3.
print(rating.head())
rating.info()

   movie_id rating  timestamp  user_id  category_Action  category_Adventure  \
0      1193      5  978300760        1                0                   0   
1       661      3  978302109        1                0                   0   
2       914      3  978301968        1                0                   0   
3      3408      4  978300275        1                0                   0   
4      2355      5  978824291        1                0                   0   

   category_Animation  category_Children's  category_Comedy  category_Crime  \
0                   0                    0                0               0   
1                   1                    1                0               0   
2                   0                    0                0               0   
3                   0                    0                0               0   
4                   1                    1                1               0   

                    ...                    categor

In [19]:
# Attach the encoded user columns to every rating; the left join keeps all
# rating rows even when a user_id has no match in user_info_final.
rating = pd.merge(rating, user_info_final, on =['user_id'], how = 'left')
# Parenthesized form prints the same text on Python 2 and is valid on Python 3.
print(rating.head())
rating.info()

   movie_id rating  timestamp  user_id  category_Action  category_Adventure  \
0      1193      5  978300760        1                0                   0   
1       661      3  978302109        1                0                   0   
2       914      3  978301968        1                0                   0   
3      3408      4  978300275        1                0                   0   
4      2355      5  978824291        1                0                   0   

   category_Animation  category_Children's  category_Comedy  category_Crime  \
0                   0                    0                0               0   
1                   1                    1                0               0   
2                   0                    0                0               0   
3                   0                    0                0               0   
4                   1                    1                1               0   

    ...     occupation_15  occupation_16  occupati

In [20]:
# Preview the ratings after both merges.
rating.head()

Unnamed: 0,movie_id,rating,timestamp,user_id,category_Action,category_Adventure,category_Animation,category_Children's,category_Comedy,category_Crime,...,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20,age,gender,occupation,zipcode
0,1193,5,978300760,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,F,10,48067
1,661,3,978302109,1,0,0,1,1,0,0,...,0,0,0,0,0,0,1,F,10,48067
2,914,3,978301968,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,F,10,48067
3,3408,4,978300275,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,F,10,48067
4,2355,5,978824291,1,0,0,1,1,1,0,...,0,0,0,0,0,0,1,F,10,48067


In [21]:
# Ratings are still strings here; make them numeric so they can be averaged.
rating['rating'] = np.asarray(rating['rating']).astype(float)

In [22]:
# Average rating per movie, returned as a two-column frame (movie_id, rating).
mean_rate = rating.groupby('movie_id')['rating'].mean().reset_index()

In [23]:
# Rename the aggregated column so it does not clash with the per-row
# 'rating' column when merged back.
mean_rate = mean_rate.rename(columns={'rating': 'mean_rate'})

In [24]:
# Preview the per-movie mean ratings.
mean_rate.head()

Unnamed: 0,movie_id,mean_rate
0,1,4.146846
1,2,3.201141
2,3,3.016736
3,4,2.729412
4,5,3.006757


In [25]:
# Add each movie's mean rating as a column on every rating row.
rating = rating.merge(mean_rate, how='left', on=['movie_id'])

In [26]:
# Remove the columns that are not exported. movie_id and user_id are kept on
# purpose: later cells split the data by movie_id and select both as keys.
for unused_col in ['occupation', 'name', 'zipcode', 'timestamp']:
    del rating[unused_col]

In [27]:
# Preview the ratings after dropping the unused columns.
rating.head()

Unnamed: 0,movie_id,rating,user_id,category_Action,category_Adventure,category_Animation,category_Children's,category_Comedy,category_Crime,category_Documentary,...,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20,age,gender,mean_rate
0,1193,5.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,F,4.390725
1,661,3.0,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,F,3.464762
2,914,3.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,F,4.154088
3,3408,4.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,F,3.863878
4,2355,5.0,1,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,1,F,3.854375


In [28]:
from sklearn.utils import shuffle
# Shuffle the rating rows with a fixed seed so that the row order, and hence
# the exported CSV files, is the same on every run.
rating = shuffle(rating, random_state=42)

In [29]:
# Scale the ratings by 1/5. The cell overwrites its own input, so running it
# twice divides twice.
rating['rating'] = np.asarray(rating['rating'], dtype=float) / 5.0

In [30]:
# Encode gender as 1 for 'F' and 0 for anything else. The cell overwrites its
# own input, so running it twice yields all zeros.
rating['gender'] = np.where(rating['gender'] == 'F', 1, 0)

In [31]:
# Scale the per-movie mean rating by 1/5, matching the scaling of 'rating'.
# The cell overwrites its own input, so running it twice divides twice.
rating['mean_rate'] = np.asarray(rating['mean_rate'], dtype=float) / 5.0

## Separate by movie_id

In [32]:
# Distinct movie ids, in order of first appearance in the shuffled ratings.
movie_id = pd.unique(rating['movie_id'])

In [33]:
import random
# Seed the generator first so the movie-level train/val/test assignment
# below is reproducible from one run to the next.
random.seed(42)
# Shuffles the 1-D id array in place.
random.shuffle(movie_id)

In [34]:
# Cut the shuffled movie ids at 64% and 80%: the first 64% go to train, the
# next 16% to validation and the last 20% to test. Each part is copied.
train_id, val_id, test_id = [
    np.array(part)
    for part in np.split(movie_id, [int(0.64* len(movie_id)), int(0.8* len(movie_id))])
]

In [35]:
# Index the ratings by movie_id so whole movies can be selected with .loc.
# The cell consumes the movie_id column, so it cannot be re-run as is.
rating = rating.set_index('movie_id')

In [36]:
# Select every rating whose movie_id (the index) is in train_id;
# reset_index turns movie_id back into the first column.
train_rating = rating.loc[train_id].reset_index()

In [37]:
# Select every rating whose movie_id (the index) is in val_id;
# reset_index turns movie_id back into the first column.
val_rating = rating.loc[val_id].reset_index()

In [38]:
# Select every rating whose movie_id (the index) is in test_id;
# reset_index turns movie_id back into the first column.
test_rating = rating.loc[test_id].reset_index()

In [39]:
# Percentage of all rating rows in the train split, then in train + validation.
# Each expression is wrapped entirely in print(...): the old form
# `print (a + b) * 100.0 / n` would be parsed on Python 3 as
# `print(a + b) * 100.0 / n` and fail.
print(len(train_rating) * 100.0 / len(rating))
print((len(train_rating) + len(val_rating)) * 100.0 / len(rating))

66.2438550343
80.6329477139


In [40]:
# Show which columns the item-feature selection picks (positions 0, 2 and 3-20).
# list(...) keeps the concatenation valid on Python 3, where range() is not a list.
train_rating.iloc[:, [0,2] + list(range(3, 21))].columns

Index([u'movie_id', u'user_id', u'category_Action', u'category_Adventure',
       u'category_Animation', u'category_Children's', u'category_Comedy',
       u'category_Crime', u'category_Documentary', u'category_Drama',
       u'category_Fantasy', u'category_Film-Noir', u'category_Horror',
       u'category_Musical', u'category_Mystery', u'category_Romance',
       u'category_Sci-Fi', u'category_Thriller', u'category_War',
       u'category_Western'],
      dtype='object')

In [41]:
# Columns are selected by position: 0 and 2 are the movie_id / user_id keys,
# 3-20 the category_* indicators, 21-43 the user-side features, and 1 / -1
# the rating and the last column (mean_rate).
# list(...) keeps the concatenation valid on Python 3, where range() is not a list.
x_item_train = train_rating.iloc[:, [0,2] + list(range(3, 21))]
x_user_train = train_rating.iloc[:, [0,2] + list(range(21, 44))]
y_train = train_rating.iloc[:, [0,2, 1, -1]]

In [42]:
# Columns are selected by position: 0 and 2 are the movie_id / user_id keys,
# 3-20 the category_* indicators, 21-43 the user-side features, and 1 / -1
# the rating and the last column (mean_rate).
# list(...) keeps the concatenation valid on Python 3, where range() is not a list.
x_item_val = val_rating.iloc[:, [0,2] + list(range(3, 21))]
x_user_val = val_rating.iloc[:, [0,2] + list(range(21, 44))]
y_val = val_rating.iloc[:, [0,2, 1, -1]]

In [43]:
# Columns are selected by position: 0 and 2 are the movie_id / user_id keys,
# 3-20 the category_* indicators, 21-43 the user-side features, and 1 / -1
# the rating and the last column (mean_rate).
# list(...) keeps the concatenation valid on Python 3, where range() is not a list.
x_item_test = test_rating.iloc[:, [0,2] + list(range(3, 21))]
x_user_test = test_rating.iloc[:, [0,2] + list(range(21, 44))]
y_test = test_rating.iloc[:, [0,2, 1, -1]]

In [50]:
# Preview the validation split.
val_rating.head()

Unnamed: 0,movie_id,rating,user_id,category_Action,category_Adventure,category_Animation,category_Children's,category_Comedy,category_Crime,category_Documentary,...,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20,age,gender,mean_rate
0,2366,1.0,1425,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.731217
1,2366,1.0,4509,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0.731217
2,2366,0.8,195,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0.731217
3,2366,0.4,770,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0.731217
4,2366,0.6,477,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,4,0,0.731217


In [55]:
# Number of rating rows in the validation split.
len(val_rating)

143921

In [52]:
# Number of distinct movies that ended up in the train split.
train_rating['movie_id'].nunique()

2371

In [44]:
# Write every split to its own CSV file, in the same order as before.
csv_outputs = [
    (x_item_train, '../../../data/input_formated/final/x_item_train.csv'),
    (x_item_val, '../../../data/input_formated/final/x_item_val.csv'),
    (x_item_test, '../../../data/input_formated/final/x_item_test.csv'),
    (x_user_train, '../../../data/input_formated/final/x_user_train.csv'),
    (x_user_val, '../../../data/input_formated/final/x_user_val.csv'),
    (x_user_test, '../../../data/input_formated/final/x_user_test.csv'),
    (y_train, '../../../data/input_formated/final/y_train.csv'),
    (y_val, '../../../data/input_formated/final/y_val.csv'),
    (y_test, '../../../data/input_formated/final/y_test.csv'),
]
for frame, path in csv_outputs:
    frame.to_csv(path)

In [45]:
# y_train = y_train.reshape((-1, 1))
# y_test = y_test.reshape((-1, 1))

In [46]:
# x_item_test = np.array(x_item_test, dtype = np.float32)
# x_user_test = np.array(x_user_test, dtype = np.float32)
# y_test = np.array(y_test, dtype = np.float32)

In [47]:
# np.savetxt('../../data/input_formated/withuser_2/x_item_train.csv', x_item_train)
# np.savetxt('../../data/input_formated/withuser_2/x_user_train.csv', x_user_train)
# np.savetxt('../../data/input_formated/withuser_2/y_train.csv', y_train)
# np.savetxt('../../data/input_formated/withuser_2/x_item_test.csv', x_item_test)
# np.savetxt('../../data/input_formated/withuser_2/x_user_test.csv', x_user_test)
# np.savetxt('../../data/input_formated/withuser_2/y_test.csv', y_test)