In [1]:
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import torch.nn as nn
import torch.nn.functional as functional
from torch.autograd import Variable

In [3]:
def convert_cate_tocol(df, id_cols, cate_col, multi_cate, merge = True):
    """One-hot (or multi-hot) encode one categorical column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Rows are processed by position, so any index is accepted.
    id_cols : list of str
        Columns copied from ``df`` into the result; also the merge keys.
    cate_col : str
        Column to encode. Each cell holds one category value, or a list of
        category values when ``multi_cate`` is true.
    multi_cate : bool
        Whether the cells of ``cate_col`` are lists of categories.
    merge : bool, default True
        When true, the encoded frame is merged back onto ``df`` on ``id_cols``
        so the result also carries the remaining columns of ``df``.

    Returns
    -------
    pandas.DataFrame
        One 0/1 integer column per distinct category, named
        ``'<cate_col>_<category>'`` in ``np.unique`` order, followed by
        ``id_cols`` (and the other columns of ``df`` when ``merge`` is true).
    """
    if multi_cate:
        row_categories = [set(cats) for cats in df[cate_col]]
        # Flatten with a comprehension instead of sum(lists, []), which is quadratic.
        all_categories = np.unique([cat for cats in df[cate_col] for cat in cats])
    else:
        row_categories = [set([cat]) for cat in df[cate_col]]
        all_categories = np.unique(df[cate_col])

    # Fill the indicator matrix directly; avoids building a row Series per record.
    indicators = np.zeros((len(df), len(all_categories)), dtype=int)
    for row_pos, cats in enumerate(row_categories):
        for col_pos, category in enumerate(all_categories):
            if category in cats:
                indicators[row_pos, col_pos] = 1

    res_df = pd.DataFrame(
        indicators,
        columns=['%s_%s' % (cate_col, category) for category in all_categories])

    for col in id_cols:
        # Use .values so the ids are copied by position. Assigning the Series
        # itself aligns on index and yields NaN when df has no 0..n-1 index.
        res_df[col] = df[col].values

    if merge:
        res_df = pd.merge(res_df, df, on = id_cols)
    return res_df

# Read data

## movie

In [4]:
# Keep column 1 (movie_id) plus columns 16-34 (the category_* flags and name).
# list(range(...)) keeps the concatenation valid on Python 3, where range() is not a list.
movie_all = pd.read_csv('../../data/pre-processed/movie_all.csv').iloc[:, [1] + list(range(16, 35))]

In [5]:
movie_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3706 entries, 0 to 3705
Data columns (total 20 columns):
movie_id                3706 non-null int64
category_Action         3706 non-null int64
category_Adventure      3706 non-null int64
category_Animation      3706 non-null int64
category_Children's     3706 non-null int64
category_Comedy         3706 non-null int64
category_Crime          3706 non-null int64
category_Documentary    3706 non-null int64
category_Drama          3706 non-null int64
category_Fantasy        3706 non-null int64
category_Film-Noir      3706 non-null int64
category_Horror         3706 non-null int64
category_Musical        3706 non-null int64
category_Mystery        3706 non-null int64
category_Romance        3706 non-null int64
category_Sci-Fi         3706 non-null int64
category_Thriller       3706 non-null int64
category_War            3706 non-null int64
category_Western        3706 non-null int64
name                    3706 non-null object
dtypes: int6

In [6]:
movie_all[movie_all['movie_id'] == 1193]

Unnamed: 0,movie_id,category_Action,category_Adventure,category_Animation,category_Children's,category_Comedy,category_Crime,category_Documentary,category_Drama,category_Fantasy,category_Film-Noir,category_Horror,category_Musical,category_Mystery,category_Romance,category_Sci-Fi,category_Thriller,category_War,category_Western,name
189,1193,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,One Flew Over the Cuckoo's Nest (1975)


## user

In [7]:
# Load the pre-processed user table, skipping the first CSV column.
user_info = pd.read_csv('../../data/pre-processed/user_all.csv').iloc[:, 1:]
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(user_info.head())

   age gender  occupation  user_id zipcode
0    1      F          10        1   48067
1   56      M          16        2   70072
2   25      M          15        3   55117
3   45      M           7        4   02460
4   25      M          20        5   55455


In [8]:
# Cast zipcode to int. NOTE: this drops leading zeros ('02460' becomes 2460 in the
# output below); the column is deleted again before the feature arrays are built.
user_info['zipcode'] = np.array(user_info['zipcode']).astype(int)

In [9]:
user_info.head()

Unnamed: 0,age,gender,occupation,user_id,zipcode
0,1,F,10,1,48067
1,56,M,16,2,70072
2,25,M,15,3,55117
3,45,M,7,4,2460
4,25,M,20,5,55455


In [10]:
user_info.columns

Index([u'age', u'gender', u'occupation', u'user_id', u'zipcode'], dtype='object')

In [11]:
# One-hot encode the single-valued occupation code and keep the original user columns.
user_info_final = convert_cate_tocol(
    user_info,
    id_cols=['user_id'],
    cate_col='occupation',
    multi_cate=False,
    merge=True)

In [12]:
type(user_info_final['age'][0])

numpy.int64

In [13]:
# Maps each raw age code found in user_info['age'] to a consecutive ordinal index (1..7).
lookup_age= {1:1, 18:2, 25:3, 35:4, 45:5, 50:6, 56:7}

In [14]:
# Replace every raw age code by its ordinal index from lookup_age.
# An unknown code raises KeyError, so re-running this cell on already-mapped
# values fails for anything other than 1.
user_info_final['age'] = [
    lookup_age[raw_age_code]
    for raw_age_code in user_info_final['age']
]

## rating

In [15]:
import re
def read_rating(filename):
    """Parse a '::'-delimited ratings file into a DataFrame.

    Each non-blank line is expected to look like
    ``user_id::movie_id::rating::timestamp``.

    Parameters
    ----------
    filename : str
        Path of the ratings file.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id``, ``rating``, ``timestamp``, ``user_id`` (in that
        order). All values are kept as strings, exactly as read.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than four '::'-separated fields.
    """
    records = []
    with open(filename) as f:
        for line in f:
            # Strip only the line terminator. Slicing off the last character
            # would eat a digit when the final line has no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                continue  # tolerate blank lines (e.g. at end of file)
            fields = line.split('::')
            records.append((fields[1], fields[2], fields[3], fields[0]))
    # Pin the column order so it does not depend on dict ordering.
    return pd.DataFrame(records, columns=['movie_id', 'rating', 'timestamp', 'user_id'])

In [16]:
# read_rating returns every column as strings; the id columns are cast to int in the next cells.
rating = read_rating('../../data/ml-1m/ratings.dat')
rating.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,1193,5,978300760,1
1,661,3,978302109,1
2,914,3,978301968,1
3,3408,4,978300275,1
4,2355,5,978824291,1


In [17]:
# Cast the string movie ids to int so they can be joined against movie_all.
rating['movie_id'] = rating['movie_id'].astype(int)

In [18]:
# Cast the string user ids to int so they can be joined against user_info_final.
rating['user_id'] = rating['user_id'].astype(int)

## merge

In [19]:
# Attach the movie features to every rating row; the left join keeps ratings
# whose movie_id is missing from movie_all (their features become NaN).
rating = pd.merge(rating, movie_all, on =['movie_id'], how = 'left')
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(rating.head())
rating.info()

   movie_id rating  timestamp  user_id  category_Action  category_Adventure  \
0      1193      5  978300760        1                0                   0   
1       661      3  978302109        1                0                   0   
2       914      3  978301968        1                0                   0   
3      3408      4  978300275        1                0                   0   
4      2355      5  978824291        1                0                   0   

   category_Animation  category_Children's  category_Comedy  category_Crime  \
0                   0                    0                0               0   
1                   1                    1                0               0   
2                   0                    0                0               0   
3                   0                    0                0               0   
4                   1                    1                1               0   

                    ...                    categor

In [20]:
# Attach the user features to every rating row; the left join keeps ratings
# whose user_id is missing from user_info_final (their features become NaN).
rating = pd.merge(rating, user_info_final, on =['user_id'], how = 'left')
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(rating.head())
rating.info()

   movie_id rating  timestamp  user_id  category_Action  category_Adventure  \
0      1193      5  978300760        1                0                   0   
1       661      3  978302109        1                0                   0   
2       914      3  978301968        1                0                   0   
3      3408      4  978300275        1                0                   0   
4      2355      5  978824291        1                0                   0   

   category_Animation  category_Children's  category_Comedy  category_Crime  \
0                   0                    0                0               0   
1                   1                    1                0               0   
2                   0                    0                0               0   
3                   0                    0                0               0   
4                   1                    1                1               0   

    ...     occupation_15  occupation_16  occupati

In [21]:
# The ids were only needed as join keys; drop them before building the feature arrays.
for key_col in ['movie_id', 'user_id']:
    del rating[key_col]

In [22]:
rating.head()

Unnamed: 0,rating,timestamp,category_Action,category_Adventure,category_Animation,category_Children's,category_Comedy,category_Crime,category_Documentary,category_Drama,...,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20,age,gender,occupation,zipcode
0,5,978300760,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,F,10,48067
1,3,978302109,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,F,10,48067
2,3,978301968,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,F,10,48067
3,4,978300275,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,F,10,48067
4,5,978824291,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,1,F,10,48067


## scale

In [23]:
# Parse the string ratings as floats and divide by 5 (e.g. '4' -> 0.8).
rating['rating'] = rating['rating'].astype(float) / 5.0

In [24]:
# Remove the columns that are not used as features: the raw occupation code
# (already one-hot encoded), the movie title, the zipcode and the timestamp.
for unused_col in ['occupation', 'name', 'zipcode', 'timestamp']:
    del rating[unused_col]

In [25]:
# Binary gender flag: 1 where the value is 'F', 0 for everything else.
rating['gender'] = (rating['gender'] == 'F').astype(int)

In [26]:
from sklearn.utils import shuffle
# Fixed seed so the shuffled order, and therefore the positional train/test
# split taken further down, is the same on every run.
rating = shuffle(rating, random_state=0)

In [27]:
rating.head()

Unnamed: 0,rating,category_Action,category_Adventure,category_Animation,category_Children's,category_Comedy,category_Crime,category_Documentary,category_Drama,category_Fantasy,...,occupation_13,occupation_14,occupation_15,occupation_16,occupation_17,occupation_18,occupation_19,occupation_20,age,gender
645086,0.8,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,3,0
247113,0.4,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,3,0
710877,0.8,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
524282,0.8,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,4,0
594868,0.2,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0


In [28]:
# Column layout at this point: rating | 18 category_* flags | occupation_* flags, age, gender.
#   y      -> column 0       (scaled rating)
#   x_item -> columns 1..18  (movie features)
#   x_user -> columns 19..   (user features)
y, x_item, x_user = (
    np.array(rating.iloc[:, cols])
    for cols in (0, slice(1, 19), slice(19, None))
)

## separate train | test

In [29]:
# Use the first 80% of the rows for training (truncated to a whole number of rows).
n_train = int(len(rating) * 0.8)
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(n_train)

800167


In [30]:
# Positional split: the first n_train rows are the training set, the rest the test set.
x_item_train, x_item_test = x_item[:n_train, :], x_item[n_train:, :]
x_user_train, x_user_test = x_user[:n_train, :], x_user[n_train:, :]
y_train, y_test = y[:n_train], y[n_train:]

In [31]:
# Turn the training targets into a single-column 2-D array.
y_train = np.reshape(y_train, (-1, 1))

In [32]:
y_train.shape

(800167, 1)

In [33]:
# Turn the test targets into a single-column 2-D array.
y_test = np.reshape(y_test, (-1, 1))

In [34]:
# Convert the test arrays to float32.
# NOTE(review): the training arrays are not cast the same way -- confirm this is intended.
x_item_test = x_item_test.astype(np.float32)
x_user_test = x_user_test.astype(np.float32)
y_test = y_test.astype(np.float32)

In [35]:
# Write every split to disk with np.savetxt.
# NOTE: no delimiter is passed, so np.savetxt uses its default (a single space)
# even though the files are named '.csv'.
for out_path, array in [
    ('../../data/input_formated/withuser/x_item_train.csv', x_item_train),
    ('../../data/input_formated/withuser/x_user_train.csv', x_user_train),
    ('../../data/input_formated/withuser/y_train.csv', y_train),
    ('../../data/input_formated/withuser/x_item_test.csv', x_item_test),
    ('../../data/input_formated/withuser/x_user_test.csv', x_user_test),
    ('../../data/input_formated/withuser/y_test.csv', y_test),
]:
    np.savetxt(out_path, array)