In [1]:
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import torch.nn as nn
import torch.nn.functional as functional
from torch.autograd import Variable

In [3]:
def convert_cate_tocol(df, id_cols, cate_col, multi_cate, merge = True):
    """One-hot encode a categorical column into one 0/1 column per category.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Any index is accepted; rows are processed by position.
    id_cols : list of str
        Columns copied from ``df`` into the result (and used as the join
        keys when ``merge`` is true).
    cate_col : str
        Name of the column holding the categories.
    multi_cate : bool
        If true, every cell of ``cate_col`` is an iterable of categories
        (e.g. a list); otherwise every cell is a single category value.
    merge : bool, default True
        If true, the indicator columns are joined back onto ``df`` via
        ``pd.merge(..., on=id_cols)``.

    Returns
    -------
    pandas.DataFrame
        Columns named ``'<cate_col>_<category>'`` (categories sorted by
        ``np.unique``), followed by ``id_cols`` and, when merged, the
        remaining columns of ``df``.
    """
    # Collect the sorted set of distinct categories.
    if multi_cate:
        # Flatten in linear time (the former sum(lists, []) was quadratic).
        all_categories = np.unique([cat for cats in df[cate_col] for cat in cats])
    else:
        all_categories = np.unique(df[cate_col])

    def mapping(cat_value):
        # Indicator vector of one cell against all_categories.
        members = set(cat_value) if multi_cate else set([cat_value])
        return np.array([1 if cat in members else 0 for cat in all_categories], dtype=int)

    res_df = pd.DataFrame(
        [mapping(value) for value in df[cate_col]],
        columns=['%s_%s' % (cate_col, cat) for cat in all_categories])

    for col in id_cols:
        # Copy by position: res_df has a fresh 0..n-1 index, so assigning the
        # Series itself would align on index labels and yield NaN whenever df
        # carries a non-default index (e.g. after filtering or shuffling).
        res_df[col] = df[col].values

    if merge:
        # NOTE(review): assumes id_cols identify rows uniquely; duplicated ids
        # would multiply rows in the join -- confirm against callers.
        res_df = pd.merge(res_df, df, on = id_cols)
    return res_df

# Read data

## movie

In [4]:
# Keep column 1 plus columns 16-34 of the pre-processed movie table (20 columns
# in total). range() is wrapped in list() so the concatenation also works on
# Python 3, where range is no longer a list.
movie_all = pd.read_csv('../../data/pre-processed/movie_all.csv').iloc[:, [1] + list(range(16, 35))]

In [5]:
# Sanity check: list the selected columns with their dtypes and non-null counts.
movie_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3706 entries, 0 to 3705
Data columns (total 20 columns):
movie_id                3706 non-null int64
category_Action         3706 non-null int64
category_Adventure      3706 non-null int64
category_Animation      3706 non-null int64
category_Children's     3706 non-null int64
category_Comedy         3706 non-null int64
category_Crime          3706 non-null int64
category_Documentary    3706 non-null int64
category_Drama          3706 non-null int64
category_Fantasy        3706 non-null int64
category_Film-Noir      3706 non-null int64
category_Horror         3706 non-null int64
category_Musical        3706 non-null int64
category_Mystery        3706 non-null int64
category_Romance        3706 non-null int64
category_Sci-Fi         3706 non-null int64
category_Thriller       3706 non-null int64
category_War            3706 non-null int64
category_Western        3706 non-null int64
name                    3706 non-null object
dtypes: int6

In [6]:
# Spot-check a single movie row by id (1193) to eyeball its genre indicators.
movie_all[movie_all['movie_id'] == 1193]

Unnamed: 0,movie_id,category_Action,category_Adventure,category_Animation,category_Children's,category_Comedy,category_Crime,category_Documentary,category_Drama,category_Fantasy,category_Film-Noir,category_Horror,category_Musical,category_Mystery,category_Romance,category_Sci-Fi,category_Thriller,category_War,category_Western,name
189,1193,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,One Flew Over the Cuckoo's Nest (1975)


## rating

In [7]:
import re
def read_rating(filename):
    """Read a '::'-delimited ratings file into a DataFrame of strings.

    Each non-blank line must look like ``user_id::movie_id::rating::timestamp``.
    No type conversion is done here; every value is kept as the string read
    from the file.

    Parameters
    ----------
    filename : str
        Path of the ratings file.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id``, ``rating``, ``timestamp``, ``user_id`` in that
        fixed order, one row per input line.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than four '::'-separated fields.
    """
    # Pin the column order explicitly: later cells select columns by position,
    # and a dict-built frame orders columns differently across Python/pandas
    # versions (alphabetical vs. insertion order).
    columns = ['movie_id', 'rating', 'timestamp', 'user_id']
    records = []
    with open(filename) as f:
        for line_no, line in enumerate(f, 1):
            # Strip only the line terminator. The former [:-1] slice chopped
            # the last timestamp digit when the final line had no newline.
            line = line.rstrip('\r\n')
            if not line:
                continue  # tolerate blank lines (e.g. at end of file)
            fields = line.split('::')
            if len(fields) < 4:
                raise ValueError('%s:%d: expected 4 "::"-separated fields, got %d'
                                 % (filename, line_no, len(fields)))
            user_id, movie_id, rating, timestamp = fields[:4]
            records.append((movie_id, rating, timestamp, user_id))
    return pd.DataFrame(records, columns=columns)

In [8]:
# Load the ml-1m ratings file. read_rating does no type conversion, so every
# column holds strings at this point.
rating = read_rating('../../data/ml-1m/ratings.dat')
rating.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,1193,5,978300760,1
1,661,3,978302109,1
2,914,3,978301968,1
3,3408,4,978300275,1
4,2355,5,978824291,1


In [9]:
# movie_id was read as text; cast it to int so it can be joined against the
# integer movie_id column of movie_all.
rating['movie_id'] = rating['movie_id'].astype(int)

## merge

In [10]:
# Attach the movie columns to every rating. A left join keeps all ratings even
# if a movie_id has no match in movie_all.
# Caution: this overwrites `rating`, so running the cell twice merges twice.
rating = pd.merge(rating, movie_all, on =['movie_id'], how = 'left')
# print(...) with parentheses works on both Python 2 and Python 3.
print(rating.head())
rating.info()

   movie_id rating  timestamp user_id  category_Action  category_Adventure  \
0      1193      5  978300760       1                0                   0   
1       661      3  978302109       1                0                   0   
2       914      3  978301968       1                0                   0   
3      3408      4  978300275       1                0                   0   
4      2355      5  978824291       1                0                   0   

   category_Animation  category_Children's  category_Comedy  category_Crime  \
0                   0                    0                0               0   
1                   1                    1                0               0   
2                   0                    0                0               0   
3                   0                    0                0               0   
4                   1                    1                1               0   

                    ...                    category_Film

In [11]:
# Preview the merged table (ratings plus movie genre indicators and name).
rating.head()

Unnamed: 0,movie_id,rating,timestamp,user_id,category_Action,category_Adventure,category_Animation,category_Children's,category_Comedy,category_Crime,...,category_Film-Noir,category_Horror,category_Musical,category_Mystery,category_Romance,category_Sci-Fi,category_Thriller,category_War,category_Western,name
0,1193,5,978300760,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,One Flew Over the Cuckoo's Nest (1975)
1,661,3,978302109,1,0,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,James and the Giant Peach (1996)
2,914,3,978301968,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,My Fair Lady (1964)
3,3408,4,978300275,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Erin Brockovich (2000)
4,2355,5,978824291,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,"Bug's Life, A (1998)"


## scale

In [12]:
# Convert the rating column to float and divide by 5.
# Caution: the column is overwritten, so re-running this cell divides again.
rating['rating'] = rating['rating'].astype(float) / 5

In [13]:
# Drop the columns that are not written to the output files. Using drop with
# errors='ignore' (instead of `del`) keeps the cell safe to re-run: a second
# execution no longer raises KeyError on the already-removed columns.
rating = rating.drop(['name', 'timestamp'], axis=1, errors='ignore')

In [14]:
from sklearn.utils import shuffle

# Shuffle the rows before the positional train/test split made further down.
# A fixed seed makes the row order -- and therefore the split and the CSV
# files written from it -- reproducible across kernel restarts.
RANDOM_SEED = 42
rating = shuffle(rating, random_state=RANDOM_SEED)

In [15]:
# Preview after shuffling: the original row labels are kept, now in random order.
rating.head()

Unnamed: 0,movie_id,rating,user_id,category_Action,category_Adventure,category_Animation,category_Children's,category_Comedy,category_Crime,category_Documentary,...,category_Fantasy,category_Film-Noir,category_Horror,category_Musical,category_Mystery,category_Romance,category_Sci-Fi,category_Thriller,category_War,category_Western
989984,940,0.8,5978,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
929404,262,0.8,5615,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
991977,2951,0.6,5994,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
669796,2140,0.6,4026,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0
862738,39,0.6,5193,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [52]:
# Select columns by name rather than by position: the former
# `[0] + range(2, ...)` fails on Python 3 and silently picks the wrong columns
# if the column order ever changes.
# x: every column except the target (movie_id, user_id, category_* indicators).
x = rating.drop('rating', axis=1)
# y: the target together with the two id columns.
y = rating[['movie_id', 'rating', 'user_id']]

In [17]:
# Show the column names and their order in the final rating table.
rating.columns

Index([u'movie_id', u'rating', u'user_id', u'category_Action',
       u'category_Adventure', u'category_Animation', u'category_Children's',
       u'category_Comedy', u'category_Crime', u'category_Documentary',
       u'category_Drama', u'category_Fantasy', u'category_Film-Noir',
       u'category_Horror', u'category_Musical', u'category_Mystery',
       u'category_Romance', u'category_Sci-Fi', u'category_Thriller',
       u'category_War', u'category_Western'],
      dtype='object')

## separate train|test

In [18]:
# Share of the (already shuffled) rows that goes into the training split.
TRAIN_FRACTION = 0.8
n_train = int(len(rating) * TRAIN_FRACTION)
# print(...) with parentheses works on both Python 2 and Python 3.
print(n_train)

800167


In [53]:
# Positional split: the first n_train rows form the training set, the
# remaining rows the test set. x and y are cut at the same position.
x_train, x_test = x.iloc[:n_train], x.iloc[n_train:]
y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]

In [20]:
# y_train = y_train.reshape((-1, 1))

In [21]:
# y_test = y_test.reshape((-1, 1))

In [22]:
# x_train = np.array(x_train, dtype = np.float32)
# y_train = np.array(y_train, dtype = np.float32)
# x_test = np.array(x_test, dtype = np.float32)
# y_test = np.array(y_test, dtype = np.float32)

In [54]:
# Write the training split to CSV. to_csv is called without index=False, so the
# row labels are written as an extra first column.
# NOTE(review): presumably the output directory must already exist -- confirm.
x_train.to_csv('../../data/input_formated/nouser/x_train.csv')
y_train.to_csv('../../data/input_formated/nouser/y_train.csv')

In [55]:
# Write the test split to CSV. to_csv is called without index=False, so the
# row labels are written as an extra first column.
# NOTE(review): presumably the output directory must already exist -- confirm.
x_test.to_csv('../../data/input_formated/nouser/x_test.csv')
y_test.to_csv('../../data/input_formated/nouser/y_test.csv')