In [1]:
import pandas as pd
import numpy as np

## Remove timestamp from the data and save original data as csv 

### movielens 100k data

In [3]:
data_100k = pd.read_csv('data/ml-100k/u.data', sep="\t", header=None)

In [4]:
data_100k.columns = ['user_id', 'movie_id', 'rating', "time_stamp"]

In [5]:
data_100k.head()

Unnamed: 0,user_id,movie_id,rating,time_stamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
data_100k = data.drop('time_stamp', axis=1)

In [7]:
# Uncomment only when need to write user data to csv

#data_100k.to_csv("data/user_data/ml_100k_user_data.csv", index=False)

## Processing exposure model data

In [9]:
data = pd.read_csv('data/ml_100k_user_data.csv')

In [10]:
data = data.drop(["rating"], axis=1)
data.head()

Unnamed: 0,user_id,movie_id
0,196,242
1,186,302
2,22,377
3,244,51
4,166,346


In [11]:
# create one hot encoding using movie id

one_hot = pd.get_dummies(data.movie_id)

In [12]:
# generate column names for one hot dataframe --> format movid_id_#id_number

one_hot.columns = ["movie_id_"+str(i) for i in range(1, len(data.movie_id.unique())+1)]

In [13]:
one_hot['user_id'] = data.user_id

In [14]:
cols = one_hot.columns.tolist()
cols = cols[-1:] + cols[:-1]
one_hot = one_hot[cols]
one_hot.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1673,movie_id_1674,movie_id_1675,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682
0,196,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
unique_user_id = one_hot.user_id.unique().tolist()

In [16]:
grouped = one_hot.groupby("user_id")

In [17]:
exposure_model_data = pd.DataFrame(columns=one_hot.columns)

In [18]:
for i in unique_user_id:
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    t_cols = temp.columns.tolist()
    t_cols = t_cols[-1:] + t_cols[:-1]
    temp = temp[t_cols]
    exposure_model_data = pd.concat([exposure_model_data, temp], ignore_index=True)

In [19]:
exposure_model_data.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1673,movie_id_1674,movie_id_1675,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682
0,196,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,186,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,1,0,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
exposure_model_data.shape

(943, 1683)

In [21]:
# Uncomment only when exposure model write to csv is needed

#exposure_model_data.to_csv("data/exposure_model_data.csv", index=False)

## Preprocess rating model data

In [2]:
data = pd.read_csv('data/ml_100k_user_data.csv')

In [3]:
data.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [4]:
one_hot = pd.get_dummies(data.movie_id)

In [5]:
one_hot.shape

(100000, 1682)

In [6]:
one_hot.columns = ["movie_id_"+str(i) for i in range(1, len(data.movie_id.unique())+1)]

In [7]:
one_hot["user_id"] = data.user_id
one_hot["rating"] = data.rating

In [8]:
cols = one_hot.columns.tolist()
cols = cols[-2:] + cols[:-2]
one_hot = one_hot[cols]
one_hot.head()

Unnamed: 0,user_id,rating,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,...,movie_id_1673,movie_id_1674,movie_id_1675,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682
0,196,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,166,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
one_hot.shape

(100000, 1684)

In [10]:
for i in range(one_hot.shape[0]):
    for j in range(1, 1683):
        if one_hot["movie_id_"+str(j)][i] == 1:
            one_hot["movie_id_"+str(j)][i] = one_hot["rating"][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [17]:
one_hot = one_hot.drop(['rating'], axis=1)
one_hot.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1673,movie_id_1674,movie_id_1675,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682
0,196,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
unique_user_id = one_hot.user_id.unique().tolist()

In [31]:
grouped = one_hot.groupby("user_id")

In [32]:
rating_model_data = pd.DataFrame(columns=one_hot.columns)

In [33]:
for i in unique_user_id:
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    t_cols = temp.columns.tolist()
    t_cols = t_cols[-1:] + t_cols[:-1]
    temp = temp[t_cols]
    rating_model_data = pd.concat([rating_model_data, temp], ignore_index=True)

In [34]:
rating_model_data.shape

(943, 1683)

In [39]:
# Do not uncomment unless you want to write the rating model data again to the csv

# rating_model_data.to_csv("data/rating_model_data.csv", index=False)

In [41]:
# incomplete code --> optimized code to replace the one's of the one hot encoding dataframe by the given rating by the user  

"""
data_list = list()
for i in range(1, 1683):
    if one_hot.loc['movie_id_'+str(i)] == 1:
        one_hot.loc["movie_id_"+str(i)] = rows["rating"]
        print(one_hot.head())
        break
"""

'\ndata_list = list()\nfor i in range(1, 1683):\n    if one_hot.loc[\'movie_id_\'+str(i)] == 1:\n        one_hot.loc["movie_id_"+str(i)] = rows["rating"]\n        print(one_hot.head())\n        break\n'