In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## Remove timestamp from the data and save original data as csv 

### convert movielens 100k u.dat (rating data) to csv

In [None]:
data_100k = pd.read_csv('data/ml-100k/u.data', sep="\t", header=None)

In [None]:
data_100k.columns = ['user_id', 'movie_id', 'rating', "time_stamp"]

In [None]:
data_100k = data_100k.drop('time_stamp', axis=1)

In [None]:
data_100k.head()

In [None]:
# Uncomment only when need to write user data to csv

#data_100k.to_csv("data/user_data/ml_100k_user_data.csv", index=False)

### movielens 1m data ratings.dat (rating data) to csv

In [None]:
data_1m = pd.read_csv('data/ml-1m/ratings.dat', sep="::", header=None)

In [None]:
data_1m.columns = ['user_id', 'movie_id', 'rating', "time_stamp"]

In [None]:
data_1m = data_1m.drop('time_stamp', axis=1)

In [None]:
data_1m.head()

In [None]:
# Uncomment only when need to write user data to csv

#data_1m.to_csv("data/user_data/ml_1m_user_data.csv", index=False)

### Converting 100k user data to one hot encoding for exposure model

In [17]:
data_100k = pd.read_csv('data/user_data/ml_100k_user_data.csv')

In [18]:
# create one hot encoding using movie id

one_hot = pd.get_dummies(data_100k.movie_id, prefix="movie_id", dtype=np.uint8)

In [19]:
one_hot['user_id'] = data_100k.user_id

In [37]:
# clearing dataframe from memory which won't be required further for faster execution of below blocks

del data_100k

In [20]:
one_hot = one_hot[one_hot.columns.tolist()[-1:] + one_hot.columns.tolist()[:-1]]
one_hot.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1673,movie_id_1674,movie_id_1675,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682
0,196,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
grouped = one_hot.groupby("user_id")

In [22]:
exposure_model_data = pd.DataFrame(columns=one_hot.columns)

In [23]:
# grouping the one hot encoded user_id wise

for i in tqdm(one_hot.user_id.unique().tolist()):
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    temp = temp[temp.columns.tolist()[-1:] + temp.columns.tolist()[:-1]]
    exposure_model_data = pd.concat([exposure_model_data, temp], ignore_index=True)

100%|██████████| 943/943 [00:23<00:00, 39.81it/s]


In [24]:
exposure_model_data.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1673,movie_id_1674,movie_id_1675,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682
0,196,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,186,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,1,0,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
exposure_model_data.shape

(943, 1683)

In [16]:
# Uncomment only when exposure model data write to csv is needed

#exposure_model_data.to_csv("data/preprocess_data/preproc_100k_exposure_data.csv", index=False)

### Converting 1m user data to one hot encoding for exposure model

In [26]:
data_1m = pd.read_csv('data/user_data/ml_1m_user_data.csv')

In [27]:
# create one hot encoding using movie id

one_hot = pd.get_dummies(data_1m.movie_id, prefix="movie_id", dtype=np.uint8)

In [28]:
one_hot['user_id'] = data_1m.user_id

In [36]:
# clearing dataframe from memory which won't be required further for faster execution of below blocks

del data_1m

In [29]:
one_hot = one_hot[one_hot.columns.tolist()[-1:] + one_hot.columns.tolist()[:-1]]
one_hot.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_3943,movie_id_3944,movie_id_3945,movie_id_3946,movie_id_3947,movie_id_3948,movie_id_3949,movie_id_3950,movie_id_3951,movie_id_3952
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
grouped = one_hot.groupby("user_id")

In [31]:
exposure_model_data = pd.DataFrame(columns=one_hot.columns)

In [32]:
# grouping the one hot encoded user_id wise

for i in tqdm(one_hot.user_id.unique().tolist()):
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    temp = temp[temp.columns.tolist()[-1:] + temp.columns.tolist()[:-1]]
    exposure_model_data = pd.concat([exposure_model_data, temp], ignore_index=True)

100%|██████████| 6040/6040 [33:00<00:00,  3.05it/s]


In [33]:
exposure_model_data.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_3943,movie_id_3944,movie_id_3945,movie_id_3946,movie_id_3947,movie_id_3948,movie_id_3949,movie_id_3950,movie_id_3951,movie_id_3952
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
exposure_model_data.shape

(6040, 3707)

In [35]:
# Uncomment only when exposure model data write to csv is needed

#exposure_model_data.to_csv("data/preprocess_data/preproc_1m_exposure_data.csv", index=False)

### Preprocess rating model data 100k

In [None]:
data_100k = pd.read_csv('data/user_data/ml_100k_user_data.csv')

In [None]:
one_hot = pd.get_dummies(data_100k.movie_id, prefix="movie_id", dtype=np.uint8)

In [None]:
one_hot["user_id"] = data_100k.user_id
one_hot["rating"] = data_100k.rating

In [None]:
cols = one_hot.columns.tolist()
cols = cols[-2:] + cols[:-2]
one_hot = one_hot[cols]
one_hot.head()

In [None]:
one_hot.shape

In [None]:
for i in range(one_hot.shape[0]):
    for j in range(1, 1683):
        if one_hot["movie_id_"+str(j)][i] == 1:
            one_hot["movie_id_"+str(j)][i] = one_hot["rating"][i]

In [None]:
one_hot = one_hot.drop(['rating'], axis=1)
one_hot.head()

In [None]:
unique_user_id = one_hot.user_id.unique().tolist()

In [None]:
grouped = one_hot.groupby("user_id")

In [None]:
rating_model_data = pd.DataFrame(columns=one_hot.columns)

In [None]:
for i in unique_user_id:
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    t_cols = temp.columns.tolist()
    t_cols = t_cols[-1:] + t_cols[:-1]
    temp = temp[t_cols]
    rating_model_data = pd.concat([rating_model_data, temp], ignore_index=True)

In [None]:
rating_model_data.shape

In [None]:
# Do not uncomment unless you want to write the rating model data again to the csv

# rating_model_data.to_csv("data/preprocess_data/preproc_100k_rating_data.csv", index=False)

### Preprocess rating model data 1m data 

In [2]:
data_1m = pd.read_csv('data/user_data/ml_1m_user_data.csv')

In [3]:
data_1m.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [4]:
one_hot = pd.get_dummies(data_1m.movie_id, prefix="movie_id", dtype=np.uint8)

In [5]:
one_hot.head()

Unnamed: 0,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,movie_id_10,...,movie_id_3943,movie_id_3944,movie_id_3945,movie_id_3946,movie_id_3947,movie_id_3948,movie_id_3949,movie_id_3950,movie_id_3951,movie_id_3952
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
#one_hot["user_id"] = data_1m.user_id
one_hot["rating"] = data_1m.rating.astype(np.uint8)

In [None]:
#cols = one_hot.columns.tolist()
#cols = cols[-2:] + cols[:-2]
#one_hot = one_hot[cols]
#one_hot.head()

In [7]:
user_ids = data_1m.user_id
del data_1m

In [8]:
for col in one_hot.columns.to_list()[:-1]:
    one_hot[col] = np.multiply(one_hot.rating, one_hot[col])

In [9]:
one_hot.movie_id_661[1]

3

In [10]:
one_hot = one_hot.drop(['rating'], axis=1)
one_hot.head()

Unnamed: 0,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,movie_id_10,...,movie_id_3944,movie_id_3945,movie_id_3946,movie_id_3947,movie_id_3948,movie_id_3949,movie_id_3950,movie_id_3951,movie_id_3952,rating
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


In [11]:
one_hot["user_id"] = user_ids
del user_ids

In [12]:
unique_user_id = one_hot.user_id.unique().tolist()

In [13]:
grouped = one_hot.groupby("user_id")

In [14]:
rating_model_data = pd.DataFrame(columns=one_hot.columns)
del one_hot

In [15]:
for i in tqdm(unique_user_id):
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    temp = temp[temp.columns.tolist()[-1:] + temp.columns.tolist()[:-1]]
    rating_model_data = pd.concat([rating_model_data, temp], ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
100%|██████████| 6040/6040 [26:14<00:00,  3.84it/s]


In [16]:
rating_model_data.shape

(6040, 3708)

In [21]:
cols = rating_model_data.columns.to_list()[-1:] + rating_model_data.columns.to_list()[:-1]
rating_model_data = rating_model_data[cols]
rating_model_data = rating_model_data.drop(['rating'], axis=1)
rating_model_data.head()

Unnamed: 0,user_id,movie_id_1,movie_id_10,movie_id_100,movie_id_1000,movie_id_1002,movie_id_1003,movie_id_1004,movie_id_1005,movie_id_1006,...,movie_id_99,movie_id_990,movie_id_991,movie_id_992,movie_id_993,movie_id_994,movie_id_996,movie_id_997,movie_id_998,movie_id_999
0,1,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,5,0,0,0,0


In [22]:
# Do not uncomment unless you want to write the rating model data again to the csv

#rating_model_data.to_csv("data/preprocess_data/preproc_1m_rating_data.csv", index=False)