In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## Remove timestamp from the data and save original data as csv 

### movielens 100k data to csv

In [21]:
data_100k = pd.read_csv('data/ml-100k/u.data', sep="\t", header=None)

In [22]:
data_100k.columns = ['user_id', 'movie_id', 'rating', "time_stamp"]

In [23]:
data_100k = data_100k.drop('time_stamp', axis=1)

In [24]:
data_100k.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [7]:
# Uncomment only when need to write user data to csv

#data_100k.to_csv("data/user_data/ml_100k_user_data.csv", index=False)

### Movielens 1m data

In [25]:
data_1m = pd.read_csv('data/ml-1m/ratings.dat', sep="::", header=None)

  """Entry point for launching an IPython kernel.


In [26]:
data_1m.columns = ['user_id', 'movie_id', 'rating', "time_stamp"]

In [27]:
data_1m = data_1m.drop('time_stamp', axis=1)

In [28]:
data_1m.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [29]:
# Uncomment only when need to write user data to csv

#data_1m.to_csv("data/user_data/ml_1m_user_data.csv", index=False)

### Processing for exposure model data - 100K

In [30]:
data_100k = pd.read_csv('data/user_data/ml_100k_user_data.csv')

In [32]:
data_100k = data_100k.drop(["rating"], axis=1)
data_100k.head()

Unnamed: 0,user_id,movie_id
0,196,242
1,186,302
2,22,377
3,244,51
4,166,346


In [33]:
# create one hot encoding using movie id

one_hot = pd.get_dummies(data_100k.movie_id)

In [34]:
# generate column names for one hot dataframe --> format movid_id_#id_number

one_hot.columns = ["movie_id_"+str(i) for i in range(1, len(data_100k.movie_id.unique())+1)]

In [35]:
one_hot['user_id'] = data_100k.user_id

In [36]:
cols = one_hot.columns.tolist()
cols = cols[-1:] + cols[:-1]
one_hot = one_hot[cols]
one_hot.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1673,movie_id_1674,movie_id_1675,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682
0,196,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
unique_user_id = one_hot.user_id.unique().tolist()

In [38]:
grouped = one_hot.groupby("user_id")

In [39]:
exposure_model_data = pd.DataFrame(columns=one_hot.columns)

In [40]:
for i in unique_user_id:
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    t_cols = temp.columns.tolist()
    t_cols = t_cols[-1:] + t_cols[:-1]
    temp = temp[t_cols]
    exposure_model_data = pd.concat([exposure_model_data, temp], ignore_index=True)

In [41]:
exposure_model_data.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1673,movie_id_1674,movie_id_1675,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682
0,196,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,186,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,1,0,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
exposure_model_data.shape

(943, 1683)

In [21]:
# Uncomment only when exposure model write to csv is needed

#exposure_model_data.to_csv("data/preprocess_data/preproc_100k_exposure_data.csv", index=False)

### Processing for exposure model data - 1m

In [43]:
data_1m = pd.read_csv('data/user_data/ml_1m_user_data.csv')

In [44]:
data_1m = data_1m.drop(["rating"], axis=1)
data_1m.head()

Unnamed: 0,user_id,movie_id
0,1,1193
1,1,661
2,1,914
3,1,3408
4,1,2355


In [45]:
# create one hot encoding using movie id

one_hot = pd.get_dummies(data_1m.movie_id)

In [46]:
# generate column names for one hot dataframe --> format movid_id_#id_number

one_hot.columns = ["movie_id_"+str(i) for i in range(1, len(data_1m.movie_id.unique())+1)]

In [47]:
one_hot['user_id'] = data_1m.user_id

In [48]:
cols = one_hot.columns.tolist()
cols = cols[-1:] + cols[:-1]
one_hot = one_hot[cols]
one_hot.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_3697,movie_id_3698,movie_id_3699,movie_id_3700,movie_id_3701,movie_id_3702,movie_id_3703,movie_id_3704,movie_id_3705,movie_id_3706
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
unique_user_id = one_hot.user_id.unique().tolist()

In [50]:
grouped = one_hot.groupby("user_id")

In [51]:
exposure_model_data = pd.DataFrame(columns=one_hot.columns)

In [52]:
for i in unique_user_id:
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    t_cols = temp.columns.tolist()
    t_cols = t_cols[-1:] + t_cols[:-1]
    temp = temp[t_cols]
    exposure_model_data = pd.concat([exposure_model_data, temp], ignore_index=True)

In [53]:
exposure_model_data.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_3697,movie_id_3698,movie_id_3699,movie_id_3700,movie_id_3701,movie_id_3702,movie_id_3703,movie_id_3704,movie_id_3705,movie_id_3706
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
exposure_model_data.shape

(6040, 3707)

In [55]:
# Uncomment only when exposure model write to csv is needed

#exposure_model_data.to_csv("data/preprocess_data/preproc_1m_exposure_data.csv", index=False)

### Preprocess rating model data 100k

In [2]:
data_100k = pd.read_csv('data/user_data/ml_100k_user_data.csv')

In [3]:
data_100k.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [4]:
one_hot = pd.get_dummies(data_100k.movie_id)

In [5]:
one_hot.shape

(100000, 1682)

In [6]:
one_hot.columns = ["movie_id_"+str(i) for i in range(1, len(data_100k.movie_id.unique())+1)]

In [7]:
one_hot["user_id"] = data_100k.user_id
one_hot["rating"] = data_100k.rating

In [8]:
cols = one_hot.columns.tolist()
cols = cols[-2:] + cols[:-2]
one_hot = one_hot[cols]
one_hot.head()

Unnamed: 0,user_id,rating,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,...,movie_id_1673,movie_id_1674,movie_id_1675,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682
0,196,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,166,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
one_hot.shape

(100000, 1684)

In [10]:
for i in range(one_hot.shape[0]):
    for j in range(1, 1683):
        if one_hot["movie_id_"+str(j)][i] == 1:
            one_hot["movie_id_"+str(j)][i] = one_hot["rating"][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [17]:
one_hot = one_hot.drop(['rating'], axis=1)
one_hot.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1673,movie_id_1674,movie_id_1675,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682
0,196,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
unique_user_id = one_hot.user_id.unique().tolist()

In [31]:
grouped = one_hot.groupby("user_id")

In [32]:
rating_model_data = pd.DataFrame(columns=one_hot.columns)

In [33]:
for i in unique_user_id:
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    t_cols = temp.columns.tolist()
    t_cols = t_cols[-1:] + t_cols[:-1]
    temp = temp[t_cols]
    rating_model_data = pd.concat([rating_model_data, temp], ignore_index=True)

In [34]:
rating_model_data.shape

(943, 1683)

In [39]:
# Do not uncomment unless you want to write the rating model data again to the csv

# rating_model_data.to_csv("data/preprocess_data/preproc_100k_rating_data.csv", index=False)

In [41]:
# incomplete code --> optimized code to replace the one's of the one hot encoding dataframe by the given rating by the user  

"""
data_list = list()
for i in range(1, 1683):
    if one_hot.loc['movie_id_'+str(i)] == 1:
        one_hot.loc["movie_id_"+str(i)] = rows["rating"]
        print(one_hot.head())
        break
"""

'\ndata_list = list()\nfor i in range(1, 1683):\n    if one_hot.loc[\'movie_id_\'+str(i)] == 1:\n        one_hot.loc["movie_id_"+str(i)] = rows["rating"]\n        print(one_hot.head())\n        break\n'

### Preprocess exposure model data 1m data 

In [2]:
data_1m = pd.read_csv('data/user_data/ml_1m_user_data.csv')

In [3]:
data_1m.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [4]:
one_hot = pd.get_dummies(data_1m.movie_id, "movie_id")

In [5]:
one_hot.head()

Unnamed: 0,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,movie_id_10,...,movie_id_3943,movie_id_3944,movie_id_3945,movie_id_3946,movie_id_3947,movie_id_3948,movie_id_3949,movie_id_3950,movie_id_3951,movie_id_3952
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
one_hot["user_id"] = data_1m.user_id
one_hot["rating"] = data_1m.rating

In [7]:
cols = one_hot.columns.tolist()
cols = cols[-2:] + cols[:-2]
one_hot = one_hot[cols]
one_hot.head()

Unnamed: 0,user_id,rating,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,...,movie_id_3943,movie_id_3944,movie_id_3945,movie_id_3946,movie_id_3947,movie_id_3948,movie_id_3949,movie_id_3950,movie_id_3951,movie_id_3952
0,1,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
for i in tqdm(range(one_hot.shape[0])):
    for j in one_hot.columns.to_list()[2:]:
        one_hot[j][i] = one_hot[j][i] * one_hot["rating"][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
  0%|          | 1213/1000209 [17:28:04<14699:06:13, 52.97s/it]

KeyboardInterrupt: 

In [None]:
one_hot = one_hot.drop(['rating'], axis=1)
one_hot.head()

In [None]:
unique_user_id = one_hot.user_id.unique().tolist()

In [None]:
grouped = one_hot.groupby("user_id")

In [None]:
rating_model_data = pd.DataFrame(columns=one_hot.columns)

In [None]:
for i in unique_user_id:
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    t_cols = temp.columns.tolist()
    t_cols = t_cols[-1:] + t_cols[:-1]
    temp = temp[t_cols]
    rating_model_data = pd.concat([rating_model_data, temp], ignore_index=True)

In [None]:
rating_model_data.shape

In [None]:
# Do not uncomment unless you want to write the rating model data again to the csv

rating_model_data.to_csv("data/preprocess_data/preproc_1m_rating_data.csv", index=False)