In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

## Remove timestamp from the data and save original data as csv 

### convert movielens 100k u.dat (rating data) to csv

In [None]:
data_100k = pd.read_csv('data/ml-100k/u.data', sep="\t", header=None)

In [None]:
data_100k.columns = ['user_id', 'movie_id', 'rating', "time_stamp"]

In [None]:
data_100k = data_100k.drop('time_stamp', axis=1)

In [None]:
data_100k.head()

In [None]:
# Uncomment only when need to write user data to csv

#data_100k.to_csv("data/user_data/ml_100k_user_data.csv", index=False)

### movielens 1m data ratings.dat (rating data) to csv

In [None]:
data_1m = pd.read_csv('data/ml-1m/ratings.dat', sep="::", header=None)

In [None]:
data_1m.columns = ['user_id', 'movie_id', 'rating', "time_stamp"]

In [None]:
data_1m = data_1m.drop('time_stamp', axis=1)

In [None]:
data_1m.head()

In [None]:
# Uncomment only when need to write user data to csv

#data_1m.to_csv("data/user_data/ml_1m_user_data.csv", index=False)

## Helper function to skrink the dataframe size by converting the datatypes of the columns

In [2]:
def shrink_df(df):
    df.user_id = df.user_id.astype(np.uint16)
    df.movie_id = df.movie_id.astype(np.uint16)
    df.rating = df.rating.astype(np.uint8)
    
    return df

# 100k data -- 100% data -- exposure model

In [17]:
data_100k = pd.read_csv('data/user_data/ml_100k_user_data.csv')

### Converting 100k user data to one hot encoding for exposure model

In [18]:
# create one hot encoding using movie id

one_hot = pd.get_dummies(data_100k.movie_id, prefix="movie_id", dtype=np.uint8)

In [19]:
one_hot['user_id'] = data_100k.user_id

In [37]:
# clearing dataframe from memory which won't be required further for faster execution of below blocks

del data_100k

In [20]:
one_hot = one_hot[one_hot.columns.tolist()[-1:] + one_hot.columns.tolist()[:-1]]
one_hot.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1673,movie_id_1674,movie_id_1675,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682
0,196,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
grouped = one_hot.groupby("user_id")

In [22]:
exposure_model_data = pd.DataFrame(columns=one_hot.columns)

In [23]:
# grouping the one hot encoded user_id wise

for i in tqdm(one_hot.user_id.unique().tolist()):
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    temp = temp[temp.columns.tolist()[-1:] + temp.columns.tolist()[:-1]]
    exposure_model_data = pd.concat([exposure_model_data, temp], ignore_index=True)

100%|██████████| 943/943 [00:23<00:00, 39.81it/s]


In [24]:
exposure_model_data.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1673,movie_id_1674,movie_id_1675,movie_id_1676,movie_id_1677,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681,movie_id_1682
0,196,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,186,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,1,0,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,166,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
exposure_model_data.shape

(943, 1683)

In [16]:
# Uncomment only when exposure model data write to csv is needed

#exposure_model_data.to_csv("data/preprocess_data/preproc_100k_exposure_data.csv", index=False)

# 80 - 20 split -- 100K data -- exposure model

In [3]:
df = pd.read_csv("data/user_data/ml_100k_user_data.csv")

In [4]:
movie_col_list = ["movie_id_"+str(i) for i in df.movie_id.unique()]

In [5]:
y = df.rating
X = df.drop(["rating"], axis=1)

In [6]:
del df

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
X_train['rating'] = y_train
X_test['rating'] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
X_train = shrink_df(X_train)
X_test = shrink_df(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [10]:
X_train.to_csv("data/preprocess_data/n_preproc_100k_udata_train.csv", index=False)

In [11]:
X_test.to_csv("data/preprocess_data/n_preproc_100k_udata_test.csv", index=False)

In [12]:
# Convert to one hot vector

In [13]:
one_hot_X_train_100k = pd.get_dummies(X_train.movie_id, prefix="movie_id", dtype=np.uint8)

In [14]:
one_hot_X_test_100k = pd.get_dummies(X_test.movie_id, prefix="movie_id", dtype=np.uint8)

In [15]:
# group the one hot vector by user_id

In [16]:
one_hot_X_train_100k['user_id'] = X_train.user_id
one_hot_X_test_100k['user_id'] = X_test.user_id

In [17]:
del X_train
del X_test

In [18]:
one_hot_X_train_100k = one_hot_X_train_100k[one_hot_X_train_100k.columns.tolist()[-1:] + 
                                            one_hot_X_train_100k.columns.tolist()[:-1]]

one_hot_X_train_100k.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1668,movie_id_1670,movie_id_1671,movie_id_1672,movie_id_1673,movie_id_1676,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681
75220,807,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48955,474,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44966,463,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13568,139,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92727,621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
one_hot_X_test_100k = one_hot_X_test_100k[one_hot_X_test_100k.columns.tolist()[-1:] + 
                                            one_hot_X_test_100k.columns.tolist()[:-1]]

one_hot_X_test_100k.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1648,movie_id_1649,movie_id_1655,movie_id_1656,movie_id_1658,movie_id_1669,movie_id_1674,movie_id_1675,movie_id_1677,movie_id_1682
75721,877,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80184,815,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19864,94,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76699,416,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92991,500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Add columns with Zeros for the movie id not included in the train test one hot vector df  

In [28]:
missing_cols_train = list(set(movie_col_list) - set(one_hot_X_train_100k.columns))

In [31]:
for col in missing_cols_train:
    one_hot_X_train_100k[col] = np.zeros(len(one_hot_X_train_100k.user_id), dtype=np.uint8)

In [32]:
one_hot_X_train_100k.shape

(80000, 1683)

In [29]:
missing_cols_test = list(set(movie_col_list) - set(one_hot_X_test_100k.columns))

In [33]:
for col in missing_cols_test:
    one_hot_X_test_100k[col] = np.zeros(len(one_hot_X_test_100k.user_id), dtype=np.uint8)

In [34]:
one_hot_X_test_100k.shape

(20000, 1683)

In [36]:
# group the one hot vectors based on user_id

In [20]:
grouped_train = one_hot_X_train_100k.groupby("user_id")
grouped_test = one_hot_X_test_100k.groupby("user_id")

In [21]:
exposure_model_data_train = pd.DataFrame(columns=one_hot_X_train_100k.columns)

In [22]:
exposure_model_data_test = pd.DataFrame(columns=one_hot_X_test_100k.columns)

In [23]:
# grouping the one hot encoded user_id wise

In [24]:
for i in tqdm(one_hot_X_train_100k.user_id.unique().tolist()):
    temp = pd.DataFrame(grouped_train.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    temp = temp[temp.columns.tolist()[-1:] + temp.columns.tolist()[:-1]]
    exposure_model_data_train = pd.concat([exposure_model_data_train, temp], ignore_index=True)

100%|██████████| 943/943 [00:11<00:00, 81.06it/s]


In [25]:
for i in tqdm(one_hot_X_test_100k.user_id.unique().tolist()):
    temp = pd.DataFrame(grouped_test.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    temp = temp[temp.columns.tolist()[-1:] + temp.columns.tolist()[:-1]]
    exposure_model_data_test = pd.concat([exposure_model_data_test, temp], ignore_index=True)

100%|██████████| 940/940 [00:09<00:00, 96.22it/s] 


In [26]:
exposure_model_data_train.shape

(943, 1654)

In [27]:
exposure_model_data_test.shape

(940, 1412)

In [48]:
# writing onto csv

In [28]:
exposure_model_data_train.to_csv("data/preprocess_data/n_preproc_100k_exposure_data_train.csv", index=False)

In [29]:
exposure_model_data_test.to_csv("data/preprocess_data/n_preproc_100k_exposure_data_test.csv", index=False)

# 1m data -- 100% data -- exposure model

In [26]:
data_1m = pd.read_csv('data/user_data/ml_1m_user_data.csv')

In [27]:
# create one hot encoding using movie id

one_hot = pd.get_dummies(data_1m.movie_id, prefix="movie_id", dtype=np.uint8)

In [28]:
one_hot['user_id'] = data_1m.user_id

In [36]:
# clearing dataframe from memory which won't be required further for faster execution of below blocks

del data_1m

In [29]:
one_hot = one_hot[one_hot.columns.tolist()[-1:] + one_hot.columns.tolist()[:-1]]
one_hot.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_3943,movie_id_3944,movie_id_3945,movie_id_3946,movie_id_3947,movie_id_3948,movie_id_3949,movie_id_3950,movie_id_3951,movie_id_3952
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
grouped = one_hot.groupby("user_id")

In [31]:
exposure_model_data = pd.DataFrame(columns=one_hot.columns)

In [32]:
# grouping the one hot encoded user_id wise

for i in tqdm(one_hot.user_id.unique().tolist()):
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    temp = temp[temp.columns.tolist()[-1:] + temp.columns.tolist()[:-1]]
    exposure_model_data = pd.concat([exposure_model_data, temp], ignore_index=True)

100%|██████████| 6040/6040 [33:00<00:00,  3.05it/s]


In [33]:
exposure_model_data.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_3943,movie_id_3944,movie_id_3945,movie_id_3946,movie_id_3947,movie_id_3948,movie_id_3949,movie_id_3950,movie_id_3951,movie_id_3952
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
exposure_model_data.shape

(6040, 3707)

In [35]:
# Uncomment only when exposure model data write to csv is needed

#exposure_model_data.to_csv("data/preprocess_data/preproc_1m_exposure_data.csv", index=False)

# 100% data -- 100k -- rating model data

In [None]:
data_100k = pd.read_csv('data/user_data/ml_100k_user_data.csv')

In [None]:
one_hot = pd.get_dummies(data_100k.movie_id, prefix="movie_id", dtype=np.uint8)

In [None]:
one_hot["user_id"] = data_100k.user_id
one_hot["rating"] = data_100k.rating

In [None]:
cols = one_hot.columns.tolist()
cols = cols[-2:] + cols[:-2]
one_hot = one_hot[cols]
one_hot.head()

In [None]:
one_hot.shape

In [None]:
for i in range(one_hot.shape[0]):
    for j in range(1, 1683):
        if one_hot["movie_id_"+str(j)][i] == 1:
            one_hot["movie_id_"+str(j)][i] = one_hot["rating"][i]
            
#for col in tqdm(one_hot.columns.to_list()[:-1]):
#    one_hot[col] = np.multiply(one_hot.rating, one_hot[col])

In [None]:
one_hot = one_hot.drop(['rating'], axis=1)
one_hot.head()

In [None]:
unique_user_id = one_hot.user_id.unique().tolist()

In [None]:
grouped = one_hot.groupby("user_id")

In [None]:
rating_model_data = pd.DataFrame(columns=one_hot.columns)

In [None]:
for i in unique_user_id:
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    t_cols = temp.columns.tolist()
    t_cols = t_cols[-1:] + t_cols[:-1]
    temp = temp[t_cols]
    rating_model_data = pd.concat([rating_model_data, temp], ignore_index=True)

In [None]:
rating_model_data.shape

In [None]:
# Do not uncomment unless you want to write the rating model data again to the csv

# rating_model_data.to_csv("data/preprocess_data/preproc_100k_rating_data.csv", index=False)

# 80 - 20 split -- 100k data -- rating model data 

In [30]:
df = pd.read_csv("data/user_data/ml_100k_user_data.csv")

In [31]:
movie_col_list = ["movie_id_"+str(i) for i in df.movie_id.unique()]

In [32]:
y = df.rating
X = df.drop(["rating"], axis=1)

In [33]:
del df

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
X_train['rating'] = y_train
X_test['rating'] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [36]:
X_train = shrink_df(X_train)
X_test = shrink_df(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [37]:
one_hot_X_train = pd.get_dummies(X_train.movie_id, prefix="movie_id", dtype=np.uint8)
one_hot_X_test = pd.get_dummies(X_test.movie_id, prefix="movie_id", dtype=np.uint8)

In [38]:
one_hot_X_train["user_id"] = X_train.user_id
one_hot_X_train["rating"] = X_train.rating

In [39]:
one_hot_X_test["user_id"] = X_test.user_id
one_hot_X_test["rating"] = X_test.rating

In [40]:
cols = one_hot_X_train.columns.tolist()
cols = cols[-2:] + cols[:-2]
one_hot_X_train = one_hot_X_train[cols]
one_hot_X_train.head()

Unnamed: 0,user_id,rating,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,...,movie_id_1668,movie_id_1670,movie_id_1671,movie_id_1672,movie_id_1673,movie_id_1676,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681
75220,807,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48955,474,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44966,463,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13568,139,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92727,621,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
cols = one_hot_X_test.columns.tolist()
cols = cols[-2:] + cols[:-2]
one_hot_X_test = one_hot_X_test[cols]
one_hot_X_test.head()

Unnamed: 0,user_id,rating,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,...,movie_id_1648,movie_id_1649,movie_id_1655,movie_id_1656,movie_id_1658,movie_id_1669,movie_id_1674,movie_id_1675,movie_id_1677,movie_id_1682
75721,877,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80184,815,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19864,94,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76699,416,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92991,500,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
missing_cols_train= list(set(movie_col_list) - set(one_hot_X_train.columns))
missing_cols_test = list(set(movie_col_list) - set(one_hot_X_test.columns))

In [18]:
for col in missing_cols_train:
    one_hot_X_train[col] = np.zeros(len(one_hot_X_train.user_id), dtype=np.uint8)

In [19]:
for col in missing_cols_test:
    one_hot_X_test[col] = np.zeros(len(one_hot_X_test.user_id), dtype=np.uint8)

In [42]:
for col in tqdm(one_hot_X_train.columns.to_list()[2:]):
    one_hot_X_train[col] = np.multiply(one_hot_X_train.rating, one_hot_X_train[col])

100%|██████████| 1653/1653 [00:00<00:00, 1932.45it/s]


In [43]:
one_hot_X_train.head()

Unnamed: 0,user_id,rating,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,...,movie_id_1668,movie_id_1670,movie_id_1671,movie_id_1672,movie_id_1673,movie_id_1676,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681
75220,807,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48955,474,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44966,463,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13568,139,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92727,621,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
for col in tqdm(one_hot_X_test.columns.to_list()[2:]):
    one_hot_X_test[col] = np.multiply(one_hot_X_test.rating, one_hot_X_test[col])

100%|██████████| 1411/1411 [00:00<00:00, 2959.63it/s]


In [45]:
one_hot_X_test.head()

Unnamed: 0,user_id,rating,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,...,movie_id_1648,movie_id_1649,movie_id_1655,movie_id_1656,movie_id_1658,movie_id_1669,movie_id_1674,movie_id_1675,movie_id_1677,movie_id_1682
75721,877,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80184,815,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19864,94,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76699,416,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92991,500,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
one_hot_X_train = one_hot_X_train.drop(['rating'], axis=1)
one_hot_X_train.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1668,movie_id_1670,movie_id_1671,movie_id_1672,movie_id_1673,movie_id_1676,movie_id_1678,movie_id_1679,movie_id_1680,movie_id_1681
75220,807,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48955,474,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44966,463,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13568,139,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92727,621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
one_hot_X_test = one_hot_X_test.drop(['rating'], axis=1)
one_hot_X_test.head()

Unnamed: 0,user_id,movie_id_1,movie_id_2,movie_id_3,movie_id_4,movie_id_5,movie_id_6,movie_id_7,movie_id_8,movie_id_9,...,movie_id_1648,movie_id_1649,movie_id_1655,movie_id_1656,movie_id_1658,movie_id_1669,movie_id_1674,movie_id_1675,movie_id_1677,movie_id_1682
75721,877,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80184,815,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19864,94,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76699,416,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92991,500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
unique_user_id_train = one_hot_X_train.user_id.unique().tolist()
unique_user_id_test = one_hot_X_test.user_id.unique().tolist()

In [56]:
grouped_train = one_hot_X_train.groupby("user_id")
grouped_test = one_hot_X_test.groupby("user_id")

In [57]:
rating_model_df_train = pd.DataFrame(columns=one_hot_X_train.columns)
rating_model_df_test = pd.DataFrame(columns=one_hot_X_test.columns)

In [58]:
for i in tqdm(unique_user_id_train):
    temp = pd.DataFrame(grouped_train.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    t_cols = temp.columns.tolist()
    t_cols = t_cols[-1:] + t_cols[:-1]
    temp = temp[t_cols]
    rating_model_df_train = pd.concat([rating_model_df_train, temp], ignore_index=True)

100%|██████████| 943/943 [00:12<00:00, 73.92it/s]


In [59]:
for i in tqdm(unique_user_id_test):
    temp = pd.DataFrame(grouped_test.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    t_cols = temp.columns.tolist()
    t_cols = t_cols[-1:] + t_cols[:-1]
    temp = temp[t_cols]
    rating_model_df_test = pd.concat([rating_model_df_test, temp], ignore_index=True)

100%|██████████| 940/940 [00:10<00:00, 91.23it/s] 


In [60]:
rating_model_df_train.shape

(943, 1654)

In [61]:
rating_model_df_test.shape

(940, 1412)

In [62]:
rating_model_df_train.to_csv("data/preprocess_data/n_preproc_100k_rating_data_train.csv", index=False)

In [63]:
rating_model_df_test.to_csv("data/preprocess_data/n_preproc_100k_rating_data_test.csv", index=False)

### Preprocssing 1m user data for rating model

In [77]:
data_1m = pd.read_csv('data/user_data/ml_1m_user_data.csv')

In [78]:
# converting the 1m user data to one hot encoding, so that we can multiply the seen movies (1's) with there corresponding rating

one_hot = pd.get_dummies(data_1m.movie_id, prefix="movie_id", dtype=np.uint8)

In [79]:
one_hot["rating"] = data_1m.rating.astype(np.uint8)

In [80]:
user_ids = data_1m.user_id

In [81]:
# clearing dataframe from memory for fastere execution of below block

del data_1m

In [82]:
for col in tqdm(one_hot.columns.to_list()[:-1]):
    one_hot[col] = np.multiply(one_hot.rating, one_hot[col])

100%|██████████| 3706/3706 [01:07<00:00, 54.68it/s]


In [None]:
one_hot = one_hot.drop(['rating'], axis=1)
one_hot.head()

In [None]:
one_hot["user_id"] = user_ids

In [None]:
del user_ids

In [None]:
grouped = one_hot.groupby("user_id")

In [None]:
rating_model_data = pd.DataFrame(columns=one_hot.columns)
#del one_hot

In [None]:
for i in tqdm(grouped.user_id.unique().tolist()):
    temp = pd.DataFrame(grouped.get_group(i).drop("user_id", axis=1).sum()).T
    temp["user_id"] = i
    temp = temp[temp.columns.tolist()[-1:] + temp.columns.tolist()[:-1]]
    rating_model_data = pd.concat([rating_model_data, temp], ignore_index=True)

In [None]:
rating_model_data.shape

In [None]:
rating_model_data = rating_model_data[rating_model_data.columns.to_list()[-1:] + rating_model_data.columns.to_list()[:-1]]
rating_model_data = rating_model_data.drop(['rating'], axis=1)
rating_model_data.head()

In [22]:
# Do not uncomment unless you want to write the rating model data again to the csv

#rating_model_data.to_csv("data/preprocess_data/preproc_1m_rating_data.csv", index=False)