In [1]:
import pandas as pd
import os
import sys

import yaml
import logging
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers 
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from keras.layers import Dense , Dropout , Flatten , Activation , BatchNormalization , Input , Embedding , Dot
from keras.models import Sequential
from keras.utils import to_categorical

from keras.callbacks import EarlyStopping , ModelCheckpoint  , LearningRateScheduler , TensorBoard , CSVLogger
%matplotlib inline


READING CSV

In [2]:
anime_df = pd.read_csv(r"D:\anime_recommend_mlops\archive\animelist.csv" , low_memory=True , usecols=["user_id","anime_id","rating"])

In [3]:
anime_df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10
3,0,4898,0
4,0,21,10


In [4]:
len(anime_df)

109224747

DATA PROCESSING

In [5]:
n_rating = anime_df['user_id'].value_counts().sort_index()

In [6]:
n_rating

user_id
0          74
1         139
2         494
3         358
4         156
         ... 
353400    110
353401     74
353402     44
353403    214
353404     32
Name: count, Length: 325770, dtype: int64

In [7]:
anime_df = anime_df[anime_df["user_id"].isin(n_rating[n_rating >=400].index)].copy()

In [8]:
len(anime_df)

71418114

In [9]:
min_rating = min(anime_df['rating'])

In [10]:
max_rating = max(anime_df['rating'])

In [11]:
min_rating

0

In [12]:
max_rating

10

In [13]:
avg_rating = np.mean(anime_df['rating'])

In [14]:
avg_rating

np.float64(4.047793589172629)

FEATURE SCALING

In [15]:
anime_df["rating"] = anime_df["rating"].apply(lambda x: (x-min_rating)/(max_rating-min_rating)).values.astype(float)

In [16]:
avg_rating = np.mean(anime_df['rating'])

In [17]:
avg_rating

np.float64(0.4047793589172634)

In [18]:
anime_df.isna().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [19]:
anime_df.duplicated().sum()

np.int64(1)

In [20]:
user_ids = anime_df['user_id'].unique().tolist()
print(user_ids)

[2, 6, 12, 16, 17, 19, 21, 41, 42, 44, 47, 53, 55, 60, 66, 73, 74, 85, 89, 90, 94, 98, 102, 108, 111, 112, 120, 121, 122, 135, 145, 146, 147, 153, 155, 156, 172, 174, 184, 190, 193, 194, 198, 204, 205, 209, 214, 219, 222, 227, 228, 235, 238, 240, 243, 248, 251, 252, 257, 264, 267, 272, 274, 275, 284, 285, 286, 290, 291, 293, 300, 301, 306, 308, 310, 313, 314, 316, 320, 321, 324, 325, 326, 327, 330, 336, 340, 345, 346, 349, 350, 366, 367, 371, 372, 375, 381, 382, 386, 389, 398, 405, 406, 413, 414, 418, 423, 426, 428, 431, 432, 436, 437, 438, 440, 442, 444, 445, 446, 455, 457, 459, 464, 467, 468, 469, 478, 481, 483, 484, 486, 493, 498, 500, 516, 517, 519, 524, 526, 529, 531, 538, 542, 547, 549, 559, 563, 564, 566, 569, 571, 577, 590, 593, 596, 601, 603, 608, 613, 614, 617, 620, 629, 631, 639, 642, 643, 644, 649, 652, 655, 656, 662, 664, 673, 674, 680, 681, 683, 686, 689, 694, 699, 711, 713, 714, 716, 719, 725, 728, 730, 731, 732, 734, 745, 746, 748, 753, 760, 764, 770, 774, 776, 778, 779

In [21]:
user2user_encoded = {x:i for i , x in enumerate(user_ids)}

In [22]:
user2user_encoded

{2: 0,
 6: 1,
 12: 2,
 16: 3,
 17: 4,
 19: 5,
 21: 6,
 41: 7,
 42: 8,
 44: 9,
 47: 10,
 53: 11,
 55: 12,
 60: 13,
 66: 14,
 73: 15,
 74: 16,
 85: 17,
 89: 18,
 90: 19,
 94: 20,
 98: 21,
 102: 22,
 108: 23,
 111: 24,
 112: 25,
 120: 26,
 121: 27,
 122: 28,
 135: 29,
 145: 30,
 146: 31,
 147: 32,
 153: 33,
 155: 34,
 156: 35,
 172: 36,
 174: 37,
 184: 38,
 190: 39,
 193: 40,
 194: 41,
 198: 42,
 204: 43,
 205: 44,
 209: 45,
 214: 46,
 219: 47,
 222: 48,
 227: 49,
 228: 50,
 235: 51,
 238: 52,
 240: 53,
 243: 54,
 248: 55,
 251: 56,
 252: 57,
 257: 58,
 264: 59,
 267: 60,
 272: 61,
 274: 62,
 275: 63,
 284: 64,
 285: 65,
 286: 66,
 290: 67,
 291: 68,
 293: 69,
 300: 70,
 301: 71,
 306: 72,
 308: 73,
 310: 74,
 313: 75,
 314: 76,
 316: 77,
 320: 78,
 321: 79,
 324: 80,
 325: 81,
 326: 82,
 327: 83,
 330: 84,
 336: 85,
 340: 86,
 345: 87,
 346: 88,
 349: 89,
 350: 90,
 366: 91,
 367: 92,
 371: 93,
 372: 94,
 375: 95,
 381: 96,
 382: 97,
 386: 98,
 389: 99,
 398: 100,
 405: 101,
 406: 102,
 

In [23]:
user2user_decoded = { i : x for x , i in enumerate(user_ids)}

In [24]:
user2user_decoded

{2: 0,
 6: 1,
 12: 2,
 16: 3,
 17: 4,
 19: 5,
 21: 6,
 41: 7,
 42: 8,
 44: 9,
 47: 10,
 53: 11,
 55: 12,
 60: 13,
 66: 14,
 73: 15,
 74: 16,
 85: 17,
 89: 18,
 90: 19,
 94: 20,
 98: 21,
 102: 22,
 108: 23,
 111: 24,
 112: 25,
 120: 26,
 121: 27,
 122: 28,
 135: 29,
 145: 30,
 146: 31,
 147: 32,
 153: 33,
 155: 34,
 156: 35,
 172: 36,
 174: 37,
 184: 38,
 190: 39,
 193: 40,
 194: 41,
 198: 42,
 204: 43,
 205: 44,
 209: 45,
 214: 46,
 219: 47,
 222: 48,
 227: 49,
 228: 50,
 235: 51,
 238: 52,
 240: 53,
 243: 54,
 248: 55,
 251: 56,
 252: 57,
 257: 58,
 264: 59,
 267: 60,
 272: 61,
 274: 62,
 275: 63,
 284: 64,
 285: 65,
 286: 66,
 290: 67,
 291: 68,
 293: 69,
 300: 70,
 301: 71,
 306: 72,
 308: 73,
 310: 74,
 313: 75,
 314: 76,
 316: 77,
 320: 78,
 321: 79,
 324: 80,
 325: 81,
 326: 82,
 327: 83,
 330: 84,
 336: 85,
 340: 86,
 345: 87,
 346: 88,
 349: 89,
 350: 90,
 366: 91,
 367: 92,
 371: 93,
 372: 94,
 375: 95,
 381: 96,
 382: 97,
 386: 98,
 389: 99,
 398: 100,
 405: 101,
 406: 102,
 

In [25]:
anime_df["user"] = anime_df["user_id"].map(user2user_encoded)

In [26]:
anime_df.head()

Unnamed: 0,user_id,anime_id,rating,user
213,2,24833,0.0,0
214,2,235,1.0,0
215,2,36721,0.0,0
216,2,40956,0.0,0
217,2,31933,0.0,0


In [27]:
anime_ids = anime_df['anime_id'].unique().tolist()

In [28]:
anime2anime_encoded = { x : i for i , x in enumerate(anime_ids)}

In [None]:
# Mapping and optimization combined to save memory and handle out-of-order execution
if 'user' not in anime_df.columns:
    anime_df['user'] = anime_df['user_id'].map(user2user_encoded).astype('int32')
if 'anime' not in anime_df.columns:
    anime_df['anime'] = anime_df['anime_id'].map(anime2anime_encoded).astype('int32')

anime_df['rating'] = anime_df['rating'].astype('float32')
if 'user_id' in anime_df.columns:
    anime_df = anime_df.drop(columns=['user_id', 'anime_id'])
anime_df = anime_df.sample(frac=1 , random_state=43).reset_index(drop=True)

In [None]:
anime_df["anime"] = anime_df["anime_id"].map(anime2anime_encoded)

In [None]:
anime_df.head()

Unnamed: 0,user_id,anime_id,rating,user,anime
213,2,24833,0.0,0,0
214,2,235,1.0,0,1
215,2,36721,0.0,0,2
216,2,40956,0.0,0,3
217,2,31933,0.0,0,4


In [None]:
# Mapping and optimization combined to save memory and handle out-of-order execution
if 'user' not in anime_df.columns:
    anime_df['user'] = anime_df['user_id'].map(user2user_encoded).astype('int32')
if 'anime' not in anime_df.columns:
    anime_df['anime'] = anime_df['anime_id'].map(anime2anime_encoded).astype('int32')

anime_df['rating'] = anime_df['rating'].astype('float32')
if 'user_id' in anime_df.columns:
    anime_df = anime_df.drop(columns=['user_id', 'anime_id'])
anime_df = anime_df.sample(frac=1 , random_state=43).reset_index(drop=True)

SPLITING DATA

In [None]:
X = anime_df[["user","anime"]].values
y = anime_df["rating"]