# Chatbot: Load data

In [3]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Relative root that the two locations below are derived from.
BASE_PATH = '../../..'

# Directory appended to sys.path so the local modules can be imported.
LIB_PATH = f'{BASE_PATH}/lib'

# Directory handed to the dataset factory as its `path` argument.
DATASET_PATH = f'{BASE_PATH}/datasets'

In [5]:
import sys
sys.path.append(LIB_PATH)

import numpy as np
import pandas as pd

import util as ut

import torch
import data as dt
import data.dataset as ds
import requests
import json

2024-01-04 18:38:38.897092: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-04 18:38:39.770064: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-04 18:38:39.783152: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

<Figure size 640x480 with 0 Axes>

In [6]:
def to_tensor(obs, device, columns):
    """Select `columns` from `obs` and return the selection as a tensor on `device`.

    Parameters
    ----------
    obs : any object supporting ``obs[columns]``
        The observation(s) to read from.
    device : torch.device or str
        Forwarded to ``Tensor.to``.
    columns : key or list of keys
        Used to index `obs`.

    Returns
    -------
    torch.Tensor
        The selected data, moved to `device`.
    """
    data = obs[columns]
    # A DataFrame selection is unwrapped to its underlying array before being
    # handed to torch.tensor. isinstance (rather than an exact type comparison)
    # makes DataFrame subclasses take the same path.
    if isinstance(data, pd.DataFrame):
        data = data.values
    return torch.tensor(data).to(device)

def features_fn(obs, device):
    """Return the 'user_seq' and 'movie_seq' columns of `obs` as a tensor on `device`."""
    return to_tensor(obs, device, ['user_seq', 'movie_seq'])


def target_fn(obs, device):
    """Return the 'user_movie_rating' column of `obs` as a tensor on `device`."""
    return to_tensor(obs, device, ['user_movie_rating'])

In [7]:
def _rated_since_2004(df):
    """Keep only the rows whose 'user_movie_rating_year' is 2004 or later."""
    recent = df['user_movie_rating_year'] >= 2004
    return df[recent]


# Build the dataset from DATASET_PATH on the CPU, wiring in the feature/target
# transforms and the row filter defined above.
cpu_device = torch.device('cpu')
dataset = ds.MovieLensTMDBDatasetFactory.from_path(
    path=DATASET_PATH,
    transform=features_fn,
    target_transform=target_fn,
    device=cpu_device,
    filter_fn=_rated_since_2004,
)

In [8]:
# Movie-level columns to keep for the item catalogue.
item_columns = [
    'movie_id',
    'movie_title',
    'movie_genres',
    'movie_overview',
    'movie_release_year',
    'movie_imdb_id',
]

# Project the dataset onto those columns and keep a single row per movie_id.
items = dataset.data[item_columns].drop_duplicates(subset=['movie_id'])

In [10]:
base_url = 'http://nonosoft.ddns.net:8080/api/v1'

PAGE_SIZE            = 500  # maximum number of items sent in one bulk request
POST_TIMEOUT_SECONDS = 60   # fail a request instead of blocking the kernel forever


def to_api_item(row):
    """Convert one row of `items` into the dict that is posted to `/items/bulk`.

    Ids, release year and IMDb id are sent as strings; title and description
    are stripped of surrounding whitespace; genres are lower-cased.
    """
    return {
        'id'          : str(row['movie_id']),
        'title'       : row['movie_title'].strip(),
        'description' : row['movie_overview'].strip(),
        'genres'      : [g.lower() for g in row['movie_genres']],
        'release'     : str(row['movie_release_year']),
        'imdb_id'     : str(row['movie_imdb_id'])
    }


# Build every payload up front, then slice the list into pages. Slicing
# guarantees that each row lands in exactly one page and that the trailing,
# partially filled page is sent as well.
records = [to_api_item(row) for _, row in items.iterrows()]

error_items = []  # payloads of every page whose upload failed
headers     = {"Content-Type": "application/json"}

for start in range(0, len(records), PAGE_SIZE):
    page = records[start:start + PAGE_SIZE]
    try:
        response = requests.post(
            f'{base_url}/items/bulk',
            data    = json.dumps(page),
            headers = headers,
            timeout = POST_TIMEOUT_SECONDS
        )
        # Turn HTTP 4xx/5xx answers into exceptions so they are recorded as
        # failures instead of being silently treated as successful uploads.
        response.raise_for_status()
    except (requests.RequestException, TypeError, ValueError) as e:
        # Best effort: report the failure, remember the page, keep uploading.
        # TypeError/ValueError cover payloads json.dumps cannot serialize.
        print(f'Bulk upload failed for items {start}-{start + len(page) - 1}: {e}')
        error_items.extend(page)