In [None]:
# Folder the source CSV files are read from, and folder the files prepared
# for Amazon Personalize are written to.
data_dir, personalize_data_dir = "anime-data", "personalize-data"

# Exploración

In [None]:
# Delete the "html folder" directory inside the data folder (the backslash escapes the space in its name).
!rm -rf {data_dir}/html\ folder
# List what is left in the data folder, with sizes in human-readable units.
!ls -als {data_dir} -h

In [None]:
import time
from time import sleep
import random
from random import randrange
import json
import datetime
import boto3
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 20)         # Keep the output on one page
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline

In [None]:
# Load the anime catalogue indexed by its MAL_ID column and show one random row.
anime_csv_path = data_dir + '/anime.csv'
anime = pd.read_csv(anime_csv_path, index_col='MAL_ID')
anime.sample(1)

### Podemos ver que este dataset no tiene una columna de tiempo, la cual es necesaria para entrenar el modelo. Más adelante te mostraré cómo lo solucionamos.

In [None]:
# Summary statistics of the catalogue; describe() is called with an empty percentile list.
anime.describe(percentiles=[])

### Es un dataset con 17562 líneas, suficiente para entrenar un buen modelo.

In [None]:
# Count how many titles carry each genre. `Genres` holds one comma-separated
# string per anime; spaces are removed before splitting, the same normalisation
# applied later when the GENRE item-metadata column is built.
# Every row of the catalogue is scanned: restricting the loop to the first ten
# rows would make the "genre distribution" chart describe only those ten titles.
full_genre_list = [
    genre
    for genre_string in anime['Genres']
    for genre in genre_string.replace(' ', '').split(',')
]
genres, count = np.unique(full_genre_list, return_counts=True)

In [None]:
# Two side-by-side panels: score histogram on the left, genre counts on the right.
fig, ax = plt.subplots(1, 2, figsize=(20, 8))
score_axis, genre_axis = ax

# Rows whose Score is the string 'Unknown' are excluded before casting to float.
known_scores = anime[anime.Score != 'Unknown']['Score'].astype(float)
score_axis.hist(known_scores, range=[1, 10])
score_axis.set_title('Distribucion de los Ratings de Anime')

genre_axis.barh(y=genres, width=count)
genre_axis.set_title('Distribucion de los Generos de Anime');

In [None]:
# Load the catalogue variant that includes the synopsis text and preview ten random rows.
synopsis_csv_path = data_dir + '/anime_with_synopsis.csv'
anime_with_synopsis = pd.read_csv(synopsis_csv_path, index_col='MAL_ID')
anime_with_synopsis.sample(10)

In [None]:
# Summary statistics of the synopsis table; describe() is called with an empty percentile list.
anime_with_synopsis.describe(percentiles=[])

In [None]:
# Load the watching-status table and display it in full.
watching_status_csv_path = data_dir + '/watching_status.csv'
watching_status = pd.read_csv(watching_status_csv_path)
watching_status

### rating_complete
El rating del usuario que ha visto todo el anime

In [None]:
# Load the user ratings for anime that were watched to the end and preview ten random rows.
rating_complete_csv_path = data_dir + '/rating_complete.csv'
rating_complete = pd.read_csv(rating_complete_csv_path)
rating_complete.sample(10)

In [None]:
# Summary statistics of the ratings table; describe() is called with an empty percentile list.
rating_complete.describe(percentiles=[])

In [None]:
# Histogram of the ratings stored in rating_complete, restricted to the 1-10 range.
fig, ax = plt.subplots(figsize=(10, 8))
ax.hist(rating_complete.rating, range=[1, 10])
ax.set_title('Distribucion de los Ratings de Anime Vistos');

### Animelist

El rating del usuario junto con el indicador de watching status (todos los estados)

In [None]:
# Only the first 100 rows of animelist.csv are read; ten of them are shown at random.
animelist_csv_path = data_dir + '/animelist.csv'
animelist = pd.read_csv(animelist_csv_path, nrows=100)
animelist.sample(10)

# Preparación del Dataset
## Interacciones

In [None]:
!mkdir $personalize_data_dir

In [None]:
# Report, per column, whether any value is missing, then show the full summary statistics.
has_missing_values = rating_complete.isna().any()
print(has_missing_values)
rating_complete.describe()

Amazon Personalize requiere que las interacciones vengan con un TIMESTAMP, que no tenemos. 
Vamos a sintetizar esta información 
https://docs.aws.amazon.com/personalize/latest/dg/interactions-datasets.html#interactions-dataset-requirements


In [None]:
# First derive a synthetic release timestamp from the premiere year plus a random offset.

def get_release_year(row):
    """Return a synthetic release moment for one anime row as a Unix timestamp.

    Despite its name, the function returns a full timestamp (seconds since the
    epoch), not a year: a random instant between January 1st of the premiere
    year and 30 days before today.

    Args:
        row: Mapping-like object (for example a DataFrame row) whose
            'Premiered' entry looks like "<season> <year>". When the value is
            not a string, has no second word, or that word is not an integer,
            the year falls back to 2020.

    Returns:
        int: The random timestamp. If the premiere year leaves no room before
        the 30-day cut-off (current or future year), the cut-off itself is
        returned instead of raising ValueError from an empty random range.
    """
    default_year = 2020
    premiered = row['Premiered']
    parts = premiered.split(' ') if isinstance(premiered, str) else []
    try:
        year = int(parts[1]) if len(parts) > 1 else default_year
    except ValueError:
        year = default_year

    start_date = datetime.datetime(year, 1, 1)
    # Releases are placed at least 30 days in the past.
    latest_date = datetime.datetime.today() - datetime.timedelta(days=30)
    window_days = (latest_date - start_date).days
    if window_days <= 0:
        # randrange() rejects an empty range; clamp to the latest allowed date.
        return int(latest_date.timestamp())

    # The offset is drawn in seconds so releases spread over the whole window.
    offset_seconds = random.randrange(window_days * 60 * 60 * 24)
    random_date = start_date + datetime.timedelta(seconds=offset_seconds)
    return int(random_date.timestamp())

In [None]:
# Work on an independent copy of the metadata columns, attach the synthetic
# release timestamp to every title, and drop the raw 'Premiered' text afterwards.
metadata_columns = ['Name', 'Genres', 'Premiered', 'Studios', 'Rating']
anime_copy = anime[metadata_columns].copy()
anime_copy['release_date'] = anime_copy.apply(get_release_year, axis=1)
anime_copy = anime_copy.drop(columns='Premiered')
anime_copy.head()

In [None]:
# Without enough compute capacity (instances smaller than ml.t3.medium) the
# full dataset cannot be processed, so this exercise uses 10% of it.
SAMPLE_FRACTION = 0.1
# `n` is the total number of ratings; it has to be defined here, otherwise
# the cell raises NameError on a fresh kernel.
n = len(rating_complete)
n_sample = int(n * SAMPLE_FRACTION)
print(n_sample)

In [None]:
%%time
#Si vas a emplear el 15% del dataset:
rating_with_date = rating_complete.sample(n_sample).join(anime_copy[['release_date']],on='anime_id')
#Si vas a emplear el 100% del dataset:
#rating_with_date = rating_complete.join(anime_copy[['release_date']],on='anime_id')

In [None]:
%%time
rating_with_date = rating_complete.join(anime_copy[['release_date']],on='anime_id')

In [None]:
%%time
now = int(datetime.datetime.today().timestamp())
rating_with_date['time_since_release']= (now - rating_with_date['release_date']) - 7

In [None]:
%%time
rating_with_date['random_seconds']= (rating_with_date['time_since_release']).apply(lambda x: randrange(x))

In [None]:
%%time
rating_with_date['TIMESTAMP'] = rating_with_date['release_date'] + rating_with_date['random_seconds']

In [None]:
# Keep only the columns that go into the interactions file.
interaction_columns = ['user_id', 'anime_id', 'TIMESTAMP', 'rating']
interactions_df = rating_with_date[interaction_columns]

In [None]:
# Rename to the field names declared in the interactions schema and tag every
# row as a RATING event. A new frame is built instead of mutating in place:
# interactions_df is a column slice of rating_with_date, so in-place edits
# trigger pandas' chained-assignment warning and make the cell non-idempotent.
# TIMESTAMP already has its final name, so it needs no mapping.
interactions_df = (
    interactions_df
    .rename(columns={'user_id': 'USER_ID', 'anime_id': 'ITEM_ID', 'rating': 'EVENT_VALUE'})
    .assign(EVENT_TYPE='RATING')
)
interactions_df.head()

In [None]:
# Write the interactions to CSV without the index; floats are formatted with no decimals.
interactions_filename = "interactions.csv"
interactions_csv_path = personalize_data_dir + "/" + interactions_filename
interactions_df.to_csv(interactions_csv_path, index=False, float_format='%.0f')

In [None]:
# Confirm the interactions file was written and check its size.
!ls -alh {personalize_data_dir}/{interactions_filename}

## Creando el dataset groups y el interactions dataset

In [None]:
# boto3 clients for the Personalize API and for the Personalize runtime API.
personalize, personalize_runtime = (
    boto3.client(service_name) for service_name in ('personalize', 'personalize-runtime')
)

In [None]:
# Create the dataset group, keep its ARN for later calls, and print the raw response.
create_dataset_group_response = personalize.create_dataset_group(name="personalize-anime")
dataset_group_arn = create_dataset_group_response['datasetGroupArn']

response_as_json = json.dumps(create_dataset_group_response, indent=2)
print(response_as_json)

In [None]:
# Poll the dataset group every 10 seconds until it is ACTIVE or has failed,
# giving up after 3 hours.
max_time = time.time() + 3 * 60 * 60
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn=dataset_group_arn
    )
    status = describe_dataset_group_response["datasetGroup"]["status"]
    print("DatasetGroup: {}".format(status))

    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(10)

### Create the dataset

In [None]:
# Schema of the interactions dataset: one (field name, type) pair per CSV
# column, in the order they are declared to Personalize.
interactions_fields = [
    ("USER_ID", "string"),
    ("ITEM_ID", "string"),
    ("TIMESTAMP", "long"),
    ("EVENT_VALUE", "float"),
    ("EVENT_TYPE", "string"),
]

# `schema` is kept as a second name bound to the same dict.
interactions_schema = schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {"name": field_name, "type": field_type}
        for field_name, field_type in interactions_fields
    ],
    "version": "1.0",
}

# Register the schema (sent as a JSON string) and keep its ARN.
create_schema_response = personalize.create_schema(
    name="personalize-anime-interactions1",
    schema=json.dumps(interactions_schema),
)
interaction_schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))

In [None]:
# Create the INTERACTIONS dataset inside the dataset group, bound to the schema above.
dataset_type = "INTERACTIONS"
interactions_dataset_request = {
    "name": "personalize-anime-interactions1",
    "datasetType": dataset_type,
    "datasetGroupArn": dataset_group_arn,
    "schemaArn": interaction_schema_arn,
}
create_dataset_response = personalize.create_dataset(**interactions_dataset_request)

interactions_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

## Configure an S3 bucket and an IAM role

In [None]:
# Read the notebook's resource metadata file; the region is the fourth
# colon-separated component of the resource ARN stored in it.
with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:
    data = json.load(notebook_info)

resource_arn = data['ResourceArn']
region = resource_arn.split(':')[3]
print(region)

In [None]:
# Bucket name is "<account id>-<region>-personalize-anime-dataset1".
s3 = boto3.client('s3')
account_id = boto3.client('sts').get_caller_identity().get('Account')
bucket_name = "-".join([account_id, region, "personalize-anime-dataset1"])
print(bucket_name)

# us-east-1 is created without a LocationConstraint; every other region passes its own name.
create_bucket_args = {'Bucket': bucket_name}
if region != "us-east-1":
    create_bucket_args['CreateBucketConfiguration'] = {'LocationConstraint': region}
s3.create_bucket(**create_bucket_args)

In [None]:
# Upload the interactions CSV to the bucket root and remember its S3 URI.
interactions_file_path = personalize_data_dir + "/" + interactions_filename
interactions_object = boto3.Session().resource('s3').Bucket(bucket_name).Object(interactions_filename)
interactions_object.upload_file(interactions_file_path)
interactions_s3DataPath = "s3://" + bucket_name + "/" + interactions_filename

# Bucket Policy

In [None]:
# Bucket policy that lets the Personalize service principal list the bucket
# and run every action matching "s3:*Object" on the bucket and its objects.
bucket_arn = "arn:aws:s3:::{}".format(bucket_name)
bucket_objects_arn = "arn:aws:s3:::{}/*".format(bucket_name)

personalize_access_statement = {
    "Sid": "PersonalizeS3BucketAccessPolicy",
    "Effect": "Allow",
    "Principal": {"Service": "personalize.amazonaws.com"},
    "Action": ["s3:*Object", "s3:ListBucket"],
    "Resource": [bucket_arn, bucket_objects_arn],
}

policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [personalize_access_statement],
}

s3.put_bucket_policy(Bucket=bucket_name, Policy=json.dumps(policy))

# crea Rol

In [None]:
iam = boto3.client("iam")

# Trust policy: only the Personalize service may assume this role.
role_name = "PersonalizeRoleAnime"
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "Service": "personalize.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
    ]
}

# The role name is fixed, so a second run of this cell would fail with
# EntityAlreadyExists. Reuse the existing role in that case; both responses
# expose the ARN under ["Role"]["Arn"].
try:
    create_role_response = iam.create_role(
        RoleName = role_name,
        AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
    )
except iam.exceptions.EntityAlreadyExistsException:
    create_role_response = iam.get_role(RoleName = role_name)

# AmazonPersonalizeFullAccess provides access to any S3 bucket with a name that includes "personalize" or "Personalize" 
# if you would like to use a bucket with a different name, please consider creating and attaching a new policy
# that provides read access to your bucket or attaching the AmazonS3ReadOnlyAccess policy to the role
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonPersonalizeFullAccess"
iam.attach_role_policy(
    RoleName = role_name,
    PolicyArn = policy_arn
)

# Now add S3 support
# NOTE(review): AmazonS3FullAccess is much broader than the read access the
# comment above recommends; consider a bucket-scoped policy instead.
iam.attach_role_policy(
    PolicyArn='arn:aws:iam::aws:policy/AmazonS3FullAccess',
    RoleName=role_name
)
time.sleep(60) # wait for a minute to allow IAM role policy attachment to propagate

role_arn = create_role_response["Role"]["Arn"]
print(role_arn)

## Import the interactions data

In [None]:
# Start importing the uploaded interactions CSV into the interactions dataset,
# using the IAM role created for Personalize.
interactions_data_location = "s3://{}/{}".format(bucket_name, interactions_filename)
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName="personalize-poc-anime-interactions",
    datasetArn=interactions_dataset_arn,
    dataSource={"dataLocation": interactions_data_location},
    roleArn=role_arn,
)
dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

In [None]:
%%time

max_time = time.time() + 6*60*60 # 6 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("DatasetImportJob: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(10)

In [None]:
# Persist the paths, ARNs and names created so far with IPython's %store magic
# so they can be restored later with `%store -r`.
%store data_dir
%store personalize_data_dir
%store interactions_dataset_arn
%store dataset_group_arn
%store bucket_name
%store role_arn
%store role_name
# NOTE(review): data_dir is already stored at the top of this cell; this second call is redundant.
%store data_dir
%store region
%store interaction_schema_arn

In [None]:
# Restore every variable previously saved with %store (useful after a kernel restart).
%store -r

# Validating and Importing Item Metadata

In [None]:
# Reload the anime catalogue (indexed by MAL_ID) for the item-metadata section and show one random row.
catalogue_path = data_dir + '/anime.csv'
anime = pd.read_csv(catalogue_path, index_col='MAL_ID')
anime.sample(1)

In [None]:
# Build the item-metadata frame as one chain that returns a new DataFrame.
# Assigning columns to, and dropping in place from, a column slice of `anime`
# triggers pandas' chained-assignment warning and makes the cell non-idempotent.
#   year  : second word of 'Premiered' ("<season> <year>"), or 2000 when there is no space.
#   GENRE : genres with spaces removed, joined with '|'.
# NOTE(review): the fallback year here is 2000 while get_release_year falls
# back to 2020; confirm which default is intended.
items = (
    anime[['Name', 'Genres', 'Premiered', 'Studios']]
    .assign(
        year=lambda df: df['Premiered'].apply(
            lambda x: int(x.split(' ')[1]) if ' ' in x else 2000
        ),
        GENRE=lambda df: df['Genres'].apply(
            lambda x: '|'.join(x.replace(' ', '').split(','))
        ),
    )
    .drop(columns=['Name', 'Genres', 'Premiered'])
    .reset_index()
)
items.sample(5)

In [None]:
# Rename to the upper-case field names used by the items schema.
item_column_names = {'year':'YEAR', 'MAL_ID':'ITEM_ID', 'Studios':'STUDIOS'}
itemmetadata_df = items.rename(columns=item_column_names)

In [None]:
# Write the item metadata to CSV without the index; floats are formatted with no decimals.
itemmetadata_filename = "item-meta.csv"
itemmetadata_csv_path = f"{personalize_data_dir}/{itemmetadata_filename}"
itemmetadata_df.to_csv(itemmetadata_csv_path, index=False, float_format='%.0f')

In [None]:
# Confirm the item-metadata file was written and check its size.
!ls -alh {personalize_data_dir}/{itemmetadata_filename}
# Show the first lines of the file to check the header and formatting.
!head {personalize_data_dir}/{itemmetadata_filename}

In [None]:
# Preview five random rows of the item metadata.
itemmetadata_df.sample(5)

In [None]:
# Re-create the boto3 clients for the Personalize API and the Personalize runtime API.
personalize, personalize_runtime = (
    boto3.client(service_name) for service_name in ('personalize', 'personalize-runtime')
)

### Create the dataset

In [None]:
# Schema of the items dataset; STUDIOS and GENRE are flagged as categorical.
itemmetadata_fields = [
    {"name": "ITEM_ID", "type": "string"},
    {"name": "STUDIOS", "type": "string", "categorical": True},
    {"name": "YEAR", "type": "int"},
    {"name": "GENRE", "type": "string", "categorical": True},
]

itemmetadata_schema = {
    "type": "record",
    "name": "Items",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": itemmetadata_fields,
    "version": "1.0",
}

# Register the schema (sent as a JSON string) and keep its ARN.
create_schema_response = personalize.create_schema(
    name="personalize-anime-item",
    schema=json.dumps(itemmetadata_schema),
)
itemmetadataschema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))

In [None]:
# Create the ITEMS dataset inside the dataset group, bound to the item-metadata schema.
dataset_type = "ITEMS"
items_dataset_request = {
    "name": "personalize-anime-items",
    "datasetType": dataset_type,
    "datasetGroupArn": dataset_group_arn,
    "schemaArn": itemmetadataschema_arn,
}
create_dataset_response = personalize.create_dataset(**items_dataset_request)

items_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

### Upload data to S3

In [None]:
# Upload the item-metadata CSV to the bucket root and remember its S3 URI.
itemmetadata_file_path = f"{personalize_data_dir}/{itemmetadata_filename}"
boto3.Session().resource('s3').Bucket(bucket_name).Object(itemmetadata_filename).upload_file(itemmetadata_file_path)
# The URI gets its own variable: storing it in `interactions_s3DataPath`
# would overwrite the interactions file's URI with the item-metadata one.
itemmetadata_s3DataPath = "s3://"+bucket_name+"/"+itemmetadata_filename

## Import the item metadata 

In [None]:
# Start importing the uploaded item-metadata CSV into the items dataset,
# using the IAM role created for Personalize.
itemmetadata_data_location = "s3://{}/{}".format(bucket_name, itemmetadata_filename)
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName="personalize-anime-item-import",
    datasetArn=items_dataset_arn,
    dataSource={"dataLocation": itemmetadata_data_location},
    roleArn=role_arn,
)

dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

In [None]:
%%time

max_time = time.time() + 6*60*60 # 6 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("DatasetImportJob: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(10)

In [None]:
# Persist the items dataset ARN and its schema ARN with IPython's %store magic.
%store items_dataset_arn
%store itemmetadataschema_arn

# Fin de la preparación y exploración

In [None]:
# With no arguments, %store lists the variables currently saved in IPython's store.
%store