### Importing necessary libraries

In [1]:
# Importing necessary libraries 
import time 
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import boto3
import botocore
import sagemaker
from sagemaker import get_execution_role
from boto3 import client
import os 

### Setting Sagemaker session and bucket

In [2]:
# Pointing to default bucket
sagemaker_session = sagemaker.Session()
bucket = sagemaker.Session().default_bucket()
prefix = 'sagemaker/aws-recommender-system'
execution_role = sagemaker.get_execution_role()
region=boto3.Session().region_name

# Returning S3 bucket values 
print(f"S3 Bucket: {bucket}", f"Prefix: {prefix}", f"Execution Role: {execution_role}", f"Region: {region}", 
      sep="\n")

S3 Bucket: sagemaker-us-east-1-981537519177
Prefix: sagemaker/aws-recommender-system
Execution Role: arn:aws:iam::981537519177:role/service-role/AmazonSageMaker-ExecutionRole-20210707T184971
Region: us-east-1


### Functions - Importing data from an S3 bucket

In [16]:
# Function borrowed from AWS sample notebook with some modifications to the print outputs 
def check_bucket_permissions(bucket):
    permission = False 
    # Checks if a given s3 bucket exists with the bucket specified 
    try:
        boto3.Session().client('s3').head_bucket(Bucket=bucket)
    except botocore.exceptions.ParamValidationError as e:
        print(f"s3 bucket: {bucket} does not exist or was not provided.")
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "403":
            print(f"User does not have access to bucket: {bucket}")
        elif e.response['Error']['Code'] == "404":
            print(f"Bucket: {bucket} does not exist")
        else:
            raise
    # If no exceptions were raised then indicate that S3 bucket exists and user has access
    else:
        permission = True 
    return permission

# Function to return list of files from S3
def list_files_from_s3(bucket, prefix):
    conn = client('s3') 
    file_list = []
    for key in conn.list_objects(Bucket=bucket, Prefix=prefix)['Contents']:
        file_name = key['Key'].partition(f"{prefix}/")
        if len(file_name[2]) > 0: 
            file_list.append(file_name[2])
    return file_list

# Function to download files from s3
def download_from_s3(bucket, dataset_location, file_names):
    s3 = boto3.resource('s3')
    for fn in file_names:
        file = f"{dataset_location}/{fn}"
        file_name = str(fn)
        try:
            s3.Bucket(bucket).download_file(file, str(file_name))
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print(f"Object {file} does not exist.")
            else:
                raise

### Start of Script

In [19]:
# Obtain list of files from s3 bucket 
file_list = list_files_from_s3(bucket, prefix)
file_list

['links.csv', 'movies.csv', 'ratings.csv', 'tags.csv']

In [22]:
# Checking bucket permissions 
check_bucket_permissions(bucket)

# Dowloading data from S3 bucket to local notebook environment 
dataset_location = prefix
download_from_s3(bucket, dataset_location, file_list)

In [23]:
# List files in current directory 
os.listdir(os.curdir)

['tags.csv',
 'links.csv',
 'Recommender_System.ipynb',
 '.ipynb_checkpoints',
 'ratings.csv',
 'README.md',
 '.git',
 'Configuration_Parameters.ipynb',
 'movies.csv']

In [27]:
# Importing ratings 
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [26]:
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [144]:
# Obtaining ratings
ratings = pd.read_csv('ratings.csv')

# Obtaining movies 
movies = pd.read_csv('movies.csv')

# Merging based on movieId
movie_ratings = pd.merge(movies, ratings, on='movieId')

# Display dataframes 
display('Ratings', ratings, 'Movies', movies, 'movie_ratings', movie_ratings)


'Ratings'

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


'Movies'

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


'movie_ratings'

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [145]:
# Obtaining tags 
tags = pd.read_csv('tags.csv')

# Merging tags dataset 
merged_data = pd.merge(movie_ratings, tags, how ='outer', on=['movieId','userId'], indicator=True)

display('tags', tags, 'merged data', merged_data)



'tags'

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


'merged data'

Unnamed: 0,movieId,title,genres,userId,rating,timestamp_x,tag,timestamp_y,_merge
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,9.649827e+08,,,left_only
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,8.474350e+08,,,left_only
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1.106636e+09,,,left_only
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1.510578e+09,,,left_only
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1.305696e+09,,,left_only
...,...,...,...,...,...,...,...,...,...
102879,6016,,,573,,,not seen,1.186589e+09,right_only
102880,6157,,,573,,,bad,1.186589e+09,right_only
102881,6157,,,573,,,Ben Affleck,1.186589e+09,right_only
102882,273,,,600,,,gothic,1.237739e+09,right_only


In [146]:
# Placing missing titles and genres based on movieId 
merged_data.loc[merged_data.title.isnull(), 'title'] = merged_data['movieId'].map(movies.title)
merged_data.loc[merged_data.genres.isnull(), 'genres'] = merged_data['movieId'].map(movies.genres)

# Dropping unneccessary columns 
merged_data.drop(['timestamp_x', 'timestamp_y'], axis = 1, inplace=True)

# Saving merged data to CSV 
merged_data.to_csv('merged_data.csv')

merged_data

Unnamed: 0,movieId,title,genres,userId,rating,tag,_merge
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,,left_only
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,,left_only
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,,left_only
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,,left_only
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,,left_only
...,...,...,...,...,...,...,...
102879,6016,Kiss Kiss Bang Bang (2005),Comedy|Crime|Mystery|Thriller,573,,not seen,right_only
102880,6157,Aquamarine (2006),Children|Comedy|Fantasy,573,,bad,right_only
102881,6157,Aquamarine (2006),Children|Comedy|Fantasy,573,,Ben Affleck,right_only
102882,273,"Secret of Roan Inish, The (1994)",Children|Drama|Fantasy|Mystery,600,,gothic,right_only


### One Hot Encoding the genres column 
https://www.kaggle.com/texasdave/movie-rating-predictor-movielens-dataset

In [156]:
from numpy import array
from numpy import argmax
from sklearn import preprocessing

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncode

genre_list =['Action',
             'Adventure',
             'Animation',
             'Children',
             'Comedy',
             'Crime', 
             'Documentary',
             'Drama',
             'Fantasy',
             'Film-Noir',
             'Horror',
             'Musical',
             'Mystery',
             'Romance',
             'Sci-Fi',
             'Thriller',
             'War',
             'Western']

values = array(genre_list)
print(values)

# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)

# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)

ImportError: cannot import name 'OneHotEncode'