In [2]:
# Imports
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from pandas.tseries.offsets import DateOffset
from sklearn.metrics import classification_report

In [3]:
# Import metadata for movies
data = pd.read_csv('movies_metadata.csv')

In [4]:
# Review the DataFrame
data.head(3)

Unnamed: 0,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,status,tagline,title,vote_average,vote_count
0,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033,81.0,Released,,Toy Story,7.7,5415
1,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249,104.0,Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413
2,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0,101.0,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92


In [32]:
df = data[['id', 'budget', 'revenue', 'vote_average', 'vote_count']]

In [33]:
df.head()

Unnamed: 0,id,budget,revenue,vote_average,vote_count
0,862,30000000,373554033,7.7,5415
1,8844,65000000,262797249,6.9,2413
2,15602,0,0,6.5,92
3,31357,16000000,81452156,6.1,34
4,11862,0,76578911,5.7,173


In [34]:
# Calculating the minimum number of votes to be in the chart
min_votes = df['vote_count'].quantile(0.90)

In [35]:
# Calculating the average vote rate
vote_rate = data['vote_average'].mean()

# Computing the score(rating) of each movie
def score(x, min_votes = min_votes, vote_rate = vote_rate):
    vote_cnt = x['vote_count']
    vote_avg = x['vote_average']
    # Calculation based on the IMDB formula
    return round((vote_cnt/(vote_cnt+min_votes) * vote_avg) + (min_votes/(min_votes+vote_cnt) * vote_rate))

In [36]:
# Defining a new feature 'score' and calculate its value
df['score'] = df.apply(score, axis=1)
# drop columns not required
df = df.drop(columns = ['vote_count', 'vote_average'])
# review sample movies
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,budget,revenue,score
0,862,30000000,373554033,8
1,8844,65000000,262797249,7
2,15602,0,0,6
3,31357,16000000,81452156,6
4,11862,0,76578911,6


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31952 entries, 0 to 31951
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   id       31952 non-null  int64
 1   budget   31952 non-null  int64
 2   revenue  31952 non-null  int64
 3   score    31952 non-null  int64
dtypes: int64(4)
memory usage: 998.6 KB


In [40]:
# Creating the features set X
X = df.drop(columns = ['score'])
# Display sample data
X.head()

Unnamed: 0,id,budget,revenue
0,862,30000000,373554033
1,8844,65000000,262797249
2,15602,0,0
3,31357,16000000,81452156
4,11862,0,76578911


In [41]:
# Creating the target set y
y = df["score"]

# Display sample data
y.head()

0    8
1    7
2    6
3    6
4    6
Name: score, dtype: int64

In [42]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [43]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [44]:
 # Import Amazon SageMaker libraries and modules
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer, json_deserializer

# Import AWS Python SDK
import boto3

# Import support libraries
import io
import os
import json
import numpy as np

In [16]:
# Set the S3 bucket name
bucket = "fintech-bootcamp-activities-beni-2023-03-19"

In [17]:
 # Set a prefix for the data files
prefix = "movie-scoring"

In [18]:
# Set the IAM execution role
role = get_execution_role()

In [19]:
# Encode the training data as Protocol Buffer
buf = io.BytesIO()
vectors = np.array(X_train).astype("float32")
labels = np.array(y_train).astype("float32")
smac.write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

# Upload encoded training data to Amazon S3
key = 'linear_train.data'
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "train", key)).upload_fileobj(buf)
s3_train_data = "s3://{}/{}/train/{}".format(bucket, prefix, key)
print("Training data uploaded to: {}".format(s3_train_data))

Training data uploaded to: s3://fintech-bootcamp-activities-beni-2023-03-19/movie-scoring/train/linear_train.data


In [20]:
# Encode the testing data as Protocol Buffer
buf = io.BytesIO()
vectors = np.array(X_test).astype("float32")
labels = np.array(y_test).astype("float32")
smac.write_numpy_to_dense_tensor(buf, vectors, labels)
buf.seek(0)

# Upload encoded testing data to Amazon S3
key = "linear_test.data"
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "test", key)).upload_fileobj(buf)
s3_test_data = "s3://{}/{}/test/{}".format(bucket, prefix, key)
print("Testing data uploaded to: {}".format(s3_test_data))

Testing data uploaded to: s3://fintech-bootcamp-activities-beni-2023-03-19/movie-scoring/test/linear_test.data


In [21]:
 # Save the current session in a variable
sess = sagemaker.Session()

In [22]:
# Import the get_image_uri module from the sagamaker library
from sagemaker.amazon.amazon_estimator import get_image_uri

In [23]:
# Import the container image
container = get_image_uri(boto3.Session().region_name, "linear-learner")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [24]:
# Create an instance of the machine learning model
linear = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type="ml.m4.xlarge",
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=sess,
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [25]:
# Get the dimension of the feature-input vector
feature_dim = X.shape[1]

In [26]:
 # Define linear learner hyperparameters
linear.set_hyperparameters(
    feature_dim=feature_dim,
    mini_batch_size=200,
    predictor_type="binary_classifier"
)

In [27]:
 # Fitting the linear learner model
linear.fit({"train": s3_train_data, "test": s3_test_data})

INFO:sagemaker:Creating training-job with name: linear-learner-2023-03-19-20-08-27-913


2023-03-19 20:08:32 Starting - Starting the training job...
2023-03-19 20:09:08 Starting - Preparing the instances for training.........
2023-03-19 20:10:35 Downloading - Downloading input data
2023-03-19 20:10:35 Training - Downloading the training image.........
2023-03-19 20:11:51 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[03/19/2023 20:12:08 INFO 140177437239104] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'opt

UnexpectedStatusException: Error for Training job linear-learner-2023-03-19-20-08-27-913: Failed. Reason: ClientError: Detected invalid labels in the dataset. For classification tasks, the labels should be integers between 0 to (num_classes-1), exit code: 2

In [None]:
# Deploy an instance of the linear-learner model to create a predictor
linear_predictor = linear.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

In [None]:
 # Linear predictor configurations
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

In [None]:
# Making some predictions using the test data
model_predictions = linear_predictor.predict(X_test_scaled)

In [None]:
# Display sample predictions
model_predictions["predictions"][:3]

In [None]:
# Create a list with the predicted values
y_predictions = [np.uint8(value["predicted_label"]) for value in model_predictions["predictions"]]

# Transforming the list into an array
y_predictions = np.array(y_predictions)

# Display sample data
y_predictions[:10]

In [None]:
# Import the classification report from Scikit-learn
from sklearn.metrics import classification_report

In [None]:
# Display classification report
print("Classification report")
print(classification_report(y_test, y_predictions))