In [1]:
import boto3
import pandas as pd

BUCKET_NAME = 'movie-recommender-dataset'
FILES = ['data/credits.csv', 'data/keywords.csv', 'data/links.csv', 'data/movies_metadata.csv', 'data/ratings.csv']

s3 = boto3.client('s3')
dataframes = {}

for file in FILES:
    try:
        print(f"Loading {file} in chunks...")
        obj = s3.get_object(Bucket=BUCKET_NAME, Key=file)
        chunks = pd.read_csv(obj['Body'], chunksize=10000)  # Adjust chunksize as needed
        dataframes[file.split('/')[-1]] = pd.concat(chunks)
        print(f"Loaded {file} successfully.")
    except Exception as e:
        print(f"Failed to load {file}. Error: {e}")

print("All files loaded successfully.")


Loading data/credits.csv in chunks...
Loaded data/credits.csv successfully.
Loading data/keywords.csv in chunks...
Loaded data/keywords.csv successfully.
Loading data/links.csv in chunks...
Loaded data/links.csv successfully.
Loading data/movies_metadata.csv in chunks...
Loaded data/movies_metadata.csv successfully.
Loading data/ratings.csv in chunks...
Loaded data/ratings.csv successfully.
All files loaded successfully.


In [2]:
# Inspecting each DataFrame
for name, df in dataframes.items():
    print(f"First few rows of {name}:")
    print(df.head())            # View the top rows of each DataFrame
    print(f"Missing values in {name}:")
    print(df.isnull().sum())    # Check for missing values
    print("\n")


First few rows of credits.csv:
                                                cast  \
0  [{'cast_id': 14, 'character': 'Woody (voice)',...   
1  [{'cast_id': 1, 'character': 'Alan Parrish', '...   
2  [{'cast_id': 2, 'character': 'Max Goldman', 'c...   
3  [{'cast_id': 1, 'character': "Savannah 'Vannah...   
4  [{'cast_id': 1, 'character': 'George Banks', '...   

                                                crew     id  
0  [{'credit_id': '52fe4284c3a36847f8024f49', 'de...    862  
1  [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...   8844  
2  [{'credit_id': '52fe466a9251416c75077a89', 'de...  15602  
3  [{'credit_id': '52fe44779251416c91011acb', 'de...  31357  
4  [{'credit_id': '52fe44959251416c75039ed7', 'de...  11862  
Missing values in credits.csv:
cast    0
crew    0
id      0
dtype: int64


First few rows of keywords.csv:
      id                                           keywords
0    862  [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...
1   8844  [{'id': 10090, 'name':

In [3]:
import boto3
import pandas as pd

# Define the S3 bucket and file path for movies_metadata.csv
BUCKET_NAME = 'movie-recommender-dataset'
MOVIES_METADATA_FILE = 'data/movies_metadata.csv'

# Load movies_metadata.csv from S3 into a DataFrame
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=BUCKET_NAME, Key=MOVIES_METADATA_FILE)
movies_metadata_df = pd.read_csv(obj['Body'])

# Ensure that the necessary columns exist and combine them into a single feature
if 'title' in movies_metadata_df.columns and 'overview' in movies_metadata_df.columns and 'genres' in movies_metadata_df.columns:
    movies_metadata_df['combined_features'] = (
        movies_metadata_df['title'].fillna('') + " " +
        movies_metadata_df['overview'].fillna('') + " " +
        movies_metadata_df['genres'].fillna('')
    )
    print("Combined features column created successfully!")
else:
    print("One or more columns (title, overview, genres) are missing in movies_metadata_df.")


Combined features column created successfully!


  movies_metadata_df = pd.read_csv(obj['Body'])


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the combined_features column
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_metadata_df['combined_features'])

# Check the shape of the TF-IDF matrix
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")


TF-IDF matrix shape: (45466, 81247)


In [4]:
# Sample the first 10,000 rows
sample_df = movies_metadata_df.sample(n=10000, random_state=42)

# Create the TF-IDF matrix
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()  # No need to set max_features here
tfidf_matrix = tfidf_vectorizer.fit_transform(sample_df['combined_features'])

# Import necessary libraries for cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Create a sparse matrix (if not already)
tfidf_sparse = csr_matrix(tfidf_matrix)

# Calculate cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_sparse)

# Check the shape of the cosine similarity matrix
print(f"Cosine similarity matrix shape: {cosine_sim.shape}")



Cosine similarity matrix shape: (10000, 10000)


In [9]:
def recommend_movies(title, cosine_sim=cosine_sim):
    # Check if the title exists in the DataFrame
    if title not in sample_df['title'].values:
        return f"Movie titled '{title}' not found in the dataset."

    # Get the index of the movie that matches the title
    idx = sample_df[sample_df['title'] == title].index[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return sample_df.iloc[movie_indices][['title', 'overview']]


In [14]:
recommendations = recommend_movies("Jumanji")
print(recommendations)


                      title                                           overview
38293          Mother's Day  Intersecting stories with different moms colli...
15267           Please Give  In New York City, a husband and wife butt head...
35962            Witchcraft  A new mother and her child move into her mothe...
42496               Dukhtar  In the mountains of Pakistan, a mother and her...
37054     Curse of the Wolf  Dakota, a young werewolf, has finally learned ...
39319     Adult Life Skills  Anna is stuck: she’s approaching 30 and has ju...
20139  Girl Walk // All Day  Girl Walk // All Day is a feature-length dance...
33903        The Diabolical  When a single mother and her two young childre...
3438      Empire of Passion  A young man has an affair with an older woman....
10072                Agatha  In real life, mystery writer Agatha Christie d...


In [15]:
import joblib

# Save the TF-IDF vectorizer and cosine similarity matrix
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(cosine_sim, 'cosine_similarity_matrix.pkl')


['cosine_similarity_matrix.pkl']

In [16]:
import os

# Check if the files were created
print("Files created:")
print(os.path.isfile('tfidf_vectorizer.pkl'))  # Check for the TF-IDF vectorizer
print(os.path.isfile('cosine_similarity_matrix.pkl'))  # Check for the cosine similarity matrix


Files created:
True
True


In [17]:
import boto3

# Initialize the S3 client
s3 = boto3.client('s3')
BUCKET_NAME = 'movie-recommender-dataset'

# Upload the saved files to S3
s3.upload_file('tfidf_vectorizer.pkl', BUCKET_NAME, 'model/tfidf_vectorizer.pkl')
s3.upload_file('cosine_similarity_matrix.pkl', BUCKET_NAME, 'model/cosine_similarity_matrix.pkl')

print("Files uploaded to S3 successfully.")


Files uploaded to S3 successfully.


In [18]:
import boto3

# Initialize the S3 client
s3 = boto3.client('s3')
BUCKET_NAME = 'movie-recommender-dataset'

# Download the model files from S3
s3.download_file(BUCKET_NAME, 'model/tfidf_vectorizer.pkl', 'model/tfidf_vectorizer.pkl')
s3.download_file(BUCKET_NAME, 'model/cosine_similarity_matrix.pkl', 'model/cosine_similarity_matrix.pkl')

print("Files downloaded from S3 successfully.")


FileNotFoundError: [Errno 2] No such file or directory: 'model/tfidf_vectorizer.pkl.3a631aB3'

In [22]:
import boto3

# Specify the bucket name and file details
BUCKET_NAME = 'your-s3-bucket-name'  # replace with your bucket name
FILE_NAME = 'recommend.py'              # the file you want to upload
OBJECT_NAME = 'path/in/s3/recommend.py' # the S3 object name (where it will be stored in the bucket)

# Initialize the S3 client
s3_client = boto3.client('s3')

# Upload the file
try:
    s3_client.upload_file(FILE_NAME, BUCKET_NAME, OBJECT_NAME)
    print(f"Successfully uploaded {FILE_NAME} to s3://{BUCKET_NAME}/{OBJECT_NAME}")
except Exception as e:
    print(f"Error uploading file: {e}")


Error uploading file: Failed to upload recommend.py to your-s3-bucket-name/path/in/s3/recommend.py: An error occurred (AccessDenied) when calling the PutObject operation: Access Denied
