In [1]:
import pandas as pd
import boto3
import io
import numpy as np
import json

In [11]:
movies = pd.read_csv("s3://movie-content-45/clean-data.csv")

In [3]:
movies.shape

(3099, 16)

In [4]:
movies.head()

Unnamed: 0,movie_id,overview,popularity,release_date,title,vote_average,vote_count,genres,keywords,month,year,director,producer,actors,characters,tags
0,783461,"when her boyfriend loses a mobster's cash, sav...",56.311,2022-02-04 00:00:00,looop lapeta,6.2,54.0,action comedy crime,"['remake','loooplapeta','saadesaati']",february,2022.0,tomtykwer,mukeshchhabra,taapseepannu tahirrajbhasin shreyadhanwanthary...,savi satya julia victor mamleshcharanchaddha,when her boyfriend loses a mobster s cash sav...
1,592508,"a fearless, faithful albeit slightly forgetful...",45.077,2021-11-05 00:00:00,sooryavanshi,5.8,133.0,action crime thriller,"['police','sequel','policeofficer','copuniverse']",november,2021.0,rohitshetty,karanjohar,akshaykumar katrinakaif ajaydevgn ranveersingh...,veersooryavanshi riasooryavanshi bajiraosingha...,a fearless faithful albeit slightly forgetful...
2,864692,a soldier caught by enemies and presumed dead ...,47.611,2023-01-25 00:00:00,pathaan,6.7,70.0,action adventure thriller,"['spy','fakedeath','spythriller','spyuniverse'...",january,2023.0,siddharthanand,adityachopra,shahrukhkhan deepikapadukone johnabraham dimpl...,pathaan rubinamohsin jim nandini colonelsunill...,a soldier caught by enemies and presumed dead ...
3,1018228,a flight attendant and her boyfriend must stea...,43.789,2023-03-24 00:00:00,chor nikal ke bhaga,7.2,55.0,crime thriller,"['heist','airplanehijacking']",march,2023.0,ajaysingh,dineshvijan,yamigautam sunnykaushal sharadkelkar indraneil...,nehagrover ankitsethi parvezshaikh sudhanshuro...,a flight attendant and her boyfriend must stea...
4,20453,rascal. joker. dreamer. genius... you've never...,37.26,2009-12-25 00:00:00,3 idiots,8.0,2052.0,drama comedy,"['suicide','suicideattempt','college','musical...",december,2009.0,vinaywaikul,rahulnanda,aamirkhan r.madhavan sharmanjoshi kareenakapoo...,rancho farhanqureshi rajurastogi pia viru'viru...,rascal joker dreamer genius you ve never...


In [5]:
movies.isnull().sum()

movie_id           0
overview           0
popularity         0
release_date       0
title              0
vote_average       0
vote_count         0
genres           315
keywords           0
month              0
year               0
director         587
producer        1453
actors            33
characters        44
tags               0
dtype: int64

In [6]:
movies.columns

Index(['movie_id', 'overview', 'popularity', 'release_date', 'title',
       'vote_average', 'vote_count', 'genres', 'keywords', 'month', 'year',
       'director', 'producer', 'actors', 'characters', 'tags'],
      dtype='object')

In [13]:
movies = movies[['movie_id', 'overview','title','genres','keywords','director','producer','actors','tags']]
movies.head()

Unnamed: 0,movie_id,overview,title,genres,keywords,director,producer,actors,tags
0,783461,"when her boyfriend loses a mobster's cash, sav...",looop lapeta,action comedy crime,"['remake','loooplapeta','saadesaati']",tomtykwer,mukeshchhabra,taapseepannu tahirrajbhasin shreyadhanwanthary...,when her boyfriend loses a mobster s cash sav...
1,592508,"a fearless, faithful albeit slightly forgetful...",sooryavanshi,action crime thriller,"['police','sequel','policeofficer','copuniverse']",rohitshetty,karanjohar,akshaykumar katrinakaif ajaydevgn ranveersingh...,a fearless faithful albeit slightly forgetful...
2,864692,a soldier caught by enemies and presumed dead ...,pathaan,action adventure thriller,"['spy','fakedeath','spythriller','spyuniverse'...",siddharthanand,adityachopra,shahrukhkhan deepikapadukone johnabraham dimpl...,a soldier caught by enemies and presumed dead ...
3,1018228,a flight attendant and her boyfriend must stea...,chor nikal ke bhaga,crime thriller,"['heist','airplanehijacking']",ajaysingh,dineshvijan,yamigautam sunnykaushal sharadkelkar indraneil...,a flight attendant and her boyfriend must stea...
4,20453,rascal. joker. dreamer. genius... you've never...,3 idiots,drama comedy,"['suicide','suicideattempt','college','musical...",vinaywaikul,rahulnanda,aamirkhan r.madhavan sharmanjoshi kareenakapoo...,rascal joker dreamer genius you ve never...


In [14]:
movies['overview'] = movies['overview'].fillna('')

movies['genres'] = movies['genres'].fillna('[]')
movies['keywords'] = movies['keywords'].fillna('[]')
movies['actors'] = movies['actors'].fillna('[]')
movies['director'] = movies['director'].fillna('[]')

In [15]:
import ast

def convert(obj):
    try:
        return [i['name'] for i in ast.literal_eval(obj)]
    except:
        return []

In [17]:
movies.loc[:, 'genres'] = movies['genres'].apply(convert)
movies.loc[:, 'keywords'] = movies['keywords'].apply(convert)
movies.loc[:, 'actors'] = movies['actors'].apply(convert)
movies.loc[:, 'director'] = movies['director'].apply(convert)
movies.loc[:, 'overview'] = movies['overview'].apply(lambda x: str(x).split())
# Combine all features into one list
for col in ['overview', 'genres', 'keywords', 'actors', 'director']:
    movies[col] = movies[col].apply(lambda x: x if isinstance(x, list) else [])
    
def combine_features(row):
    return row['overview'] + row['genres'] + row['keywords'] + row['actors'] + row['director']

movies['tags'] = movies.apply(combine_features, axis=1)

# Convert tags list into a single string for TF-IDF
movies['tags'] = movies['tags'].apply(lambda x: " ".join([str(i).replace(" ", "") for i in x]).lower())

# Drop rows with empty tags
movies = movies[movies['tags'] != '']
movies.reset_index(drop=True, inplace=True)

# Check final dataframe
movies[['title', 'tags']].head()

Unnamed: 0,title,tags
0,looop lapeta,"['when', 'her', 'boyfriend', 'loses', 'a', ""mo..."
1,sooryavanshi,"['a', 'fearless,', 'faithful', 'albeit', 'slig..."
2,pathaan,"['a', 'soldier', 'caught', 'by', 'enemies', 'a..."
3,chor nikal ke bhaga,"['a', 'flight', 'attendant', 'and', 'her', 'bo..."
4,3 idiots,"['rascal.', 'joker.', 'dreamer.', 'genius...',..."


In [21]:
new_df = df[['movie_id', 'title', 'tags']]
new_df.head()

Unnamed: 0,movie_id,title,tags
0,783461.0,looop lapeta,"['when', 'her', 'boyfriend', 'loses', 'a', ""mo..."
1,592508.0,sooryavanshi,"['a', 'fearless,', 'faithful', 'albeit', 'slig..."
2,864692.0,pathaan,"['a', 'soldier', 'caught', 'by', 'enemies', 'a..."
3,1018228.0,chor nikal ke bhaga,"['a', 'flight', 'attendant', 'and', 'her', 'bo..."
4,20453.0,3 idiots,"['rascal.', 'joker.', 'dreamer.', 'genius...',..."


In [22]:
clean_path = 's3://movie-content-45/tmdb-clean/movies_clean.csv'
new_df.to_csv(clean_path, index=False)
print(f"Clean data uploaded to: {clean_path}")

Clean data uploaded to: s3://movie-content-45/tmdb-clean/movies_clean.csv


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [24]:
df = pd.read_csv("s3://movie-content-45/tmdb-clean/movies_clean.csv")
df.head()

Unnamed: 0,movie_id,title,tags
0,783461.0,looop lapeta,"['when', 'her', 'boyfriend', 'loses', 'a', ""mo..."
1,592508.0,sooryavanshi,"['a', 'fearless,', 'faithful', 'albeit', 'slig..."
2,864692.0,pathaan,"['a', 'soldier', 'caught', 'by', 'enemies', 'a..."
3,1018228.0,chor nikal ke bhaga,"['a', 'flight', 'attendant', 'and', 'her', 'bo..."
4,20453.0,3 idiots,"['rascal.', 'joker.', 'dreamer.', 'genius...',..."


In [26]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
vectors = tfidf.fit_transform(movies['tags']).toarray()

# Save the TF-IDF vectorizer for later use
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [27]:
cosine_sim = cosine_similarity(vectors,vectors)

# Save similarity matrix for later
with open('cosine_sim.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)

In [28]:
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

# Save the mapping
with open('indices.pkl', 'wb') as f:
    pickle.dump(indices, f)

In [50]:
def recommend(title, cosine_sim=cosine_sim):
    movie_title_lower = title.lower()
    # Check if the movie exists
    filtered = new_df[new_df['title'].str.lower() == movie_title_lower]
    if filtered.empty:
        print(f"Sorry, we don't have the movie '{title}' in our database.")
        return None
    idx = filtered.index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # top 10 similar movies
    movie_indices = [i[0] for i in sim_scores]
    return new_df['title'].iloc[movie_indices]

In [53]:
recommendations = recommend("Lagaan")
if recommendations is None:
    print("No recommendations available for 'Laggan'.")
else:
    print("Recommended movies for 'Lagaan':")
    for movie in recommendations:
        print(movie)

Sorry, we don't have the movie 'Lagaan' in our database.
No recommendations available for 'Laggan'.


In [54]:
import pickle

# Save cleaned dataset
with open('movies.pkl', 'wb') as f:
    pickle.dump(df, f)

# Save TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Save cosine similarity matrix
with open('cosine_sim.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)

# Save title to index mapping
with open('indices.pkl', 'wb') as f:
    pickle.dump(indices, f)

In [55]:
with open('movies.pkl', 'rb') as f:
    df = pickle.load(f)

with open('tfidf_vectorizer.pkl', 'rb') as f:
    tfidf = pickle.load(f)

with open('cosine_sim.pkl', 'rb') as f:
    cosine_sim = pickle.load(f)

with open('indices.pkl', 'rb') as f:
    indices = pickle.load(f)

In [56]:
import os
print(os.getcwd())

/home/ec2-user/SageMaker


In [57]:
s3 = boto3.client('s3')
bucket_name = 'movie-content-45'  # replace with your bucket name

# List of files
files = ['tfidf_vectorizer.pkl', 'cosine_sim.pkl', 'indices.pkl', 'movies.pkl']

for file in files:
    local_path = f'/home/ec2-user/SageMaker/{file}'
    s3_key = f'models/{file}'  # folder path in S3
    s3.upload_file(local_path, bucket_name, s3_key)
    print(f"{file} uploaded to s3://{bucket_name}/{s3_key}")

tfidf_vectorizer.pkl uploaded to s3://movie-content-45/models/tfidf_vectorizer.pkl
cosine_sim.pkl uploaded to s3://movie-content-45/models/cosine_sim.pkl
indices.pkl uploaded to s3://movie-content-45/models/indices.pkl
movies.pkl uploaded to s3://movie-content-45/models/movies.pkl
