In [20]:
import pandas as pd
import numpy as np

# Directory that holds the three cleaned CSV inputs. The location is
# machine-specific; change this single line to point the notebook at
# another copy of the data.
DATA_DIR = '/Users/allig/ads/507'

# Full paths of the individual datasets, derived from DATA_DIR.
imdb_path = f'{DATA_DIR}/IMDBTop250Movies_Cleaned.csv'
movies_path = f'{DATA_DIR}/cleaned_movies.csv'
netflix_path = f'{DATA_DIR}/netflix_cleaned.csv'
# Function to load and clean datasets
def load_and_clean_data(imdb_path, movies_path, netflix_path):
    """Read the three CSV datasets and normalise their headers and years.

    Column names are stripped of surrounding whitespace, lower-cased and
    have spaces replaced by underscores. When a frame has a ``year``
    column it is converted to integers; values that cannot be parsed as
    numbers (or are missing) become ``0``.

    Args:
        imdb_path: Location of the IMDB CSV, passed to ``pd.read_csv``.
        movies_path: Location of the movies CSV, passed to ``pd.read_csv``.
        netflix_path: Location of the Netflix CSV, passed to ``pd.read_csv``.

    Returns:
        A 3-tuple ``(df_imdb, df_movies, df_netflix)`` of cleaned frames,
        in the same order as the arguments.
    """
    sources = (imdb_path, movies_path, netflix_path)
    frames = tuple(pd.read_csv(source) for source in sources)

    for frame in frames:
        # Standardize column names
        tidy_names = frame.columns.str.strip().str.lower().str.replace(" ", "_")
        frame.columns = tidy_names

        # Convert year to integer (only when the dataset has one)
        if 'year' not in frame.columns:
            continue
        numeric_years = pd.to_numeric(frame['year'], errors='coerce')
        frame['year'] = numeric_years.fillna(0).astype(int)

    return frames

def transform_data(df_imdb, df_movies, df_netflix):
    """Stack the three cleaned datasets and normalise missing values.

    The frames are concatenated row-wise with a fresh index. ``rating``,
    ``box_office`` and ``budget`` are coerced to numbers when present and
    created as a constant ``0`` column when absent. Unparseable or missing
    ``box_office``/``budget`` values become ``0``; unparseable or missing
    ``rating`` values stay ``NaN``. Missing values in non-numeric columns
    are replaced by the empty string.

    Args:
        df_imdb: Cleaned IMDB frame.
        df_movies: Cleaned movies frame.
        df_netflix: Cleaned Netflix frame.

    Returns:
        The combined ``DataFrame``.
    """
    # Merge datasets
    df = pd.concat([df_imdb, df_movies, df_netflix], ignore_index=True)

    # Handle missing values
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce') if 'rating' in df else 0
    df['box_office'] = pd.to_numeric(df['box_office'], errors='coerce').fillna(0) if 'box_office' in df else 0
    df['budget'] = pd.to_numeric(df['budget'], errors='coerce').fillna(0) if 'budget' in df else 0

    # Fill NaNs in non-numeric columns only. A frame-wide fillna('') would
    # also write '' into numeric columns, turning them into mixed
    # number/str object columns and undoing the coercion done above.
    text_columns = df.select_dtypes(exclude='number').columns
    for column in text_columns:
        df[column] = df[column].fillna('')

    return df

def recommend_movies(df, movie_name):
    """Suggest up to five other titles released in the same year.

    The title lookup is case-insensitive. Every row whose name matches the
    queried title (in any casing, including duplicate rows) is excluded
    from the suggestions, so a movie is never recommended to itself.

    Args:
        df: Frame with a ``name`` column and a ``year`` column.
        movie_name: Title to base the recommendations on.

    Returns:
        A list of at most five names, in frame order, or a message string
        when the title is not in ``df`` or no other title shares its year.

    Raises:
        KeyError: If ``df`` has no ``name`` column.
    """
    if 'name' not in df.columns:
        raise KeyError("The dataset must contain a 'name' column for recommendations.")

    # Dummy recommendation logic: Find movies from the same year
    is_query = df['name'].str.lower() == movie_name.lower()
    if not is_query.any():
        return f"Movie '{movie_name}' not found in dataset."

    # The first matching row decides the reference year.
    movie_year = df.loc[is_query, 'year'].iloc[0]

    # Reuse the case-insensitive mask for the exclusion; comparing against
    # the raw ``movie_name`` would miss differently-cased or repeated rows.
    same_year = df['year'] == movie_year
    recommendations = df.loc[same_year & ~is_query, 'name'].tolist()

    return recommendations[:5] if recommendations else "No similar movies found."

def etl_pipeline(imdb_path, movies_path, netflix_path, movie_name):
    """Run load -> transform -> recommend for a single movie title.

    Args:
        imdb_path: Location of the IMDB CSV.
        movies_path: Location of the movies CSV.
        netflix_path: Location of the Netflix CSV.
        movie_name: Title to base the recommendations on.

    Returns:
        Whatever ``recommend_movies`` returns for the combined dataset.
    """
    # Extract: the loader hands back the three cleaned frames in order.
    cleaned_frames = load_and_clean_data(imdb_path, movies_path, netflix_path)

    # Transform: merge them into one frame.
    combined = transform_data(*cleaned_frames)

    # Recommend from the merged frame.
    return recommend_movies(combined, movie_name)


# Example usage: run the full pipeline for one title and print the result.
movie_to_recommend = "The Dark Knight"
imdb_file = '/Users/allig/ads/507/IMDBTop250Movies_Cleaned.csv'
movies_file = '/Users/allig/ads/507/cleaned_movies.csv'
netflix_file = '/Users/allig/ads/507/netflix_cleaned.csv'
print(
    etl_pipeline(
        imdb_path=imdb_file,
        movies_path=movies_file,
        netflix_path=netflix_file,
        movie_name=movie_to_recommend,
    )
)


['WALL·E', 'Gran Torino', '88 Minutes (2008)', "My Mom's New Boyfriend (2008)", 'Be Kind Rewind (2008)']


  df.fillna('', inplace=True)
