In [1]:
import pandas as pd
import os

# Path to the folder containing genre-wise CSVs
csv_folder = 'scraping/genre_csvs'

# Get all CSV file names in the folder
csv_files = [file for file in os.listdir(csv_folder) if file.endswith('.csv')]

# Initialize an empty list to hold DataFrames
all_dataframes = []

# Load and append each genre file
for file in csv_files:
    df = pd.read_csv(os.path.join(csv_folder, file))
    all_dataframes.append(df)

# Combine all genre data into a single DataFrame
combined_df = pd.concat(all_dataframes, ignore_index=True)

# 🧹 Clean the data
# Convert Rating to float
combined_df['Rating'] = pd.to_numeric(combined_df['Rating'], errors='coerce')

# Convert Votes to integer (remove commas)
combined_df['Votes'] = combined_df['Votes'].str.replace(',', '')
combined_df['Votes'] = pd.to_numeric(combined_df['Votes'], errors='coerce')

# Convert Duration to integer (extract digits only)
# Example input: "2h 15m" or "120 min"
def parse_duration(text):
    try:
        if 'h' in text:
            parts = text.lower().replace('m', '').split('h')
            hours = int(parts[0].strip())
            minutes = int(parts[1].strip()) if parts[1].strip().isdigit() else 0
            return hours * 60 + minutes
        else:
            return int(text.strip().split()[0])  # e.g., "120 min"
    except:
        return None

combined_df['Duration'] = combined_df['Duration'].apply(parse_duration)

# Drop rows with missing values in critical columns
combined_df.dropna(subset=['Title', 'Rating', 'Votes', 'Duration'], inplace=True)

# Reset index after cleaning
combined_df.reset_index(drop=True, inplace=True)

# 💾 Save the cleaned data
combined_csv_path = 'scraping/all_genres_combined.csv'
combined_df.to_csv(combined_csv_path, index=False)
print(f"✅ Combined and cleaned data saved to: {combined_csv_path}")
print(f"Total records: {len(combined_df)}")


NotADirectoryError: [WinError 267] The directory name is invalid: 'scraping/genre_csvs'