In [25]:
import pandas as pd
import re

# Paths of the raw scraped CSV and of the cleaned copy written at the end.
input_path = "scraping/scraping/imdb_2024_all_movies.csv"
output_path = "scraping/scraping/imdb_2024_all_movies_cleaned.csv"

# Read the CSV file
df = pd.read_csv(input_path)

# Clean Title: trim surrounding whitespace and remove embedded newlines.
# The nullable "string" dtype is used instead of astype(str) so that a missing
# title stays <NA> rather than becoming the literal text "nan", which a later
# dropna() on this column could never remove. Titles that end up empty are
# treated as missing as well.
df['Title'] = (
    df['Title']
    .astype('string')
    .str.strip()
    .str.replace('\n', '', regex=False)
    .replace('', pd.NA)
)

# Clean Rating: non-numeric values are coerced to NaN.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Clean Votes (remove parens, 'k', commas, etc.)
def clean_votes(v):
    """Convert a scraped vote-count label into an integer.

    Handles plain counts ("294"), counts with thousands separators
    ("1,234") and abbreviated counts with a ``k`` (thousands) or ``m``
    (millions) suffix in either case ("250K", "1.2M"), optionally wrapped
    in parentheses.

    Args:
        v: Raw cell value. It is passed through ``str()``, so non-string
            values such as NaN or None are accepted.

    Returns:
        The vote count as an ``int``, or ``pd.NA`` when the value cannot be
        interpreted as a number.
    """
    text = str(v).strip().lower()
    for ch in "(),":
        text = text.replace(ch, "")
    text = text.strip()

    multiplier = 1
    if text.endswith("k"):
        multiplier = 1_000
        text = text[:-1]
    elif text.endswith("m"):
        multiplier = 1_000_000
        text = text[:-1]

    try:
        # round() rather than int(): a product such as 1.005 * 1000 can land
        # just below the intended whole number because of binary float error,
        # and truncation would then be off by one.
        return round(float(text) * multiplier)
    except (ValueError, OverflowError):
        # ValueError: not a number (including "nan"); OverflowError: infinity.
        return pd.NA

# Normalize every raw vote label, then store the column with the nullable
# integer dtype so unparseable entries remain <NA>.
df['Votes'] = df['Votes'].map(clean_votes).astype('Int64')

# Clean Duration (convert to minutes)
def convert_duration_to_minutes(duration):
    """Convert a duration label such as "2h 28m" into total minutes.

    The hour and minute parts are each optional. Matching is
    case-insensitive and tolerates whitespace between the number and its
    unit, so "2H 5M", "1 h 30 min" and "90 min" are all understood.

    Args:
        duration: Duration text. Any non-string value is treated as missing.

    Returns:
        Total minutes as an ``int``, or ``pd.NA`` when the input is not a
        string, contains no hour/minute part, or adds up to zero minutes.
    """
    if not isinstance(duration, str):
        return pd.NA

    hours_match = re.search(r'(\d+)\s*h', duration, re.IGNORECASE)
    minutes_match = re.search(r'(\d+)\s*m', duration, re.IGNORECASE)

    total_minutes = 0
    if hours_match:
        total_minutes += int(hours_match.group(1)) * 60
    if minutes_match:
        total_minutes += int(minutes_match.group(1))

    # A zero total means nothing usable was found; report it as missing.
    return total_minutes if total_minutes > 0 else pd.NA

# Convert each duration label to whole minutes; the nullable Int64 dtype keeps
# unparseable values as <NA>.
df['Duration'] = df['Duration'].astype(str).apply(convert_duration_to_minutes).astype('Int64')

# Clean Genre (title case, e.g. "sci-fi" -> "Sci-Fi").
# The nullable "string" dtype is used instead of astype(str) so that a missing
# genre stays <NA> rather than becoming the literal text "Nan", which the
# dropna() below could never remove. Empty genres are treated as missing too.
df['Genre'] = (
    df['Genre']
    .astype('string')
    .str.strip()
    .str.title()
    .replace('', pd.NA)
)

# Drop any rows missing essential info
df = df.dropna(subset=['Title', 'Rating', 'Votes', 'Duration', 'Genre'])

# Save to cleaned CSV
df.to_csv(output_path, index=False)
print(f"✅ Cleaned data saved to: {output_path}")
print(df.head())


✅ Cleaned data saved to: scraping/scraping/imdb_2024_all_movies_cleaned.csv
                      Title  Rating   Votes  Duration   Genre
0     1. The Unholy Trinity     6.2     294        95  Action
1           2. Gladiator II     6.5  250000       148  Action
2         3. Dune: Part Two     8.5  640000       166  Action
3  4. Venom: The Last Dance     6.0  128000       110  Action
4          5. The Beekeeper     6.3  161000       105  Action
