# Data Exploration Notebook

## 1. Chargement des CSV

In [None]:
import pandas as pd
# Load the four source tables. map() is consumed left to right by the
# unpacking, so the files are read in the order they are listed.
movies, actors, ratings, cast = map(
    pd.read_csv,
    ('movies.csv', 'actors.csv', 'ratings.csv', 'cast.csv'),
)
# Last expression of the cell: preview the first rows of the movies table.
movies.head()

## 2. Statistiques descriptives

In [None]:
def describe(df):
    """Print a quick structural summary of a DataFrame.

    Writes five sections to standard output, in this order: the shape,
    the per-column dtypes, the per-column count of missing values, the
    per-column count of cells equal to the empty string, and the
    per-column number of distinct values.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None. The summary is printed, not returned.
    """
    print("Shape:", df.shape)
    # Each report below is a multi-line Series. With print's default
    # separator (a single space) the first row of the report would be
    # shifted one column to the right of the others, so the header and the
    # report are joined with a newline instead.
    print("\nDtypes:", df.dtypes, sep="\n")
    print("\nMissing values:", df.isna().sum(), sep="\n")
    # Counts cells that are exactly '' (whitespace-only strings are not
    # matched by this comparison).
    print("\nEmpty strings:", (df == '').sum(), sep="\n")
    print("\nUnique values:", df.nunique(), sep="\n")

# Print a labelled summary of every loaded table, one after another.
for name, df in zip(
    ('movies', 'actors', 'ratings', 'cast'),
    (movies, actors, ratings, cast),
):
    print(f"--- {name} ---")
    describe(df)
    # Blank lines to visually separate consecutive tables.
    print("\n")

## 3. Analyses exploratoires

In [None]:
import matplotlib.pyplot as plt

# Histogram of the movies' 'year' column, split into 30 bins.
movies['year'].plot(
    kind='hist',
    bins=30,
    title='Distribution des films par année',
)
plt.show()

In [None]:
# Bar chart of the ten most frequent values of the 'genre' column.
# value_counts() sorts by descending frequency, so the first ten rows are
# the ten most common genres.
movies['genre'].value_counts().iloc[:10].plot(kind='bar', title='Top 10 genres')
plt.show()

In [None]:
# Histogram of the 'rating' column, split into 20 bins.
ratings['rating'].plot(
    kind='hist',
    bins=20,
    title='Distribution des notes',
)
plt.show()

In [None]:
# Average number of cast rows per movie.
# count() tallies the non-null actor_id values of each movie_id group.
# Only movie_ids that appear in `cast` form a group, so movies with no cast
# rows are not included in the mean.
actors_per_movie = cast.groupby('movie_id')['actor_id'].count()
avg_actors = actors_per_movie.mean()
avg_actors

## 4. Relations entre tables

In [None]:
# Foreign-key check: does every movie_id referenced in `cast` exist in `movies`?
all_movie_ids = set(movies['movie_id'])
cast_movie_known = cast['movie_id'].isin(all_movie_ids)
# True only when every cast row points at a known movie.
foreign_ok = cast_movie_known.all()
foreign_ok

In [None]:
# Orphan rows: cast entries whose movie_id is absent from the movies table.
has_known_movie = cast['movie_id'].isin(all_movie_ids)
orphans = cast.loc[~has_known_movie]
orphans.head()