In [5]:
import pandas as pd

# Charger data
df = pd.read_csv("../data/data.csv")

df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1131615 entries, 0 to 1131614
Data columns (total 28 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   id                       1131615 non-null  int64  
 1   title                    1131602 non-null  object 
 2   vote_average             1131615 non-null  float64
 3   vote_count               1131615 non-null  float64
 4   status                   1131615 non-null  object 
 5   release_date             1014726 non-null  object 
 6   revenue                  1131615 non-null  float64
 7   runtime                  1131615 non-null  float64
 8   budget                   1131615 non-null  float64
 9   imdb_id                  641110 non-null   object 
 10  original_language        1131615 non-null  object 
 11  original_title           1131603 non-null  object 
 12  overview                 957691 non-null   object 
 13  popularity               1131615 non-null 

In [6]:
df = df[df['original_language'] == 'en']
df = df[df['overview'].notna()]

# garder films les plus populaires
df = df.sort_values('popularity', ascending=False).head(30000)

df = df.reset_index(drop=True)

In [7]:
def clean_text(col):
    if isinstance(col, str):
        return col.replace('[', '').replace(']', '').replace("'", "")
    return ""
    
df['genres_clean'] = df['genres'].apply(clean_text)
df['cast_clean'] = df['cast'].apply(clean_text)
df['director_clean'] = df['director'].fillna('')
df['tagline_clean'] = df['tagline'].fillna('')
df['overview_clean'] = df['overview'].fillna('')

In [8]:
df['text'] = (
    df['overview_clean'] + " " +
    df['genres_clean'] + " " +
    df['cast_clean'] + " " +
    df['director_clean'] + " " +
    df['tagline_clean']
)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=50000
)

tfidf_matrix = tfidf.fit_transform(df['text'])
tfidf_matrix.shape

(30000, 50000)

In [10]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(tfidf_matrix)

0,1,2
,n_neighbors,10
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


In [11]:
def recommend(title, n=10):
    title = title.lower()

    if title not in df['title'].str.lower().values:
        return f"'{title}' not found."

    idx = df[df['title'].str.lower() == title].index[0]
    vector = tfidf_matrix[idx]

    distances, indices = knn.kneighbors(vector, n_neighbors=n+1)
    recommended = df.iloc[indices[0][1:]][['title', 'overview', 'genres_clean']]

    return recommended

In [12]:
def explain(title, n=5):
    title = title.lower()

    if title not in df['title'].str.lower().values:
        return f"'{title}' not found."

    idx = df[df['title'].str.lower() == title].index[0]
    vector = tfidf_matrix[idx]

    distances, indices = knn.kneighbors(vector, n_neighbors=n+1)
    indices = indices[0][1:]
    distances = distances[0][1:]

    base_genres = set(df.loc[idx,'genres_clean'].split(','))

    explanations = []

    for dist, i in zip(distances, indices):
        score = 1 - dist
        common_genres = base_genres & set(df.loc[i,'genres_clean'].split(','))

        explanations.append({
            'title': df.loc[i,'title'],
            'similarity_score': float(score),
            'shared_genres': list(common_genres)
        })

    return explanations

In [17]:
recommend("Pulp Fiction")

Unnamed: 0,title,overview,genres_clean
4124,Quentin Tarantino: 20 Years of Filmmaking,Take a look at Tarantino's career from the beg...,Documentary
3938,Reservoir Dogs,A botched robbery indicates a police informant...,"Crime, Thriller"
3714,Jackie Brown,Jackie Brown is a flight attendant who gets ca...,"Crime, Drama, Thriller"
20108,QT8: The First Eight,A detailed account of the life and artistic ca...,Documentary
23343,Four Rooms,It's Ted the Bellhop's first night on the job....,Comedy
4384,The Hateful Eight,Bounty hunters seek shelter from a raging bliz...,"Drama, Mystery, Western"
1581,Kill Bill: The Whole Bloody Affair,Quentin Tarantino’s complete cut combining Kil...,"Action, Crime"
3401,Little Nicky,After the lord of darkness decides he will not...,"Comedy, Fantasy"
3386,From Dusk Till Dawn,"After kidnapping a father and his two kids, th...","Horror, Action, Thriller, Crime"
3275,Death Proof,"Austin's hottest DJ, Jungle Julia, sets out in...","Action, Thriller"


In [18]:
explain("Pulp Fiction")

[{'title': 'Quentin Tarantino: 20 Years of Filmmaking',
  'similarity_score': 0.21980825671641946,
  'shared_genres': []},
 {'title': 'Reservoir Dogs',
  'similarity_score': 0.21724842788726395,
  'shared_genres': []},
 {'title': 'Jackie Brown',
  'similarity_score': 0.1898415746179134,
  'shared_genres': []},
 {'title': 'QT8: The First Eight',
  'similarity_score': 0.16321247593949628,
  'shared_genres': []},
 {'title': 'Four Rooms',
  'similarity_score': 0.15258286664868925,
  'shared_genres': []}]

In [20]:
import joblib

# Sauvegarder TF-IDF
joblib.dump(tfidf, "../models/tfidf_model.pkl")
# Sauvegarder kNN
joblib.dump(knn, "../models/knn_model.pkl")
# Sauvegarder le dataset filtré
df.to_pickle("../models/movies_df.pkl")