In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
df = pd.read_csv("movies_imdb.csv")

In [31]:
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [32]:
df.describe()

Unnamed: 0,IMDB_Rating,Meta_score,No_of_Votes
count,1000.0,843.0,1000.0
mean,7.9493,77.97153,273692.9
std,0.275491,12.376099,327372.7
min,7.6,28.0,25088.0
25%,7.7,70.0,55526.25
50%,7.9,79.0,138548.5
75%,8.1,87.0,374161.2
max,9.3,100.0,2343110.0


In [33]:
def categorize_rating(x):
    if x < 8.5:
        return 1
    else:
        return 2

df["Label"] = df["IMDB_Rating"].apply(categorize_rating)
print("Label distribution before balancing:\n", df["Label"].value_counts())

Label distribution before balancing:
 Label
1    947
2     53
Name: count, dtype: int64


# Feature Engineering

In [34]:
for col in ["Genre", "Overview", "Director", "Star1", "Star2", "Star3", "Star4"]:
    df[col] = df[col].fillna("")

df["combined_features"] = (
    df["Genre"] + " " +
    df["Overview"] + " " +
    df["Director"] + " " +
    df["Star1"] + " " + df["Star2"] + " " + df["Star3"] + " " + df["Star4"]
)

vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(df["combined_features"].astype(str))
y = df["Label"]

In [35]:
import joblib

In [37]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X, y)

# --- Save artifacts ---
joblib.dump(clf, "rf_model.joblib")
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")
joblib.dump(df[["Series_Title", "Genre", "IMDB_Rating", "combined_features"]], "movies_df.pkl")

print("Saved rf_model.joblib, tfidf_vectorizer.joblib, movies_df.pkl")

Saved rf_model.joblib, tfidf_vectorizer.joblib, movies_df.pkl


In [38]:
# from sklearn.pipeline import Pipeline

In [39]:
# If X is already a csr_matrix from TfidfVectorizer
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X, y)

# Save separately
joblib.dump(clf, "rf_model.joblib")
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")


['tfidf_vectorizer.joblib']

# Balance Dataset with SMOTE

In [40]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

print("Label distribution after SMOTE:\n", np.bincount(y_res))

Label distribution after SMOTE:
 [  0 947 947]


In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)

# Base Models (with class_weight)

In [42]:
log_reg = LogisticRegression(max_iter=2000, class_weight="balanced")
rf = RandomForestClassifier(n_estimators=300, random_state=42, class_weight="balanced")
gb = GradientBoostingClassifier(random_state=42)

# Ensemble (Voting Classifier)

In [43]:
ensemble_model = VotingClassifier(
    estimators=[("lr", log_reg), ("rf", rf), ("gb", gb)],
    voting="soft"
)

In [44]:
ensemble_model.fit(X_train, y_train)

0,1,2
,estimators,"[('lr', ...), ('rf', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,2000

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,100
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


# Evaluation

In [45]:
y_pred = ensemble_model.predict(X_test)
print("\nClassification Report (Medium=1, High=2):")
print(classification_report(y_test, y_pred))


Classification Report (Medium=1, High=2):
              precision    recall  f1-score   support

           1       0.97      1.00      0.99       190
           2       1.00      0.97      0.99       189

    accuracy                           0.99       379
   macro avg       0.99      0.99      0.99       379
weighted avg       0.99      0.99      0.99       379



# Recommendation Function

In [46]:
def recommend_top_movies(n=10):
    probs = ensemble_model.predict_proba(X)
    # Probability of being High (class = 2)
    df["High_Prob"] = probs[:, 1] if probs.shape[1] == 2 else probs[:, 2]
    recommendations = df.sort_values("High_Prob", ascending=False).head(n)
    return recommendations[["Series_Title", "Genre", "IMDB_Rating", "High_Prob"]]

In [47]:
cosine_sim = cosine_similarity(X, X)

def recommend_similar_movies(title, n=5):
    if title not in df["Series_Title"].values:
        return f"Movie '{title}' not found in dataset!"
    
    idx = df[df["Series_Title"] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:n+1]
    movie_indices = [i[0] for i in sim_scores]
    
    return df.iloc[movie_indices][["Series_Title", "Genre", "IMDB_Rating"]]

In [48]:
print("\nTop Recommended Movies:")
print(recommend_top_movies(10))


Top Recommended Movies:
                                     Series_Title                       Genre  \
13          The Lord of the Rings: The Two Towers    Action, Adventure, Drama   
38                                    The Pianist     Biography, Drama, Music   
5   The Lord of the Rings: The Return of the King    Action, Adventure, Drama   
29                                      Star Wars  Action, Adventure, Fantasy   
51                                   Modern Times       Comedy, Drama, Family   
49                                         Psycho   Horror, Mystery, Thriller   
32                          It's a Wonderful Life      Drama, Family, Fantasy   
28                       The Silence of the Lambs      Crime, Drama, Thriller   
2                                 The Dark Knight        Action, Crime, Drama   
21                                   Interstellar    Adventure, Drama, Sci-Fi   

    IMDB_Rating  High_Prob  
13          8.7   0.933036  
38          8.5   0.92684

In [51]:
print("\nMovies similar to 'The Shawshank Redemption':")
print(recommend_similar_movies("The Dark Knight", 5))


Movies similar to 'The Shawshank Redemption':
              Series_Title                   Genre  IMDB_Rating
155          Batman Begins       Action, Adventure          8.2
63   The Dark Knight Rises       Action, Adventure          8.4
36            The Prestige  Drama, Mystery, Sci-Fi          8.5
773     Brokeback Mountain          Drama, Romance          7.7
241      Kill Bill: Vol. 1    Action, Crime, Drama          8.1


In [50]:
import joblib
joblib.dump(ensemble_model, "movie_model.pkl")


['movie_model.pkl']