# Data Loading & Pre-processing:

In [2]:
import numpy as np
import pandas as pd
import ast
from sklearn.preprocessing import LabelEncoder, StandardScaler
import statsmodels.formula.api as smf
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Load datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge datasets
movies = movies.merge(credits, on='title')

# Create vote_factor column
movies['vote_factor'] = movies['vote_average'].apply(lambda x: 1 if x >= 7 else 0)

# Function to extract first item from JSON string
def get_first_item(json_str, key='name'):
    try:
        items = ast.literal_eval(json_str)
        return items[0][key] if items else ''
    except:
        return ''

# Function to extract director from crew
def get_director(json_str):
    try:
        crew = ast.literal_eval(json_str)
        for person in crew:
            if person['job'] == 'Director':
                return person['name']
        return ''
    except:
        return ''

# Preprocess complex columns
movies['primary_genre'] = movies['genres'].apply(lambda x: get_first_item(x))
movies['primary_keyword'] = movies['keywords'].apply(lambda x: get_first_item(x))
movies['top_actor'] = movies['cast'].apply(lambda x: get_first_item(x))
movies['director'] = movies['crew'].apply(get_director)
movies['primary_production_company'] = movies['production_companies'].apply(lambda x: get_first_item(x))
movies['primary_production_country'] = movies['production_countries'].apply(lambda x: get_first_item(x))
movies['primary_spoken_language'] = movies['spoken_languages'].apply(lambda x: get_first_item(x))

# Convert release_date to release_year
movies['release_year'] = pd.to_datetime(movies['release_date'], errors='coerce').dt.year

# Identify non-numeric columns for encoding
non_numeric_cols = [
    'original_language', 'status', 'primary_genre', 'primary_keyword',
    'top_actor', 'director', 'primary_production_company',
    'primary_production_country', 'primary_spoken_language'
]

# Apply LabelEncoder to non-numeric columns
label_encoders = {}
for col in non_numeric_cols:
    le = LabelEncoder()
    movies[col] = movies[col].replace('', 'Unknown')
    movies[col] = le.fit_transform(movies[col])
    label_encoders[col] = le

# Select columns for regression
regression_cols = [
    'vote_factor', 'budget', 'popularity', 'revenue', 'runtime', 'vote_count',
    'release_year', 'original_language', 'status', 'primary_genre',
    'primary_keyword', 'top_actor', 'director', 'primary_production_company',
    'primary_production_country', 'primary_spoken_language'
]
movies_regression = movies[regression_cols].copy()

# Handle missing values
movies_regression = movies_regression.dropna()
movies_regression.columns = movies_regression.columns.str.replace(' ', '_')


# Logistic Regression Analysis:

In [4]:
formula = 'vote_factor ~ budget + popularity + revenue + runtime + vote_count + release_year + original_language + status + primary_genre + primary_keyword + top_actor + director + primary_production_company + primary_production_country + primary_spoken_language'
logit_model = smf.logit(formula, data=movies_regression).fit()
print("Logistic Regression Summary:")
print(logit_model.summary())

Optimization terminated successfully.
         Current function value: 0.384949
         Iterations 7
Logistic Regression Summary:
                           Logit Regression Results                           
Dep. Variable:            vote_factor   No. Observations:                 4806
Model:                          Logit   Df Residuals:                     4790
Method:                           MLE   Df Model:                           15
Date:                Tue, 15 Apr 2025   Pseudo R-squ.:                  0.2423
Time:                        15:24:26   Log-Likelihood:                -1850.1
converged:                       True   LL-Null:                       -2441.6
Covariance Type:            nonrobust   LLR p-value:                6.930e-243
                                 coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------
Intercept                     59.6326      6.30

In [5]:
# K-Fold Cross-Validation with scaling
X_log = movies_regression.drop(columns=['vote_factor'])
y_log = movies_regression['vote_factor']

# Identify numeric columns for scaling
numeric_cols = ['budget', 'popularity', 'revenue', 'runtime', 'vote_count', 'release_year']
scaler = StandardScaler()

kf = KFold(n_splits=5, shuffle=True, random_state=42)
log_reg = LogisticRegression(max_iter=1000, solver='lbfgs')

accuracy_scores = []
auc_scores = []

for train_index, test_index in kf.split(X_log):
    X_train, X_test = X_log.iloc[train_index], X_log.iloc[test_index]
    y_train, y_test = y_log.iloc[train_index], y_log.iloc[test_index]
    
    # Scale numeric columns
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])
    
    # Fit model
    log_reg.fit(X_train_scaled, y_train)
    y_pred = log_reg.predict(X_test_scaled)
    y_pred_proba = log_reg.predict_proba(X_test_scaled)[:, 1]
    
    accuracy_scores.append(accuracy_score(y_test, y_pred))

print("\nK-Fold Cross-Validation Results:")
print(f"Mean Accuracy: {np.mean(accuracy_scores):.4f}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


K-Fold Cross-Validation Results:
Mean Accuracy: 0.8412


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Naive Bayes Classifier Analysis:

In [7]:
# Gausian :
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

X_nb = movies_regression.drop(['vote_factor'],axis=1)
y_nb = movies_regression["vote_factor"]

X_train, X_test, Y_train, Y_test = train_test_split(X_nb,y_nb,test_size = 0.2, random_state = 123)
model = GaussianNB()

model.fit(X_train,Y_train) # dont fit the model next time, only once.
print("Training scores")
print()
Y_predict = model.predict(X_train)
print("Confusion Matrix")
print(confusion_matrix(Y_train,Y_predict))
print()
print("Accuracy Score ( Train )")
print(accuracy_score(Y_train,Y_predict))
print("-"*60)
print()
print("Testing Scores")
print()
Y_predict_test = model.predict(X_test)
print("Confusion Matrix")
print(confusion_matrix(Y_predict_test, Y_test))
print()
print("Accuracy Score ( Test )")
print(accuracy_score(Y_predict_test, Y_test))

Training scores

Confusion Matrix
[[2901  145]
 [ 698  100]]

Accuracy Score ( Train )
0.7806971904266389
------------------------------------------------------------

Testing Scores

Confusion Matrix
[[741 175]
 [ 31  15]]

Accuracy Score ( Test )
0.7858627858627859


# K fold validation to check which model is better:

In [17]:
logit_model = LogisticRegression(max_iter=1000, solver='lbfgs')
nb_model = GaussianNB()

K = 10
kfold = KFold(n_splits=K, random_state=0, shuffle=True)

# --------------------------------------------------
# Step 8: Cross-Validation for Logistic Regression
# --------------------------------------------------
from sklearn.model_selection import cross_val_score

mse_cv1 = cross_val_score(logit_model, X_log, y_log, cv=kfold, scoring='neg_mean_squared_error')
print("MSE 1 (Logistic Regression):")
print(-mse_cv1)
print()
print("Mean of MSE 1 (Logistic Regression):")
print(np.mean(-mse_cv1))

print()
print("-"*75)
print()
# --------------------------------------------------
# Step 9: Cross-Validation for Gaussian NB
# --------------------------------------------------
mse_cv_2 = cross_val_score(nb_model, X_nb, y_nb, cv=kfold, scoring='neg_mean_squared_error')
print("\nMSE 2 (Gaussian NB):")
print(-mse_cv_2)
print()
print("Mean of MSE 2 (Gaussian NB):")
print(np.mean(-mse_cv_2))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

MSE 1 (Logistic Regression):
[0.18711019 0.16839917 0.16008316 0.19334719 0.18711019 0.15800416
 0.17083333 0.15208333 0.18541667 0.1625    ]

Mean of MSE 1 (Logistic Regression):
0.17248873873873874

---------------------------------------------------------------------------


MSE 2 (Gaussian NB):
[0.21829522 0.21621622 0.20790021 0.24532225 0.22453222 0.21205821
 0.20833333 0.2125     0.2375     0.20208333]

Mean of MSE 2 (Gaussian NB):
0.21847409909909912


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Final Conclusion:
# Logistic Regression is more accurate since its MSE value is lesser than the pre-trained Gaussian model