In [1]:
import pandas as pd
df1 = pd.read_csv(r'C:\Users\Bclea\Downloads\TMDB_movie_dataset_v11.csv\TMDB_movie_dataset_v11.csv')

In [3]:
df1.revenue.value_counts()

revenue
0           1087332
100             406
1               389
10              212
1000            182
             ...   
151086            1
8762890           1
5144717           1
53825515          1
6096              1
Name: count, Length: 14270, dtype: int64

In [4]:
df1['profit'] = (df1.revenue > df1.budget).astype(int)

In [5]:
df1.profit.value_counts()

profit
0    1092870
1      14966
Name: count, dtype: int64

In [6]:
# Filter the rows where profit == 0
profit_zero_df = df1[df1['profit'] == 0]

# Randomly drop 1,077,904 rows from profit == 0
rows_to_drop = profit_zero_df.sample(n=1077904, random_state=42)

# Drop the selected rows from the original dataframe
df1 = df1.drop(rows_to_drop.index)

In [7]:
df1.profit.value_counts()

profit
1    14966
0    14966
Name: count, dtype: int64

In [8]:
df1.tail()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,profit
1107709,766061,Tiff Needell's Buying A Used Car With Confidence,0.0,0,Released,2001-01-22,0,0,False,,...,BBC Television's Top Gear presenter Tiff Neede...,0.6,/2PDbWgutbMoZ5d3xaKuA3DqvGXr.jpg,,,,,,,0
1107716,766069,Fright Night of the Living Dead,0.0,0,Released,1986-12-25,0,46,False,/u6q2PaSS4SRin1c59t6JOUlDTwV.jpg,...,Three roommates transform into undead after on...,1.09,/25i7sbUz3tLM6ohTNTgiQhM4W54.jpg,,Horror,629 Castro,United States of America,English,"vampire, house, basement, zombie, cannibal, ki...",0
1107718,766048,Outro Dia,0.0,0,Released,2013-06-12,0,0,False,,...,,0.6,,,,,Brazil,,,0
1107725,766027,And Then God Laughed,0.0,0,Released,2020-11-19,0,28,False,,...,After losing both his job and his girlfriend o...,0.899,/3opacphD2UDQTJsvVsGV2QFIf9r.jpg,,"Comedy, Crime",,"United States of America, Mexico, Czech Republic","Czech, English",,0
1107805,765949,Problematický gentleman,0.0,0,Released,1923-01-25,0,0,False,,...,,0.6,,,Drama,Pronax-Film,Czechoslovakia,No Language,,0


In [9]:
df1.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords', 'profit'],
      dtype='object')

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming df is your DataFrame with the given columns
# Step 1: Preprocessing
df = df1.copy()
# Drop irrelevant columns
df = df.drop(columns=['id', 'backdrop_path', 'poster_path', 'homepage', 'imdb_id', 'revenue'])
df['genres'] = df['genres'].apply(lambda x: x.split(',')[0] if isinstance(x, str) else None)

# Fill missing numerical columns with their mean value
df['budget'].fillna(df['budget'].mean(), inplace=True)
#df['revenue'].fillna(df['revenue'].mean(), inplace=True)
df['runtime'].fillna(df['runtime'].mean(), inplace=True)
df['vote_average'].fillna(df['vote_average'].mean(), inplace=True)
df['vote_count'].fillna(df['vote_count'].mean(), inplace=True)

# Fill missing categorical columns with the mode (most frequent value)
df['original_language'].fillna(df['original_language'].mode()[0], inplace=True)
df['status'].fillna(df['status'].mode()[0], inplace=True)

# Step 2: Process text columns using TF-IDF
text_columns = ['overview', 'keywords', 'tagline']

def preprocess_text(text):
    if isinstance(text, str):
        return text
    else:
        return ""  # Return an empty string if NaN or non-string data

# Apply the preprocessing
for col in text_columns:
    df[col] = df[col].apply(preprocess_text)

# Combine all the text columns into one
df['combined_text'] = df['overview'] + ' ' + df['keywords'] + ' ' + df['tagline']

# Vectorize the combined text using TF-IDF
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
text_features = tfidf.fit_transform(df['combined_text'])

# Step 3: One-hot encode categorical columns
categorical_columns = ['original_language', 'status', 'genres']

# One-hot encode the categorical columns (use sparse_output=False in recent versions of sklearn)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_categories = encoder.fit_transform(df[categorical_columns])

# Step 4: Combine the TF-IDF features with the other numerical columns
numerical_columns = [ 'runtime', 'vote_average', 'vote_count', 'popularity']

# Convert numerical columns to a NumPy array and combine with the text features and one-hot encoded categorical data
X = np.hstack((text_features.toarray(), encoded_categories, df[numerical_columns].values))

# Define the target variable (Profit)
y = df['profit']

# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train a classification model (Random Forest in this case)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Step 7: Make predictions
y_pred = clf.predict(X_test)

# Step 8: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 85.22%
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.86      3005
           1       0.88      0.82      0.85      2982

    accuracy                           0.85      5987
   macro avg       0.85      0.85      0.85      5987
weighted avg       0.85      0.85      0.85      5987



In [11]:
df.profit.value_counts()

profit
1    14966
0    14966
Name: count, dtype: int64

In [12]:
# Step 9: Show predictions alongside actual labels
print("Predicted values vs Actual values:")
for i in range(len(y_test)):
    print(f"Prediction: {y_pred[i]:} \t Actual: {y_test.iloc[i]:}")

Predicted values vs Actual values:
Prediction: 1 	 Actual: 1
Prediction: 0 	 Actual: 0
Prediction: 1 	 Actual: 1
Prediction: 1 	 Actual: 1
Prediction: 0 	 Actual: 0
Prediction: 1 	 Actual: 1
Prediction: 1 	 Actual: 0
Prediction: 1 	 Actual: 0
Prediction: 0 	 Actual: 0
Prediction: 1 	 Actual: 1
Prediction: 1 	 Actual: 1
Prediction: 0 	 Actual: 0
Prediction: 0 	 Actual: 0
Prediction: 0 	 Actual: 0
Prediction: 0 	 Actual: 0
Prediction: 0 	 Actual: 0
Prediction: 0 	 Actual: 1
Prediction: 0 	 Actual: 0
Prediction: 1 	 Actual: 1
Prediction: 0 	 Actual: 0
Prediction: 1 	 Actual: 1
Prediction: 0 	 Actual: 1
Prediction: 1 	 Actual: 1
Prediction: 1 	 Actual: 1
Prediction: 1 	 Actual: 1
Prediction: 0 	 Actual: 0
Prediction: 1 	 Actual: 1
Prediction: 1 	 Actual: 1
Prediction: 1 	 Actual: 1
Prediction: 0 	 Actual: 0
Prediction: 1 	 Actual: 1
Prediction: 1 	 Actual: 1
Prediction: 1 	 Actual: 0
Prediction: 0 	 Actual: 0
Prediction: 1 	 Actual: 1
Prediction: 0 	 Actual: 0
Prediction: 0 	 Actual: 0
Pre

In [13]:
# Example new data
new_data = pd.DataFrame({
    'overview': ['A thrilling action-packed adventure.'],
    'keywords': ['action, adventure, fight'],
    'tagline': ['The ultimate battle.'],
    'original_language': ['en'],
    'status': ['Released'],
    'genres': ['Action'],
    #'budget': [120000000],
    'runtime': [169],
    'vote_average': [100],
    'vote_count': [0],
    'popularity': [50]
})

# Preprocess the new data just like we did for the training data

# Step 1: Preprocess the text columns and combine them
new_data['combined_text'] = new_data['overview'] + ' ' + new_data['keywords'] + ' ' + new_data['tagline']

# Step 2: Transform the text using the already fitted TF-IDF vectorizer
new_text_features = tfidf.transform(new_data['combined_text'])

# Step 3: One-hot encode the categorical columns using the fitted encoder
new_encoded_categories = encoder.transform(new_data[['original_language', 'status', 'genres']])

# Step 4: Combine the TF-IDF features with the numerical columns
new_X = np.hstack((new_text_features.toarray(), new_encoded_categories, new_data[[ 'runtime', 'vote_average', 'vote_count', 'popularity']].values))

# Step 5: Make predictions using the trained Random Forest classifier
new_predictions = clf.predict(new_X)

# Step 6: Print the prediction
print("Predicted Profit for New Data:", new_predictions[0])


Predicted Profit for New Data: 1
