In [1]:
import pandas as pd
# Load the new CSV file into a DataFrame
file_path_new = 'Movies_new_FTD_preprocessed.csv'
movies_df_new = pd.read_csv(file_path_new)

# Display the first few rows of the new dataframe to understand its structure
movies_df_new.head()


Unnamed: 0,budget,genres,original_language,popularity,production_companies,production_countries,revenue,runtime,title,vote_average,vote_count,cast,release_year,release_month,release_day,Profit,roi
0,200000000,7,4,37.668301,1,141,310669540,140.0,Robin Hood,6.2,1398,1,2010,5,12,110669540,55.33477
1,180000000,3,4,42.990906,1,141,372234864,113.0,The Golden Compass,5.8,1303,1,2007,12,4,192234864,106.797147
2,150000000,9,4,21.939663,1,2086,836297228,150.0,Transformers: Revenge of the Fallen,6.0,3138,1,2009,6,19,686297228,457.531485
3,170000000,13,4,73.79505,1,2086,400062763,125.0,TRON: Legacy,6.3,2841,1,2010,12,10,230062763,135.331037
4,200000000,2,4,49.98659,11,2086,559852396,106.0,Cars 2,5.8,2033,1,2011,6,11,359852396,179.926198


In [2]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# We'll start with the new dataset and drop non-numeric columns that are not useful for PCA
# Dropping the 'title' as it is unique for each movie and not useful for PCA
pca_data = movies_df_new.drop('title', axis=1)

# Standardizing the features before applying PCA
scaler = StandardScaler()
pca_data_scaled = scaler.fit_transform(pca_data)

# Applying PCA
pca = PCA(n_components=0.95)  # Choose enough components to explain 95% of the variance
pca_data_transformed = pca.fit_transform(pca_data_scaled)

# Create a DataFrame for the PCA-transformed data
pca_columns = ['PCA_Component_' + str(i+1) for i in range(pca_data_transformed.shape[1])]
pca_df = pd.DataFrame(data=pca_data_transformed, columns=pca_columns)

pca_df.head()


Unnamed: 0,PCA_Component_1,PCA_Component_2,PCA_Component_3,PCA_Component_4,PCA_Component_5,PCA_Component_6,PCA_Component_7,PCA_Component_8,PCA_Component_9,PCA_Component_10,PCA_Component_11,PCA_Component_12
0,3.470051,-1.16605,0.981357,1.295419,-1.55577,0.290481,1.227478,-0.592002,2.616234,-0.714719,0.775245,0.868749
1,3.721537,-1.332261,0.449716,1.159781,-2.2372,1.708728,1.529549,-0.185976,0.505024,-0.87453,-0.15967,0.945563
2,7.919775,-0.802569,-1.618784,0.608579,-1.708036,-0.488626,1.235836,1.7535,2.32129,0.254965,-2.350398,-2.09648
3,5.540074,-0.592445,-0.305877,-0.020066,-1.490478,1.11122,1.181902,-0.322566,0.209541,0.567879,0.958426,1.20568
4,5.718726,-2.052875,-1.795157,1.328168,-1.473226,0.550846,0.713923,0.936031,1.253706,-0.034184,0.270197,0.254242


In [3]:
# Selecting features based on domain knowledge for the feature-selected dataset
features_selected = ['budget', 'popularity', 'runtime', 'vote_average']

# Creating the feature-selected DataFrame
feature_selected_df = movies_df_new[features_selected]

# Checking the first few rows of the feature-selected dataset
feature_selected_df.head()


Unnamed: 0,budget,popularity,runtime,vote_average
0,200000000,37.668301,140.0,6.2
1,180000000,42.990906,113.0,5.8
2,150000000,21.939663,150.0,6.0
3,170000000,73.79505,125.0,6.3
4,200000000,49.98659,106.0,5.8


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Splitting the original data into features and target variable
X_original = movies_df_new.drop(['title', 'revenue'], axis=1)
y_original = movies_df_new['revenue']

# Splitting the data into training and testing sets
X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(
    X_original, y_original, test_size=0.2, random_state=42
)

# Function to train and evaluate a model
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Training the model
    model.fit(X_train, y_train)
    # Predicting the test set results
    y_pred = model.predict(X_test)
    # Evaluating the model
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    return r2, mse

# Dictionary to hold the results
results = {}

# Training and evaluating SVM
svm_model = SVR()
results['Original_SVM'] = train_and_evaluate_model(svm_model, X_train_original, y_train_original, X_test_original, y_test_original)

# Training and evaluating Decision Tree
decision_tree_model = DecisionTreeRegressor()
results['Original_DecisionTree'] = train_and_evaluate_model(decision_tree_model, X_train_original, y_train_original, X_test_original, y_test_original)

# Training and evaluating Random Forest
random_forest_model = RandomForestRegressor()
results['Original_RandomForest'] = train_and_evaluate_model(random_forest_model, X_train_original, y_train_original, X_test_original, y_test_original)

results


{'Original_SVM': (-0.14888349737841122, 1.6232019186219896e+16),
 'Original_DecisionTree': (0.9921285835397599, 111211435534495.48),
 'Original_RandomForest': (0.9977225819502838, 32176537971334.94)}

In [5]:
# Splitting the PCA data into training and testing sets
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    pca_df, y_original, test_size=0.2, random_state=42
)

# Training and evaluating SVM on PCA data
svm_model_pca = SVR()
results['PCA_SVM'] = train_and_evaluate_model(svm_model_pca, X_train_pca, y_train_pca, X_test_pca, y_test_pca)

# Training and evaluating Decision Tree on PCA data
decision_tree_model_pca = DecisionTreeRegressor()
results['PCA_DecisionTree'] = train_and_evaluate_model(decision_tree_model_pca, X_train_pca, y_train_pca, X_test_pca, y_test_pca)

# Training and evaluating Random Forest on PCA data
random_forest_model_pca = RandomForestRegressor()
results['PCA_RandomForest'] = train_and_evaluate_model(random_forest_model_pca, X_train_pca, y_train_pca, X_test_pca, y_test_pca)

results


{'Original_SVM': (-0.14888349737841122, 1.6232019186219896e+16),
 'Original_DecisionTree': (0.9921285835397599, 111211435534495.48),
 'Original_RandomForest': (0.9977225819502838, 32176537971334.94),
 'PCA_SVM': (-0.14888657879685785, 1.6232062722090556e+16),
 'PCA_DecisionTree': (0.9238768651485221, 1075506949350330.9),
 'PCA_RandomForest': (0.9726737154524681, 386079855856390.56)}

In [6]:
# Splitting the feature selected data into training and testing sets
X_train_selected, X_test_selected, y_train_selected, y_test_selected = train_test_split(
    feature_selected_df, y_original, test_size=0.2, random_state=42
)

# Training and evaluating SVM on PCA data
svm_model_feature_selected = SVR()
results['feature_selected_SVM'] = train_and_evaluate_model(svm_model_feature_selected, X_train_selected, y_train_selected, X_test_selected, y_test_selected)

# Training and evaluating Decision Tree on PCA data
decision_tree_model_selected = DecisionTreeRegressor()
results['feature_selected_DecisionTree'] = train_and_evaluate_model(decision_tree_model_selected, X_train_selected, y_train_selected, X_test_selected, y_test_selected)

# Training and evaluating Random Forest on PCA data
random_forest_model_selected = RandomForestRegressor()
results['feature_selected_RandomForest'] = train_and_evaluate_model(random_forest_model_selected, X_train_selected, y_train_selected, X_test_selected, y_test_selected)

results

{'Original_SVM': (-0.14888349737841122, 1.6232019186219896e+16),
 'Original_DecisionTree': (0.9921285835397599, 111211435534495.48),
 'Original_RandomForest': (0.9977225819502838, 32176537971334.94),
 'PCA_SVM': (-0.14888657879685785, 1.6232062722090556e+16),
 'PCA_DecisionTree': (0.9238768651485221, 1075506949350330.9),
 'PCA_RandomForest': (0.9726737154524681, 386079855856390.56),
 'feature_selected_SVM': (-0.14888451247792966, 1.6232033528069958e+16),
 'feature_selected_DecisionTree': (0.057011762832434476,
  1.332302465220459e+16),
 'feature_selected_RandomForest': (0.5061627726324186, 6977187302099415.0)}

In [7]:
import pandas as pd

# The given results as a dictionary
# results = {
#     'Original_SVM': (-0.13141531868415868, 6.3125613905530424e+16),
#     'Original_DecisionTree': (0.5006662806260739, 2.78595729248829e+16),
#     'Original_RandomForest': (0.6997448955115948, 1.6752281400205102e+16),
#     'PCA_SVM': (-0.1314155071991192, 6.312562442343877e+16),
#     'PCA_DecisionTree': (0.8088811581480643, 1.0663187974906058e+16),
#     'PCA_RandomForest': (0.8344155728264411, 9238533760249022.0),
#     'Feature_Selected_SVM': (-0.1314153487878933, 6.312561558512298e+16),
#     'Feature_Selected_DecisionTree': (0.5000373521221251, 2.7894663043656104e+16),
#     'Feature_Selected_RandomForest': (0.6568289923558204, 1.914670958964242e+16)
# }

# Convert the dictionary to a DataFrame
results_df = pd.DataFrame.from_dict(results, orient='index', columns=['R2_Score', 'MSE'])
results_df.reset_index(inplace=True)
results_df.rename(columns={'index': 'Model'}, inplace=True)
results_df


Unnamed: 0,Model,R2_Score,MSE
0,Original_SVM,-0.148883,1.623202e+16
1,Original_DecisionTree,0.992129,111211400000000.0
2,Original_RandomForest,0.997723,32176540000000.0
3,PCA_SVM,-0.148887,1.623206e+16
4,PCA_DecisionTree,0.923877,1075507000000000.0
5,PCA_RandomForest,0.972674,386079900000000.0
6,feature_selected_SVM,-0.148885,1.623203e+16
7,feature_selected_DecisionTree,0.057012,1.332302e+16
8,feature_selected_RandomForest,0.506163,6977187000000000.0
