In [1]:
import pandas as pd

# Load the dataset
file_path = 'Movies_new_FTD_preprocessed.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset for a simple description
data.head()


Unnamed: 0,budget,genres,original_language,popularity,production_companies,production_countries,revenue,runtime,title,vote_average,vote_count,cast,release_year,release_month,release_day,Profit,roi
0,200000000,7,4,37.668301,1,141,310669540,140.0,Robin Hood,6.2,1398,1,2010,5,12,110669540,55.33477
1,180000000,3,4,42.990906,1,141,372234864,113.0,The Golden Compass,5.8,1303,1,2007,12,4,192234864,106.797147
2,150000000,9,4,21.939663,1,2086,836297228,150.0,Transformers: Revenge of the Fallen,6.0,3138,1,2009,6,19,686297228,457.531485
3,170000000,13,4,73.79505,1,2086,400062763,125.0,TRON: Legacy,6.3,2841,1,2010,12,10,230062763,135.331037
4,200000000,2,4,49.98659,11,2086,559852396,106.0,Cars 2,5.8,2033,1,2011,6,11,359852396,179.926198


In [8]:
data.shape

(2965, 17)

In [2]:
# Removing the 'title' column from the dataset
data_cleaned = data.drop(columns=['title'])

# Displaying the first few rows of the modified dataset
data_cleaned.head()


Unnamed: 0,budget,genres,original_language,popularity,production_companies,production_countries,revenue,runtime,vote_average,vote_count,cast,release_year,release_month,release_day,Profit,roi
0,200000000,7,4,37.668301,1,141,310669540,140.0,6.2,1398,1,2010,5,12,110669540,55.33477
1,180000000,3,4,42.990906,1,141,372234864,113.0,5.8,1303,1,2007,12,4,192234864,106.797147
2,150000000,9,4,21.939663,1,2086,836297228,150.0,6.0,3138,1,2009,6,19,686297228,457.531485
3,170000000,13,4,73.79505,1,2086,400062763,125.0,6.3,2841,1,2010,12,10,230062763,135.331037
4,200000000,2,4,49.98659,11,2086,559852396,106.0,5.8,2033,1,2011,6,11,359852396,179.926198


In [3]:
# Adding a new column for ROI categories based on the specified criteria
def categorize_roi(roi):
    if roi < 100:
        return "Failure"
    elif 100 <= roi < 200:
        return "Standard"
    else:
        return "Successful"

data_cleaned['roi_category'] = data_cleaned['roi'].apply(categorize_roi)

# Displaying the first few rows of the modified dataset to confirm the changes
data_cleaned.head()


Unnamed: 0,budget,genres,original_language,popularity,production_companies,production_countries,revenue,runtime,vote_average,vote_count,cast,release_year,release_month,release_day,Profit,roi,roi_category
0,200000000,7,4,37.668301,1,141,310669540,140.0,6.2,1398,1,2010,5,12,110669540,55.33477,失败
1,180000000,3,4,42.990906,1,141,372234864,113.0,5.8,1303,1,2007,12,4,192234864,106.797147,合格
2,150000000,9,4,21.939663,1,2086,836297228,150.0,6.0,3138,1,2009,6,19,686297228,457.531485,成功
3,170000000,13,4,73.79505,1,2086,400062763,125.0,6.3,2841,1,2010,12,10,230062763,135.331037,合格
4,200000000,2,4,49.98659,11,2086,559852396,106.0,5.8,2033,1,2011,6,11,359852396,179.926198,合格


In [5]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Preparing the data
X = data_cleaned.drop(columns=['roi_category', 'roi'])  # Dropping the target variable and the original ROI column
y = data_cleaned['roi_category']  # Target variable

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Re-initializing the Decision Tree Classifier since it was not defined in the current context
clf = DecisionTreeClassifier(random_state=42)

# Function to perform forward feature selection
def forward_feature_selection(X_train, y_train, X_test, y_test, model):
    selected_features = []
    remaining_features = list(X_train.columns)
    best_accuracy = 0

    while remaining_features:
        best_feature = None
        for feature in remaining_features:
            current_features = selected_features + [feature]
            model.fit(X_train[current_features], y_train)
            predictions = model.predict(X_test[current_features])
            accuracy = accuracy_score(y_test, predictions)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature = feature

        if best_feature is not None:
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
        else:
            break  # No improvement in accuracy, stop the selection process

    return selected_features, best_accuracy

# Performing forward feature selection
selected_features, best_accuracy = forward_feature_selection(X_train, y_train, X_test, y_test, clf)

selected_features, best_accuracy


(['Profit', 'revenue', 'budget', 'original_language'], 0.9679595278246206)

In [6]:
from sklearn.svm import SVC


from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Preparing the data
X = data_cleaned.drop(columns=['roi_category', 'roi'])  # Dropping the target variable and the original ROI column
y = data_cleaned['roi_category']  # Target variable

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Re-initializing the Decision Tree Classifier since it was not defined in the current context
clf = SVC(random_state=42)

# Function to perform forward feature selection
def forward_feature_selection(X_train, y_train, X_test, y_test, model):
    selected_features = []
    remaining_features = list(X_train.columns)
    best_accuracy = 0

    while remaining_features:
        best_feature = None
        for feature in remaining_features:
            current_features = selected_features + [feature]
            model.fit(X_train[current_features], y_train)
            predictions = model.predict(X_test[current_features])
            accuracy = accuracy_score(y_test, predictions)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature = feature

        if best_feature is not None:
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
        else:
            break  # No improvement in accuracy, stop the selection process

    return selected_features, best_accuracy

# Performing forward feature selection
selected_features, best_accuracy = forward_feature_selection(X_train, y_train, X_test, y_test, clf)

selected_features, best_accuracy


(['Profit', 'budget'], 0.9359190556492412)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Preparing the data
X = data_cleaned.drop(columns=['roi_category', 'roi'])  # Dropping the target variable and the original ROI column
y = data_cleaned['roi_category']  # Target variable

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Re-initializing the Decision Tree Classifier since it was not defined in the current context
clf = RandomForestClassifier(random_state=42)

# Function to perform forward feature selection
def forward_feature_selection(X_train, y_train, X_test, y_test, model):
    selected_features = []
    remaining_features = list(X_train.columns)
    best_accuracy = 0

    while remaining_features:
        best_feature = None
        for feature in remaining_features:
            current_features = selected_features + [feature]
            model.fit(X_train[current_features], y_train)
            predictions = model.predict(X_test[current_features])
            accuracy = accuracy_score(y_test, predictions)
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_feature = feature

        if best_feature is not None:
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
        else:
            break  # No improvement in accuracy, stop the selection process

    return selected_features, best_accuracy

# Performing forward feature selection
selected_features, best_accuracy = forward_feature_selection(X_train, y_train, X_test, y_test, clf)

selected_features, best_accuracy


(['Profit', 'budget'], 0.9747048903878583)