In [12]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, r2_score

# Load the dataset
file_path = 'Movies_new_FTD_to_detect_outlier.csv'
movies_df = pd.read_csv(file_path)

# Selecting the numerical columns
numerical_cols = ['budget', 'popularity', 'runtime', 'vote_average', 'vote_count']

# Outlier Detection with Isolation Forest
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest_preds = iso_forest.fit_predict(movies_df[numerical_cols])
iso_forest_outliers = movies_df[iso_forest_preds == -1]

# Outlier Detection with One-Class SVM
scaler = StandardScaler()
scaled_data = scaler.fit_transform(movies_df[numerical_cols])
one_class_svm = OneClassSVM(nu=0.05, kernel="rbf", gamma=0.01)
svm_preds = one_class_svm.fit_predict(scaled_data)
svm_outliers = movies_df[svm_preds == -1]

# Outlier Detection with IQR
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    outlier_mask = (data[column] < (Q1 - 1.5 * IQR)) | (data[column] > (Q3 + 1.5 * IQR))
    return data[outlier_mask]

# Removing outliers
movies_no_outliers_iso_forest = movies_df.drop(iso_forest_outliers.index)
movies_no_outliers_svm = movies_df.drop(svm_outliers.index)
movies_no_outliers_iqr = movies_df.copy()
for col in numerical_cols:
    outliers = detect_outliers_iqr(movies_no_outliers_iqr, col)
    movies_no_outliers_iqr.drop(outliers.index, inplace=True)



In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Creating the new target column 'roi'
movies_df['roi'] = (movies_df['revenue'] - movies_df['budget']) / movies_df['budget'] * 100
movies_df['roi_category'] = pd.cut(movies_df['roi'], bins=[-float('inf'), 100, 200, float('inf')], 
                                   labels=['Failed', 'Qualified', 'Successful'])

# Selecting only numeric columns for the classification model
movies_numeric_classification = movies_df.select_dtypes(include=['float64', 'int64'])

# Adding the new categorical target column to the dataset
movies_numeric_classification['roi_category'] = movies_df['roi_category']

# New target column for the classification task
target_column_classification = 'roi_category'

# Removing outliers from the numeric dataset for each method
# Isolation Forest
movies_no_outliers_iso_forest_class = movies_numeric_classification.drop(iso_forest_outliers.index)

# One-Class SVM
movies_no_outliers_svm_class = movies_numeric_classification.drop(svm_outliers.index)

# IQR
movies_no_outliers_iqr_class = movies_numeric_classification.copy()
for col in numerical_cols:
    outliers = detect_outliers_iqr(movies_no_outliers_iqr_class, col)
    movies_no_outliers_iqr_class.drop(outliers.index, inplace=True)

# Function to perform cross-validation for classification
def perform_classification_cv(data, target_column, model, cv=10):
    # Extracting target and features
    X = data.drop(target_column, axis=1)
    y = data[target_column]

    # Performing cross-validation
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    return scores.mean()

# Initialize the decision tree classifier
dt_classifier = DecisionTreeClassifier()

# Perform cross-validation and calculate average accuracy score for each dataset
cv_score_iso_forest_class = perform_classification_cv(movies_no_outliers_iso_forest_class, target_column_classification, dt_classifier)
cv_score_svm_class = perform_classification_cv(movies_no_outliers_svm_class, target_column_classification, dt_classifier)
cv_score_iqr_class = perform_classification_cv(movies_no_outliers_iqr_class, target_column_classification, dt_classifier)

cv_score_iso_forest_class, cv_score_svm_class, cv_score_iqr_class


(0.9993265993265993, 0.9993265993265993, 0.9991967871485944)