In [10]:
import io

import pandas as pd

# Load the raw movies dataset from CSV into a DataFrame
file_path = 'Movies_new_FTD.csv'
movies_df = pd.read_csv(file_path)

# DataFrame.info() writes its report to a stream and returns None, so storing
# its return value directly would leave description["info"] empty. Capture the
# report in an in-memory text buffer so the dictionary holds the real summary.
info_buffer = io.StringIO()
movies_df.info(buf=info_buffer)

# Get a basic description of the dataset
description = {
    "head": movies_df.head(),            # First few rows
    "info": info_buffer.getvalue(),      # Column names, non-null counts and dtypes (as text)
    "describe": movies_df.describe(),    # Statistical summary of numerical columns
    "columns": movies_df.columns,        # Column names
    "shape": movies_df.shape             # Number of rows and columns
}

# Keep the info report visible in the cell output, as a bare info() call would
print(description["info"], end="")

description['head']


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3122 entries, 0 to 3121
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                3122 non-null   int64  
 1   genres                3122 non-null   object 
 2   original_language     3122 non-null   object 
 3   popularity            3122 non-null   float64
 4   production_companies  3122 non-null   object 
 5   production_countries  3122 non-null   object 
 6   release_date          3122 non-null   object 
 7   revenue               3122 non-null   int64  
 8   runtime               3122 non-null   float64
 9   title                 3122 non-null   object 
 10  vote_average          3122 non-null   float64
 11  vote_count            3122 non-null   int64  
 12  cast                  3122 non-null   object 
dtypes: float64(3), int64(3), object(7)
memory usage: 317.2+ KB


Unnamed: 0,budget,genres,original_language,popularity,production_companies,production_countries,release_date,revenue,runtime,title,vote_average,vote_count,cast
0,237000000,"Action, Adventure, Fantasy, Science Fiction",en,150.437577,"Ingenious Film Partners, Twentieth Century Fox...","United States of America, United Kingdom",2009-12-10,2787965087,162.0,Avatar,7.2,11800,"Sam Worthington, Zoe Saldana, Sigourney Weaver..."
1,300000000,"Adventure, Fantasy, Action",en,139.082615,"Walt Disney Pictures, Jerry Bruckheimer Films,...",United States of America,2007-05-19,961000000,169.0,Pirates of the Caribbean: At World's End,6.9,4500,"Johnny Depp, Orlando Bloom, Keira Knightley, S..."
2,245000000,"Action, Adventure, Crime",en,107.376788,"Columbia Pictures, Danjaq, B24","United Kingdom, United States of America",2015-10-26,880674609,148.0,Spectre,6.3,4466,"Daniel Craig, Christoph Waltz, Ralph Fiennes, ..."
3,250000000,"Action, Crime, Drama, Thriller",en,112.31295,"Legendary Pictures, Warner Bros., DC Entertain...",United States of America,2012-07-16,1084939099,165.0,The Dark Knight Rises,7.6,9106,"Christian Bale, Michael Caine, Gary Oldman, An..."
4,260000000,"Action, Adventure, Science Fiction",en,43.926995,Walt Disney Pictures,United States of America,2012-03-07,284139100,132.0,John Carter,6.1,2124,"Taylor Kitsch, Lynn Collins, Samantha Morton, ..."


In [11]:
from sklearn.preprocessing import LabelEncoder

# Work on an independent copy so the frame loaded from disk stays untouched
encoded_movies_df = movies_df.copy()

# Integer-code 'original_language' with a LabelEncoder
label_encoder = LabelEncoder()
language_codes = label_encoder.fit_transform(encoded_movies_df['original_language'])
encoded_movies_df['original_language'] = language_codes

# Frequency-encode the remaining text columns: each distinct string value is
# replaced by the number of rows that contain exactly that string
frequency_columns = ('genres', 'production_companies', 'production_countries', 'cast')
for column in frequency_columns:
    frequency_encoding = encoded_movies_df[column].value_counts().to_dict()
    encoded_movies_df[column] = encoded_movies_df[column].map(frequency_encoding)

# Parse 'release_date', split it into year / month / day columns, and discard
# the original date column in a single chained step
release_dates = pd.to_datetime(encoded_movies_df['release_date'])
encoded_movies_df = (
    encoded_movies_df
    .assign(
        release_year=release_dates.dt.year,
        release_month=release_dates.dt.month,
        release_day=release_dates.dt.day,
    )
    .drop(columns='release_date')
)

# Preview the transformed frame
encoded_movies_df.head()


Unnamed: 0,budget,genres,original_language,popularity,production_companies,production_countries,revenue,runtime,title,vote_average,vote_count,cast,release_year,release_month,release_day
0,237000000,6,4,150.437577,1,12,2787965087,162.0,Avatar,7.2,11800,1,2009,12,10
1,300000000,13,4,139.082615,2,2086,961000000,169.0,Pirates of the Caribbean: At World's End,6.9,4500,1,2007,5,19
2,245000000,3,4,107.376788,1,141,880674609,148.0,Spectre,6.3,4466,1,2015,10,26
3,250000000,19,4,112.31295,1,2086,1084939099,165.0,The Dark Knight Rises,7.6,9106,1,2012,7,16
4,260000000,18,4,43.926995,17,2086,284139100,132.0,John Carter,6.1,2124,1,2012,3,7


In [12]:
# Persist the encoded frame to CSV; the row index is not written
encoded_movies_df.to_csv(path_or_buf='Movies_new_FTD_1.csv', index=False)

In [13]:
import pandas as pd
from sklearn.ensemble import IsolationForest

# Read the encoded dataset back from disk
file_path = 'Movies_new_FTD_1.csv'
df = pd.read_csv(file_path)

# Numeric features used for outlier detection ('revenue' is deliberately left out)
numerical_cols = ['budget', 'popularity', 'runtime', 'vote_average', 'vote_count']

# Isolation Forest with a 5% contamination setting and a fixed seed
iso_forest = IsolationForest(contamination=0.05, random_state=42)

# Fit on the selected features and obtain one prediction per row
iso_forest_preds = iso_forest.fit_predict(df[numerical_cols])

# Rows predicted as -1 are the ones treated as outliers
outlier_mask = iso_forest_preds == -1
iso_forest_outliers = df.loc[outlier_mask, numerical_cols]

# Count of rows flagged by Isolation Forest
num_outliers_iso_forest = len(iso_forest_outliers)

iso_forest_outliers, num_outliers_iso_forest




(         budget  popularity  runtime  vote_average  vote_count
 0     237000000  150.437577    162.0           7.2       11800
 1     300000000  139.082615    169.0           6.9        4500
 2     245000000  107.376788    148.0           6.3        4466
 3     250000000  112.312950    165.0           7.6        9106
 4     260000000   43.926995    132.0           6.1        2124
 ...         ...         ...      ...           ...         ...
 2835    3300000  192.528841    105.0           8.3        4254
 2920    2100000    0.039007    126.0           0.0           0
 3005    1200000   88.377076    161.0           8.1        2311
 3058     600000    3.409764    225.0           7.1          66
 3066    2000000   39.756748    207.0           8.2         878
 
 [157 rows x 5 columns],
 157)

In [14]:
# Keep only the rows whose index labels were not flagged as outliers
df_cleaned = df.drop(index=iso_forest_outliers.index)

df_cleaned.shape


(2965, 15)

In [15]:
# Feature engineering: derive 'Profit' and 'roi' from 'revenue' and 'budget'

# Check the current columns in the dataset
print("Current columns in the dataset:")
print(df_cleaned.columns.tolist())

# Both derived features need 'revenue' and 'budget'. Fail up front with a clear
# message instead of skipping 'Profit' and then hitting an unrelated-looking
# KeyError a few lines later.
missing_columns = [name for name in ('revenue', 'budget') if name not in df_cleaned.columns]
if missing_columns:
    raise KeyError(f"Cannot derive 'Profit'/'roi'; missing columns: {missing_columns}")

# Profit = Revenue - Budget
df_cleaned['Profit'] = df_cleaned['revenue'] - df_cleaned['budget']

# A budget of exactly 0 is replaced by 1 so the ROI division below never
# divides by zero.
# NOTE(review): this overwrites the 'budget' column itself, so re-running this
# cell recomputes 'Profit' from the placeholder value 1 for those rows.
# Rebuild df_cleaned before re-running.
df_cleaned['budget'] = df_cleaned['budget'].replace(0, 1)

# ROI (Return on Investment) as a percentage: (Profit / Budget) * 100
df_cleaned['roi'] = (df_cleaned['Profit'] / df_cleaned['budget']) * 100

# Display the first few rows of the dataset with the new features
df_cleaned.head()


Current columns in the dataset:
['budget', 'genres', 'original_language', 'popularity', 'production_companies', 'production_countries', 'revenue', 'runtime', 'title', 'vote_average', 'vote_count', 'cast', 'release_year', 'release_month', 'release_day']


Unnamed: 0,budget,genres,original_language,popularity,production_companies,production_countries,revenue,runtime,title,vote_average,vote_count,cast,release_year,release_month,release_day,Profit,roi
21,200000000,7,4,37.668301,1,141,310669540,140.0,Robin Hood,6.2,1398,1,2010,5,12,110669540,55.33477
23,180000000,3,4,42.990906,1,141,372234864,113.0,The Golden Compass,5.8,1303,1,2007,12,4,192234864,106.797147
35,150000000,9,4,21.939663,1,2086,836297228,150.0,Transformers: Revenge of the Fallen,6.0,3138,1,2009,6,19,686297228,457.531485
39,170000000,13,4,73.79505,1,2086,400062763,125.0,TRON: Legacy,6.3,2841,1,2010,12,10,230062763,135.331037
40,200000000,2,4,49.98659,11,2086,559852396,106.0,Cars 2,5.8,2033,1,2011,6,11,359852396,179.926198


In [16]:
# Write the cleaned, feature-augmented frame to CSV without the row index.
# NOTE(review): this is the same path that is read into `df` before outlier
# removal, so re-running that load after this cell picks up the cleaned data
# instead of the encoded data.
df_cleaned.to_csv(path_or_buf='Movies_new_FTD_1.csv', index=False)