Melakukan import library

In [2]:
import pandas as pd

Muat dataset

In [3]:
df = pd.read_csv('/content/movie_sample_dataset.csv')
df.head()

Unnamed: 0,color,director_name,duration,gross,genres,movie_title,title_year,language,country,budget,imdb_score,actors,movie_facebook_likes
0,Color,Martin Scorsese,240,116866727.0,Biography|Comedy|Crime|Drama,The Wolf of Wall Street,2013,English,USA,100000000.0,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",138000
1,Color,Shane Black,195,408992272.0,Action|Adventure|Sci-Fi,Iron Man 3,2013,English,USA,200000000.0,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",95000
2,color,Quentin Tarantino,187,54116191.0,Crime|Drama|Mystery|Thriller|Western,The Hateful Eight,2015,English,USA,44000000.0,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",114000
3,Color,Kenneth Lonergan,186,46495.0,Drama,Margaret,2011,English,usa,14000000.0,6.5,"Matt Damon,Kieran Culkin,John Gallagher Jr.",0
4,Color,Peter Jackson,186,258355354.0,Adventure|Fantasy,The Hobbit: The Desolation of Smaug,2013,English,USA,225000000.0,7.9,"Aidan Turner,Adam Brown,James Nesbitt",83000


In [4]:
df.shape

(99, 13)

Cek tipe data dan nilai yang hilang

In [5]:
import numpy as np

# replace "?" menjadi NaN
df.replace("?", np.nan, inplace = True)

# cek banyaknya missing value pada masing-masing kolom
print(df.isnull().sum())

color                   11
director_name           11
duration                 0
gross                    8
genres                   1
movie_title              0
title_year               0
language                 0
country                  0
budget                   4
imdb_score               0
actors                   0
movie_facebook_likes     0
dtype: int64


### Membersihkan Data

Menghapus baris yang memiliki nilai NaN di kolom penting seperti gross dan budget.

In [6]:
# simply drop whole row with NaN in "budget" column
df.dropna(subset=["budget"], axis=0, inplace=True)

# simply drop whole row with NaN in "gross" column
df.dropna(subset=["gross"], axis=0, inplace=True)

# simply drop whole row with NaN in "genres" column
df.dropna(subset=["genres"], axis=0, inplace=True)

In [7]:
# reset index, because we droped rows
df.reset_index(drop=True, inplace=True)

Menggunakan frequency untuk missing data 'color'

In [8]:
# fill NaN in "color" with the most frequent value
df['color'].fillna(df['color'].mode()[0], inplace=True)
print(df.isnull().sum())

color                    0
director_name           10
duration                 0
gross                    0
genres                   0
movie_title              0
title_year               0
language                 0
country                  0
budget                   0
imdb_score               0
actors                   0
movie_facebook_likes     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['color'].fillna(df['color'].mode()[0], inplace=True)


Drop kolom 'director_name'

In [12]:
# Drop the 'director_name' columns
df.drop(['director_name'], axis=1, inplace=True)

In [13]:
print(df.head())

    color  duration        gross                                genres  \
0   color     240.0  116866727.0          Biography|Comedy|Crime|Drama   
1   color     195.0  408992272.0               Action|Adventure|Sci-Fi   
2  color      187.0   54116191.0  Crime|Drama|Mystery|Thriller|Western   
3   color     186.0      46495.0                                 Drama   
4   color     186.0  258355354.0                     Adventure|Fantasy   

                           movie_title  title_year language country  \
0              The Wolf of Wall Street        2013  English     USA   
1                           Iron Man 3        2013  English     USA   
2                    The Hateful Eight        2015  English     USA   
3                             Margaret        2011  English     usa   
4  The Hobbit: The Desolation of Smaug        2013  English     USA   

        budget  imdb_score                                             actors  \
0  100000000.0         8.2  Leonardo DiCaprio,M

Mengatasi nilai yang tidak konsisten

In [14]:
# Standardize 'color' column by converting to lowercase
df['color'] = df['color'].str.lower()

Mengubah atau hapus nilai-nilai yang tidak standar, seperti nilai negatif atau "N/A"

In [15]:
# Handle negative values in 'duration' and 'imdb_score' by replacing them with NaN
df['duration'] = df['duration'].apply(lambda x: np.nan if x < 0 else x)
df['imdb_score'] = df['imdb_score'].apply(lambda x: np.nan if x < 0 else x)

# Replace any remaining non-standard string values in object columns with NaN
object_cols = df.select_dtypes(include='object').columns
for col in object_cols:
    df[col] = df[col].replace(['N/A', '#N/A', 'NA', 'NULL', 'n/a', '#n/a', 'na', 'null'], np.nan)

# Display the number of missing values after handling non-standard values
print(df.isnull().sum())

color                   0
duration                1
gross                   0
genres                  0
movie_title             0
title_year              0
language                0
country                 0
budget                  0
imdb_score              2
actors                  0
movie_facebook_likes    0
dtype: int64


In [16]:
# Check for negative values in numerical columns
numerical_cols = df.select_dtypes(include=np.number).columns
for col in numerical_cols:
    if (df[col] < 0).any():
        print(f"Column '{col}' contains negative values.")

# Check for non-standard string values in object columns (beyond NaN which are already handled)
object_cols = df.select_dtypes(include='object').columns
for col in object_cols:
    # This is a basic check and might need adjustment based on actual non-standard values
    if df[col].astype(str).str.contains('N/A|#N/A|NA|NULL', case=False).any():
        print(f"Column '{col}' contains non-standard string values.")

Column 'movie_title' contains non-standard string values.
Column 'actors' contains non-standard string values.


Cek missing value setelah penanganan nilai negatif

In [17]:
print(df.isnull().sum())

color                   0
duration                1
gross                   0
genres                  0
movie_title             0
title_year              0
language                0
country                 0
budget                  0
imdb_score              2
actors                  0
movie_facebook_likes    0
dtype: int64


Melakukan pembersihan data kembali

drop row

In [19]:
# simply drop whole row with NaN in "duration" column
df.dropna(subset=["duration"], axis=0, inplace=True)

# simply drop whole row with NaN in "imdb_score" column
df.dropna(subset=["imdb_score"], axis=0, inplace=True)

In [20]:
# reset index, because we droped rows
df.reset_index(drop=True, inplace=True)

In [21]:
print("\nMissing values after cleaning and transformation:")
print(df.isnull().sum())


Missing values after cleaning and transformation:
color                   0
duration                0
gross                   0
genres                  0
movie_title             0
title_year              0
language                0
country                 0
budget                  0
imdb_score              0
actors                  0
movie_facebook_likes    0
dtype: int64


### Transformasi Data

Mengubah tipe data kolom menjadi tipe data yang sesuai

Cek tipe data

In [22]:
print(df.dtypes)

color                    object
duration                float64
gross                   float64
genres                   object
movie_title              object
title_year                int64
language                 object
country                  object
budget                  float64
imdb_score              float64
actors                   object
movie_facebook_likes      int64
dtype: object


Ubah tipe data
Konversi budget dan gross menjadi int/numerik

In [23]:
df["gross"] = df["gross"].astype("int")
df["budget"] = df["budget"].astype("int")

Cek kembali tipe data

In [24]:
print(df.dtypes)

color                    object
duration                float64
gross                     int64
genres                   object
movie_title              object
title_year                int64
language                 object
country                  object
budget                    int64
imdb_score              float64
actors                   object
movie_facebook_likes      int64
dtype: object


Memisahkan genre yang bergabung

In [25]:
# Separate genres into multiple columns
genres = df['genres'].str.get_dummies('|')
df = pd.concat([df, genres], axis=1)

Menghilangkan kolom 'genres' dan digantikan dengan memisah tiap genre

In [26]:
df.drop('genres', axis=1, inplace=True)

In [27]:
display(df.head())
print(df.columns)

Unnamed: 0,color,duration,gross,movie_title,title_year,language,country,budget,imdb_score,actors,...,History,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,color,240.0,116866727,The Wolf of Wall Street,2013,English,USA,100000000,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",...,0,0,0,0,0,0,0,0,0,0
1,color,195.0,408992272,Iron Man 3,2013,English,USA,200000000,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",...,0,0,0,0,0,1,0,0,0,0
2,color,187.0,54116191,The Hateful Eight,2015,English,USA,44000000,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",...,0,0,0,1,0,0,0,1,0,1
3,color,186.0,46495,Margaret,2011,English,usa,14000000,6.5,"Matt Damon,Kieran Culkin,John Gallagher Jr.",...,0,0,0,0,0,0,0,0,0,0
4,color,186.0,258355354,The Hobbit: The Desolation of Smaug,2013,English,USA,225000000,7.9,"Aidan Turner,Adam Brown,James Nesbitt",...,0,0,0,0,0,0,0,0,0,0


Index(['color', 'duration', 'gross', 'movie_title', 'title_year', 'language',
       'country', 'budget', 'imdb_score', 'actors', 'movie_facebook_likes',
       'Action', 'Adventure', 'Biography', 'Comedy', 'Crime', 'Drama',
       'Fantasy', 'History', 'Music', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western'],
      dtype='object')


Normalisasi teks untuk memastikan konsistensi

In [28]:
# Normalize text in object columns by converting to lowercase
object_cols_to_normalize = ['director_name', 'movie_title', 'language', 'country', 'actors']
for col in object_cols_to_normalize:
    if col in df.columns and df[col].dtype == 'object':
        df[col] = df[col].str.lower()

In [29]:
display(df.head())
print(df.columns)

Unnamed: 0,color,duration,gross,movie_title,title_year,language,country,budget,imdb_score,actors,...,History,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,color,240.0,116866727,the wolf of wall street,2013,english,usa,100000000,8.2,"leonardo dicaprio,matthew mcconaughey,jon favreau",...,0,0,0,0,0,0,0,0,0,0
1,color,195.0,408992272,iron man 3,2013,english,usa,200000000,7.2,"robert downey jr.,jon favreau,don cheadle",...,0,0,0,0,0,1,0,0,0,0
2,color,187.0,54116191,the hateful eight,2015,english,usa,44000000,7.9,"craig stark,jennifer jason leigh,zoë bell",...,0,0,0,1,0,0,0,1,0,1
3,color,186.0,46495,margaret,2011,english,usa,14000000,6.5,"matt damon,kieran culkin,john gallagher jr.",...,0,0,0,0,0,0,0,0,0,0
4,color,186.0,258355354,the hobbit: the desolation of smaug,2013,english,usa,225000000,7.9,"aidan turner,adam brown,james nesbitt",...,0,0,0,0,0,0,0,0,0,0


Index(['color', 'duration', 'gross', 'movie_title', 'title_year', 'language',
       'country', 'budget', 'imdb_score', 'actors', 'movie_facebook_likes',
       'Action', 'Adventure', 'Biography', 'Comedy', 'Crime', 'Drama',
       'Fantasy', 'History', 'Music', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western'],
      dtype='object')


In [30]:
df.to_csv('movie_dataset_cleaned.csv')

Melakukan verifikasi hasil

In [31]:
# Check for remaining missing values
print("\nMissing values after cleaning and transformation:")
print(df.isnull().sum())


Missing values after cleaning and transformation:
color                   0
duration                0
gross                   0
movie_title             0
title_year              0
language                0
country                 0
budget                  0
imdb_score              0
actors                  0
movie_facebook_likes    0
Action                  0
Adventure               0
Biography               0
Comedy                  0
Crime                   0
Drama                   0
Fantasy                 0
History                 0
Music                   0
Musical                 0
Mystery                 0
Romance                 0
Sci-Fi                  0
Sport                   0
Thriller                0
War                     0
Western                 0
dtype: int64
