In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Indonesian Wikipedia page that lists films released in 2023.
# The tables used later are the ones carrying the "wikitable" CSS class.
wikiurl='https://id.wikipedia.org/wiki/Daftar_film_Indonesia_tahun_2023'
# Bound the wait so a stalled connection cannot hang the notebook forever.
response = requests.get(wikiurl, timeout=30)
print(response.status_code)
# Stop here on a 4xx/5xx reply instead of parsing an error page downstream.
response.raise_for_status()

200


In [3]:
# Build a BeautifulSoup tree from the downloaded markup using Python's
# built-in HTML parser.
soup = BeautifulSoup(response.text, 'html.parser')
# Collect every <table> element tagged with the "wikitable" CSS class.
tables = soup.find_all('table', class_='wikitable')

In [4]:
# Turn every matched <table> into a DataFrame; `df` is a list of frames.
# The markup is wrapped in StringIO because handing a literal HTML string to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(tables)))
# Remove the first table from the list (and display it): it has a different
# layout (Peringkat / Judul / Produksi / Penonton) from the remaining tables.
df.pop(0)

Unnamed: 0,Peringkat,Judul,Produksi,Penonton
0,1,Sewu Dino,MD Pictures,4.891.609
1,2,Di Ambang Kematian,MVP Pictures,3.302.047
2,3,Air Mata di Ujung Sajadah,Beehave Pictures,3.127.671
3,4,172 Days,Starvision Plus,3.086.659
4,5,Petualangan Sherina 2,Miles Films,2.414.504
5,6,Waktu Maghrib,Rapi Films,2.409.112
6,7,Suzzanna: Malam Jumat Kliwon,Soraya Intercine Films,2.189.363
7,8,Siksa Neraka,Dee Company,2.055.308
8,9,Sijjin,Rapi Films,1.930.901
9,10,Panggonan Wingit,Hitmaker Studios,1.728.714


In [5]:
# Stack the remaining tables into a single frame with a fresh 0..n-1 index.
# One concat call avoids re-copying the accumulated frame on every iteration
# and avoids seeding the result with an empty DataFrame, which newer pandas
# versions warn about when determining result dtypes.
data = pd.concat(df, ignore_index=True)

In [6]:
# Summarize the combined frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Tayang            131 non-null    object 
 1   Tayang.1          126 non-null    object 
 2   Judul             126 non-null    object 
 3   Sutradara         126 non-null    object 
 4   Pemeran           125 non-null    object 
 5   Genre             126 non-null    object 
 6   Produksi          126 non-null    object 
 7   Distributor       19 non-null     object 
 8   Penonton          106 non-null    object 
 9   Klasifikasi usia  116 non-null    object 
 10  Ref.              88 non-null     object 
 11  Unnamed: 11       0 non-null      float64
dtypes: float64(1), object(11)
memory usage: 12.4+ KB


In [7]:
# Rows without a film title ('Judul') are discarded: select them first, then
# remove them from `data` in place by index label.
title_missing = data['Judul'].isna()
empty_title = data.loc[title_missing]
data.drop(index=empty_title.index, inplace=True)

In [8]:
# Discard the reference column and the unnamed, entirely empty column.
data.drop(columns=['Ref.', 'Unnamed: 11'], inplace=True)

In [9]:
# Letter-spaced, upper-case Indonesian month labels in calendar order.
# They are used to translate the 'Tayang' column into month numbers.
_month_labels = (
    'J A N U A R I',
    'F E B R U A R I',
    'M A R E T',
    'A P R I L',
    'M E I',
    'J U N I',
    'J U L I',
    'A G U S T U S',
    'S E P T E M B E R',
    'O K T O B E R',
    'N O V E M B E R',
    'D E S E M B E R',
)
# Month label -> month number (1 = January ... 12 = December).
bulan_angka = {label: number for number, label in enumerate(_month_labels, start=1)}

In [10]:
# Translate the month label into its month number via the lookup table;
# labels missing from the table become NaN.
month_number = data['Tayang'].map(bulan_angka)
data['Tayang'] = month_number
# The second 'Tayang' column is converted to integers so it can serve as the
# day component of the release date.
day_of_month = data['Tayang.1'].astype(int)
data['Tayang.1'] = day_of_month

In [11]:
# Assemble a "month-day-2023" string from the numeric month ('Tayang') and
# day ('Tayang.1') columns.
data['Date'] = data['Tayang'].astype(str)+'-'+data['Tayang.1'].astype(str)+'-'+'2023'
# Parse with an explicit month-first format so values such as "1-5-2023" can
# never be interpreted day-first and pandas does not have to guess the layout.
data['Date'] = pd.to_datetime(data['Date'], format='%m-%d-%Y')

In [12]:
# Audience figures use '.' as the thousands separator (e.g. "4.891.609").
# regex=False makes the dot a literal character on every pandas version; when
# treated as a regular expression, '.' matches any character and would blank
# out the whole value.
# NOTE(review): if read_html already parsed some values as floats (a table
# whose figures all contain a single dot), they may be wrong before this
# step — verify against the source page.
data['Penonton'] = data['Penonton'].astype(str).str.replace('.', '', regex=False)
# Anything that is still not a number (including the "nan" text produced from
# missing values by astype(str)) is coerced to NaN.
data['Penonton'] = pd.to_numeric(data['Penonton'], errors='coerce', downcast='signed')

In [13]:
# Films with no distributor listed are labelled 'Bioskop'.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form warns in pandas 2.x and no longer
# updates `data` once Copy-on-Write is enabled.
data['Distributor'] = data['Distributor'].fillna('Bioskop')

In [14]:
# Keep only the first production company when several are listed, separated
# by ", ". The vectorised .str accessor leaves missing values as NaN instead
# of raising, and no longer rebinds the built-in name `list`.
data['Produksi'] = data['Produksi'].str.split(', ').str[0]

In [15]:
# Keep only the first genre when several are listed, separated by ", ".
# The vectorised .str accessor leaves missing values as NaN instead of
# raising on a non-subscriptable float.
data['Genre'] = data['Genre'].str.split(', ').str[0]

In [16]:
# The month/day helper columns have been folded into 'Date'; drop them along
# with the cast column.
data.drop(columns=['Tayang','Tayang.1', 'Pemeran'], inplace=True)

In [17]:
# PyMovieDb's IMDB client is used below to look up each film by title; the
# lookup result is decoded with json.loads, hence the json import.
from PyMovieDb import IMDB
import json
imdb = IMDB()

In [18]:
# Look up every title on IMDb and collect its rating value, one entry per row
# of data['Judul'] and in the same order. Titles without a usable rating get
# NaN so the list stays aligned with the frame.
rate=[]
for title in data['Judul']:
    res = imdb.get_by_name(title, tv=False)
    try:
        film = json.loads(res)
        rate.append(film['rating']['ratingValue'])
    except (KeyError, TypeError, ValueError):
        # KeyError: the response has no rating; TypeError: unexpected response
        # shape; ValueError: the response is not valid JSON. A bare `except:`
        # would also swallow KeyboardInterrupt, making the loop hard to stop.
        rate.append(np.nan)

In [19]:
# Attach the collected ratings as a new column; `rate` holds one entry per
# row of data['Judul'], in the same order.
data['Rating'] = rate

In [20]:
# Show the films that are missing a rating or an age classification.
missing_rating = data['Rating'].isna()
missing_age_class = data['Klasifikasi usia'].isna()
data[missing_rating | missing_age_class]

Unnamed: 0,Judul,Sutradara,Genre,Produksi,Distributor,Penonton,Klasifikasi usia,Date,Rating
11,Adagium,Rizal Mantovani,Drama,Brainstorminc,Bioskop,41645.0,13+,2023-01-26,
29,"Glo, Kau Cahaya",Ani Ema Susanti,Drama,Bhuana Art Sinema,Bioskop,3100.0,SU,2023-03-09,
33,Sukmailang,Rizqon Agustia Fahsa,Horor,Genia Visinema,Bioskop,,13+,2023-03-16,
38,Teman Tidur,Ray Nayoan,Horor,Robagu Pictures,Bioskop,18786.0,17+,2023-03-30,
41,Kartu Pos Wini,Tarmizi Abka,Drama,Sinemata Productions,Bioskop,5057.0,SU,2023-04-06,
48,Angel: Kami Semua Punya Mimpi,Ivan Hamdhani Putra,Drama,Fast Films,Bioskop,7019.0,13+,2023-05-04,
54,LDR: Love Distance Relationshi*,George Timothy,Drama,Creative Goods Inc.,Maxstream,,,2023-05-31,
57,Mbutik,Rizal Wimba,Drama,Darinol Production,Bioskop,3957.0,,2023-06-01,
68,Kutukan Peti Mati,Irham Acho Bachtiar,Horor,Balai Pustaka,Bioskop,53822.0,13+,2023-07-20,
70,Ketika Berhenti di Sini,Umay Shahab,Drama,Sinemaku Pictures,Bioskop,1604359.0,13+,2023-07-27,


In [21]:
# Write the cleaned table to a CSV file, leaving out the DataFrame index.
data.to_csv('film_indonesia_2023.csv',index=False)