In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# Load the raw drama table from a CSV path relative to the working directory,
# then preview its first rows as the cell output.
drama_df = pd.read_csv("./top100_kdrama_aug_2023.csv")
drama_df.head()

Unnamed: 0,Name,Year of release,Aired Date,Aired On,Number of Episode,Network,Duration,Content Rating,Synopsis,Cast,Genre,Tags,Rank,Rating
0,Move to Heaven,2021,"May 14, 2021",Friday,10,Netflix,52 min.,18+ Restricted (violence & profanity),Han Geu Roo is an autistic 20-year-old. He wor...,"Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Ju...","Life, Drama","Uncle-Nephew Relationship, Autism, Death, Mour...",#1,9.2
1,Weak Hero Class 1,2022,"Nov 18, 2022",Friday Duration: 40 min. Content Rating: 18+ ...,8,,40 min.,18+ Restricted (violence & profanity),Yeon Shi Eun is a model student who ranks at t...,"Park Ji Hoon, Choi Hyun Wook, Hong Kyung, Kim ...","Action, Youth, Drama","Violence, Friendship, Bromance, School Bullyin...",#2,9.1
2,Hospital Playlist Season 2,2021,"Jun 17, 2021 - Sep 16, 2021",Thursday,12,"Netflix, tvN",1 hr. 40 min.,15+ - Teens 15 or older,Everyday is extraordinary for five doctors and...,"Jo Jung Suk, Yoo Yeon Seok, Jung Kyung Ho, Kim...","Romance, Life, Drama, Medical","Multiple Mains, Band, Music, Multiple Couples,...",#3,9.1
3,Flower of Evil,2020,"Jul 29, 2020 - Sep 23, 2020","Wednesday, Thursday",16,tvN,1 hr. 10 min.,15+ - Teens 15 or older,Although Baek Hee Sung is hiding a dark secret...,"Lee Joon Gi, Moon Chae Won, Jang Hee Jin, Seo ...","Thriller, Romance, Crime, Melodrama","Deception, Family Secret, Mystery, Suspense, H...",#4,9.1
4,Hospital Playlist,2020,"Mar 12, 2020 - May 28, 2020",Thursday,12,"Netflix, tvN",1 hr. 30 min.,15+ - Teens 15 or older,The stories of people going through their days...,"Jo Jung Suk, Yoo Yeon Seok, Jung Kyung Ho, Kim...","Romance, Life, Drama, Medical","Multiple Mains, Slow Romance, Multiple Couples...",#5,9.1


In [4]:
# Print the per-column dtype and non-null count summary of the raw frame.
drama_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               100 non-null    object 
 1   Year of release    100 non-null    int64  
 2   Aired Date         100 non-null    object 
 3   Aired On           100 non-null    object 
 4   Number of Episode  100 non-null    int64  
 5   Network            99 non-null     object 
 6   Duration           100 non-null    object 
 7   Content Rating     100 non-null    object 
 8   Synopsis           100 non-null    object 
 9   Cast               100 non-null    object 
 10  Genre              100 non-null    object 
 11  Tags               100 non-null    object 
 12  Rank               100 non-null    object 
 13  Rating             100 non-null    float64
dtypes: float64(1), int64(2), object(11)
memory usage: 11.1+ KB


In [5]:
# Show the rows whose Network value is missing (NaN).
drama_df[drama_df['Network'].isna()]

Unnamed: 0,Name,Year of release,Aired Date,Aired On,Number of Episode,Network,Duration,Content Rating,Synopsis,Cast,Genre,Tags,Rank,Rating
1,Weak Hero Class 1,2022,"Nov 18, 2022",Friday Duration: 40 min. Content Rating: 18+ ...,8,,40 min.,18+ Restricted (violence & profanity),Yeon Shi Eun is a model student who ranks at t...,"Park Ji Hoon, Choi Hyun Wook, Hong Kyung, Kim ...","Action, Youth, Drama","Violence, Friendship, Bromance, School Bullyin...",#2,9.1


In [6]:
# List the distinct raw Network strings, including their exact whitespace.
drama_df['Network'].unique()

array(['Netflix ', nan, 'Netflix,  tvN ', 'tvN ', 'SBS ',
       'ENA,  Netflix ', 'jTBC,  Netflix,  TVING ', 'TVING,  tvN ',
       'SBS,  ViuTV ', 'MBC,  Netflix ', 'KBS2 ', 'jTBC ', 'MBC,  Viki ',
       'Netflix,  OCN ', 'MBC ', 'OCN ', 'Netflix,  SBS ', 'TVING ',
       'ENA,  SBS ', 'KBS2,  ViuTV ', 'jTBC,  Netflix '], dtype=object)

In [7]:
# List the distinct raw Content Rating strings.
drama_df['Content Rating'].unique()

array(['18+ Restricted (violence & profanity) ',
       '15+ - Teens 15 or older ', '13+ - Teens 13 or older '],
      dtype=object)

In [8]:
# Show the column labels of the raw frame.
drama_df.columns

Index(['Name', 'Year of release', 'Aired Date', 'Aired On',
       'Number of Episode', 'Network', 'Duration', 'Content Rating',
       'Synopsis', 'Cast', 'Genre', 'Tags', 'Rank', 'Rating'],
      dtype='object')

# 데이터 전처리

In [10]:
import re
import time

# Column labels of the cleaned table, in output order.
columns = [
    'name', 'release_year',
    'air_start', 'air_end',
    'air_on_1', 'air_on_2',
    'num_episodes',
    'net_1', 'net_2', 'net_3',
    'duration', 'restrict',
    'synopsis', 'cast', 'genre', 'tags',
    'rank', 'rating',
]
# Empty frame that already carries the cleaned schema.
drama_df_processed = pd.DataFrame(columns=columns)

# Fourteen independent empty lists, named after the raw columns.
# NOTE(review): nothing visible in this cell appends to them -- confirm
# whether a later cell relies on these names before removing them.
(
    name_list,
    year_list,
    air_date_list,
    air_on_list,
    num_ep_list,
    net_list,
    dur_list,
    restrict_list,
    synop_list,
    cast_list,
    genre_list,
    tags_list,
    rank_list,
    rating_list,
) = ([] for _ in range(14))

def air_on_preprocessing(air_on):
    """Split a raw ``Aired On`` cell into its first two weekday names.

    Some scraped cells run on into the following fields, e.g.
    ``"Friday Duration: 40 min. Content Rating: ..."``. Everything from the
    first ``Label:`` token onwards is discarded before the days are parsed.

    Parameters
    ----------
    air_on : str
        Raw cell text, e.g. ``"Wednesday,  Thursday "``. A non-string value
        (such as a NaN for a missing cell) is treated as "no days".

    Returns
    -------
    tuple
        ``(air_on_1, air_on_2)``. Each element is a whitespace-trimmed day
        name, or ``np.nan`` when that slot is absent. Days beyond the second
        are dropped because the output schema only has two slots.
    """
    if not isinstance(air_on, str):
        return np.nan, np.nan

    # Cut off any trailing "Label: value" text that leaked into this cell.
    spill = re.search(r'\w+:', air_on)
    if spill:
        air_on = air_on[:spill.start()]

    # Split on the bare comma and trim each piece: the raw data is not
    # consistent about the amount of whitespace around separators.
    days = [piece.strip() for piece in air_on.split(",")]
    days = [day for day in days if day]

    air_on_1 = days[0] if days else np.nan
    air_on_2 = days[1] if len(days) > 1 else np.nan
    return air_on_1, air_on_2

def net_preprocessing(net):
    """Split a raw ``Network`` cell into up to three network names.

    Parameters
    ----------
    net : str
        Comma-separated network names, e.g. ``"jTBC,  Netflix,  TVING "``.
        A non-string value (such as the float NaN pandas uses for a missing
        cell) is treated as "no networks".

    Returns
    -------
    tuple
        ``(net_1, net_2, net_3)``. Each element is a whitespace-trimmed name,
        or ``np.nan`` for an unused slot. Names beyond the third are dropped
        because the output schema only has three slots.
    """
    slots = [np.nan] * 3
    if not isinstance(net, str):
        return slots[0], slots[1], slots[2]

    # Split on the bare comma and trim each piece: the raw values carry
    # trailing spaces and a variable number of spaces after the comma.
    names = [piece.strip() for piece in net.split(",")]
    names = [name for name in names if name]

    for position, name in enumerate(names[:len(slots)]):
        slots[position] = name
    return slots[0], slots[1], slots[2]

def _to_iso_date(text):
    """Convert a date such as 'May 14, 2021' to the string '2021-05-14'."""
    return time.strftime("%Y-%m-%d", time.strptime(text.strip(), "%b %d, %Y"))


def _parse_air_dates(air_date):
    """Return (air_start, air_end) from 'start - end'; air_end is NaN if absent."""
    parts = air_date.split(' - ')
    start = _to_iso_date(parts[0])
    end = _to_iso_date(parts[1]) if len(parts) >= 2 else np.nan
    return start, end


def _duration_to_minutes(dur):
    """Convert '1 hr. 10 min.', '1 hr.' or '52 min.' to total minutes (int).

    Raises ValueError when neither an hour nor a minute figure is found.
    """
    hours = re.search(r'(\d+)\s*hr', dur)
    minutes = re.search(r'(\d+)\s*min', dur)
    if hours is None and minutes is None:
        raise ValueError("unrecognised duration: {!r}".format(dur))
    total = 60 * int(hours.group(1)) if hours else 0
    if minutes:
        total += int(minutes.group(1))
    return total


# Collect one cleaned record per raw row, then build the frame once at the
# end instead of enlarging it row by row.
records = []
for _, row in drama_df.iterrows():
    air_start, air_end = _parse_air_dates(row['Aired Date'])
    air_on_1, air_on_2 = air_on_preprocessing(row['Aired On'])

    try:
        net_1, net_2, net_3 = net_preprocessing(row['Network'])
    except (TypeError, IndexError):
        # Narrow safety net: a missing network is a float NaN rather than a
        # string, and a row may list more networks than there are net_* slots.
        net_1 = net_2 = net_3 = np.nan

    # Leading digits of the content rating ('18+ Restricted ...' -> 18);
    # NaN when the rating does not start with a number.
    age = re.match(r'\s*(\d+)', row['Content Rating'])
    restrict = int(age.group(1)) if age else np.nan

    # Rank stays a string with the leading '#' removed ('#81' -> '81').
    rank = row['Rank'].strip().strip("#")

    # synopsis, cast, genre and tags are passed through unparsed.
    records.append([
        row['Name'], row['Year of release'],
        air_start, air_end,
        air_on_1, air_on_2,
        row['Number of Episode'],
        net_1, net_2, net_3,
        _duration_to_minutes(row['Duration']), restrict,
        row['Synopsis'], row['Cast'], row['Genre'], row['Tags'],
        rank, row['Rating'],
    ])

# Keep the raw frame's index labels so rows line up with drama_df.
drama_df_processed = pd.DataFrame(records, columns=columns, index=drama_df.index)

drama_df_processed.tail(20)


Unnamed: 0,name,release_year,air_start,air_end,air_on_1,air_on_2,num_episodes,net_1,net_2,net_3,duration,restrict,synopsis,cast,genre,tags,rank,rating
80,Jewel in the Palace,2003,2003-09-15,2004-03-23,Monday,Tuesday,54,MBC,,,65,13,About 500 years ago during the time of Chosun ...,"Lee Young Ae, Ji Jin Hee, Hong Ri Na, Im Ho, Y...","Food, Historical, Romance, Medical","Strong Female Lead, Cooking, Royal Rules And E...",81,8.7
81,The First Responders,2022,2022-11-12,2022-12-30,Friday,Saturday,12,ENA,SBS,,60,18,Follow the joint operations of a police force ...,"Kim Rae Won, Son Ho Jun, Gong Seung Yeon, Kang...","Action, Thriller, Mystery, Drama","Teamwork, Firefighter Male Lead, Slight Romanc...",82,8.6
82,Tunnel,2017,2017-03-25,2017-05-21,Saturday,Sunday,16,OCN,,,64,15,"In 1986, Park Gwang Ho works as an excellent a...","Choi Jin Hyuk, Yoon Hyun Min, Lee Yoo Young, J...","Thriller, Mystery, Sci-Fi, Fantasy","Time Travel, Murder, Bromance, Hardworking Mal...",83,8.6
83,Descendants of the Sun,2016,2016-02-24,2016-04-14,Wednesday,Thursday,16,KBS2,,,60,15,A love story that develops between a surgeon a...,"Song Joong Ki, Song Hye Kyo, Jin Goo, Kim Ji W...","Action, Comedy, Romance, Melodrama","Hardworking Male Lead, Multiple Couples, Broma...",84,8.6
84,My Perfect Stranger,2023,2023-05-01,2023-06-20,Monday,Tuesday,16,KBS2,ViuTV,,70,15,Yoon Hae Joon is the youngest anchor to ever w...,"Kim Dong Wook, Jin Ki Joo, Seo Ji Hye, Lee Won...","Mystery, Romance, Drama, Sci-Fi","Time Travel, 1980s, Murder, Investigation, Cal...",85,8.6
85,Hotel del Luna,2019,2019-07-13,2019-09-01,Saturday,Sunday,16,tvN,,,80,15,Nestled deep in the heart of Seoul’s thriving ...,"IU, Yeo Jin Goo, Shin Jung Keun, Bae Hae Seon,...","Horror, Comedy, Romance, Fantasy","Boss-Employee Relationship, Strong Female Lead...",86,8.6
86,My Liberation Notes,2022,2022-04-09,2022-05-29,Saturday,Sunday,16,jTBC,Netflix,,67,15,Set in Sanpo Village where more people leave t...,"Lee Min Ki, Kim Ji Won, Son Suk Ku, Lee El, Le...","Romance, Life, Drama, Melodrama","Slow Burn Story, Introverted Female Lead, Lone...",87,8.6
87,Missing,2020,2020-08-29,2020-10-11,Saturday,Sunday,12,OCN,,,70,15,"A village holds the spirits of missing, deceas...","Go Soo, Heo Joon Ho, Ahn So Hee, Ha Joon, Seo ...","Thriller, Mystery, Supernatural","Ghost-seeing Male Lead, Spirit, Missing Person...",88,8.6
88,Law School,2021,2021-04-14,2021-06-09,Wednesday,Thursday,16,jTBC,Netflix,,65,15,When a grim incident occurs at their prestigio...,"Kim Myung Min, Kim Bum, Ryu Hye Young, Lee Jun...","Mystery, Law, Crime, Drama","Smart Male Lead, University, Death, Hardworkin...",89,8.6
89,Children of Nobody,2018,2018-11-21,2019-01-16,Wednesday,Thursday,32,MBC,,,30,15,Cha Woo Kyung is a child counselor who works a...,"Kim Sun Ah, Lee Yi Kyung, Nam Gyu Ri, Cha Hak ...","Thriller, Mystery, Psychological, Drama","Trauma, Child Abuse, Smart Female Lead, Murder...",90,8.6
