In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
base_path = Path(os.getcwd()) / "data" / "combined.csv"

In [None]:
df = pd.read_csv(base_path)

In [None]:
df

In [None]:
features = ['genre', 'cast']

In [None]:
df = df.fillna('')

In [None]:
df.cast.unique()

In [None]:
for feature in features:
  df[feature] = df[feature].apply(lambda x: x.split(','))

In [None]:
def get_director(x):
  for i in x:
    if "(dir.)" in i:
      return i.strip("(dir.)")
  return np.nan

def get_acting_cast(x):
  return [i for i in x if "(dir.)" not in i]

In [None]:
df['director'] = df.cast.apply(get_director)
df['cast'] = df.cast.apply(get_acting_cast)

In [None]:
df = df[df['synopsis'] != "n/a"]

In [None]:
def clean_data(x):
  if isinstance(x, list):
    return [str.lower(i.replace(" ", "")) for i in x]
  else:
    if isinstance(x, str):
      return str.lower(x.replace(" ", ""))
    else:
      return ''

In [None]:
features = ['cast', 'director', 'genre']
for f in features:
  df[f] = df[f].apply(clean_data)

In [None]:
df['synopsis'] = df.synopsis.apply(str.lower)
df['lang'] = df['lang'].apply(str.lower)

In [None]:
def create_soup(x):
  return x['synopsis'] + ' ' + ' '.join(x['genre']) + ' ' + ' '.join(x['cast']) + ' ' + x['director']

In [None]:
df['text'] = df.apply(create_soup, axis=1)

In [None]:
df = df.drop(['synopsis', 'cast', 'director', 'year'], axis=1)

In [None]:
df.head()

In [None]:
df.to_csv('final_processed.csv', sep=',', index=False, encoding='utf-16')

In [None]:
def get_recommendations(df,title, genre: list, lang, type="drama", num: int = 20):
  data = df[(df['type'] == type) & (df['lang'] == lang)]
  # Filter by genre
  if data.shape[0] == 0:
    return []
  data.reset_index(level=0, inplace=True)

  indices = pd.Series(data.index, index=data['title']).drop_duplicates()
  count = CountVectorizer(stop_words='english')
  count_matrix = count.fit_transform(data['text'])
  idx = indices[title]
  cosine_sim = cosine_similarity(count_matrix, count_matrix)
  scores = list(enumerate(cosine_sim[idx]))
  scores = sorted(scores, key = lambda x: x[1], reverse=True)
  scores = scores[1:num+1]

  movies = [i[0] for i in scores]
  return data[['title', 'link', 'genre']].iloc[movies]

In [None]:
get_recommendations(df,"Hanzawa Naoki 2", ['thriller'], 'japanese')

# Fetch Latest Data from 2021 to 2024

In [None]:
URL = "https://mydramalist.com/search?adv=titles&ty=68,77&co=3,1&re=2021,2024&rt=1,10&so=date&page={page}"

In [None]:
from bs4 import BeautifulSoup
import requests
import re

In [None]:
def get_metadata():
    url = URL.format(page=1)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # m-b-sm pull-right
    count = soup.find_all('p', {'class': 'm-b-sm pull-right'})
    count = count[0].text.strip(" results")
    entries = 20
    num_pages = round(int(count) / entries)
    return count, num_pages

In [None]:
count, pages = get_metadata()

In [None]:
def get_info_from_page(page_no=1):
    url = URL.format(page=page_no)
    response = requests.get(url)
    print("Got Response Code: ", response.status_code)
    if response.status_code != 200:
        print("Error in fetching page: ", page_no)
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.find_all('h6', {'class': 'text-primary title'})
    title_names = [i.text.strip("\n") for i in title]
    links = [i.find('a')['href'] for i in title]
    type_year = soup.find_all('span', {'class': 'text-muted'})
    type_year = [i.text for i in type_year]

    result = [{
        'id': links[i].split('-')[0][1:],
        'title': title_names[i],
        'type': "movie" if "movie" in type_year[i].split(' - ')[0].lower() else "drama",
        'year': type_year[i].split(' - ')[1],
        'link': links[i]
    } for i in range(len(title))]

    return result

In [1]:
import time

In [2]:
current_page = 1

In [None]:
results = []
for i in range(current_page, pages+1):
    print(f"Getting page {i} / {pages}")
    entries = get_info_from_page(i)
    if entries is None:
        time.sleep(5)
    results.extend(entries)
    current_page += 1

In [3]:
import pandas as pd

In [None]:
df = pd.DataFrame(results)

In [None]:
df.head()

In [None]:
df[df['title'].str.contains("Last Man")]

In [None]:
df.to_csv('metadata.csv', sep=',', index=False, encoding='utf-16')

In [4]:
lang_map = {
    "Japan": "Japanese",
    "South Korea": "Korean",
    "China": "Chinese",
}

In [None]:
def get_cast(link):
    url = "https://mydramalist.com" + link + "/cast"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    lists = soup.find_all('ul', {'class': 'list no-border p-b clear'})
    return lists

In [12]:
from PyMDL.Infopage import info as get_info_mdl
import pandas as pd

In [6]:
df = pd.read_csv('metadata.csv', encoding='utf-16')

In [7]:
# title	link	year	genre	synopsis	cast	lang	type
def get_data_per_entry(link):
    try:
        info = get_info_mdl(link)
        if info is None:
            return None
        if info.type == "Movie":
            release_date = info.date
        else:
            release_date = info.date.split(' - ')[0].split(',')[-1].lstrip().strip()
        
        genre = info.genre
        if isinstance(genre, list):
            genre = ', '.join(genre).strip()
        print(info.title, release_date, genre, info.type, info.country, info.casts, info.director)
        return {
            'title': info.title,
            'link': link,
            'year': release_date,
            'type': info.type,
            'synopsis': info.synopsis,
            'lang': lang_map[info.country],
            'genre': genre,
            'cast': ', '.join(info.casts).strip(),
            'director': info.director if info.director is not None else ""
        }
    except Exception as e:
        print(e)
        return None

In [8]:
data = []

In [9]:
d_lst = df.to_dict(orient='records')

In [10]:
current_index = 0

In [None]:
for idx in range(current_index, len(d_lst)):
    entries = d_lst[idx]
    print(f"Getting data for {entries['title']} ---> {idx} / {len(d_lst)}")
    entry = get_data_per_entry(f"https://mydramalist.com{entries['link']}")
    if entry is None:
        print("Error in fetching data, retrying...")
        time.sleep(3)
        entry = get_data_per_entry(f"https://mydramalist.com{entries['link']}")
    data.append(entry)
    current_index += 1

In [None]:
len(data)

In [None]:
f = open("data.txt", "w")
f.write(str(data))
f.close()

In [11]:
import threading
import time

In [12]:
main_data = []

In [13]:
def threaded(tid, start, end):
    t_data = []
    current_index = start
    print(f"Thread {tid} started")
    for idx in range(start, end):
        entries = d_lst[idx]
        print(f"Getting data for {entries['title']} ---> {idx} / {len(d_lst)}")
        entry = get_data_per_entry(f"https://mydramalist.com{entries['link']}")
        if entry is None:
            print("Error in fetching data, retrying...")
            time.sleep(3)
            entry = get_data_per_entry(f"https://mydramalist.com{entries['link']}")
        t_data.append(entry)
        current_index += 1
    print(f"Thread {tid} ended")
    main_data.extend(t_data)
    return t_data

In [14]:
start = 310
end = 2631
threads = []

for i in range(start, end, 100):
    s_idx = i
    e_idx = i + 100
    if e_idx > end:
        e_idx = end
    t = threading.Thread(target=threaded, args=(i, s_idx, e_idx))
    threads.append(t)
    t.start()

Thread 310 startedThread 410 started
Getting data for I:LOVE:DM ---> 410 / 2632

Getting data for Saiko no Obahan Nakajima Haruko ---> 310 / 2632
Thread 510 started
Getting data for Nevertheless, ---> 510 / 2632
Thread 610 started
Getting data for Action Hero ---> 610 / 2632
Thread 710 started
Getting data for Miss Osaka ---> 710 / 2632
Thread 810 started
Getting data for Nihon Chinbotsu: Kibo no Hito ---> 810 / 2632
Thread 910 started
Getting data for Zero Ability for Dating ---> 910 / 2632
Thread 1010 started
Getting data for The World of My 17 Season 2 ---> 1010 / 2632
Thread 1110 started
Getting data for Rokuhodo Yotsuiro Biyori ---> 1110 / 2632
Thread 1210 started
Getting data for Serve the People ---> 1210 / 2632
Thread 1310 started
Getting data for To Be Killed by a High School Girl ---> 1310 / 2632
Thread 1410 started
Getting data for Monstrous ---> 1410 / 2632
Thread 1510 started
Getting data for Red Bridge: Beginning ---> 1510 / 2632
Thread 1610 started
Getting data for Cabri

And So I'm at a Loss (2022) Oct 28, 2022 Drama Movie Japan ['Fujigaya Taisuke', 'Maeda Atsuko', 'Nakao Akiyoshi', 'Maiguma Katsuya', 'Nomura Shuhei', 'Karina'] Miura Daisuke
Getting data for Sadako DX ---> 1911 / 2632
To Be Killed by a High School Girl (2022) Apr  1, 2022 Thriller, Psychological, Drama Movie Japan ['Tanaka Kei', 'Minami Sara', 'Kawai Yuumi', 'Riko', 'Kayashima Mizuki', 'Hosoda Kanata'] Jojo Hideo
Getting data for Eternity Passes By ---> 1311 / 2632
Serve the People (2022) Feb 23, 2022 Romance, Drama, Melodrama Movie South Korea ['Yeon Woo Jin', 'Ji An', 'Jo Sung Ha', 'Kim Ji Chul', 'Han Min Yeop', 'Woo Ju Bin'] Jang Cheol Soo
Getting data for Hoshi Kara Kita Anata ---> 1211 / 2632
Red Bridge: Beginning (2022) Jun  4, 2022 Youth, Drama Movie Japan ['Toyoda Yudai', 'Okura Takato', 'Hyodo Katsumi', 'Matsuo Jun', 'Kida Keisuke', 'Sugawara Ken'] None
Getting data for To My Star Season 2: Our Untold Stories ---> 1511 / 2632
I:LOVE:DM (2021) 2021 Romance, Youth Drama South Ko

In [15]:
len(main_data)

2321

In [17]:
f = open("data.txt", "r")
old_data = f.read()
f.close()

In [18]:
from ast import literal_eval

In [19]:
old_data = literal_eval(old_data)

In [21]:
old_data.extend(main_data)

In [25]:
old_data = [i for i in old_data if i is not None]

In [27]:
df = pd.DataFrame(old_data)

In [29]:
df = df.drop_duplicates()

In [33]:
df.to_csv('new_data.csv', sep=',', index=False, encoding='utf-16')

In [34]:
df.head()

Unnamed: 0,title,link,year,type,synopsis,lang,genre,cast,director
0,Falling in Love with A# (2021),/753257-falling-in-love-with-a,2021,Movie,Akio visits the Nishimuro family of kimono dea...,Japanese,Romance,Kitaura Ayu,
1,Veils (2021),/748043-veils,2021,Movie,Ayumi and Sayaka are a closed lesbian couple l...,Japanese,"Romance, Drama",Nakayama Erika,Nakayama Erika
2,Blooming Love (2021),/720989-blooming-love,2021,Movie,Ji An is burdened with the courtship of a youn...,Korean,Romance,Lee Ji Yeon,Lee Ji Yeon
3,How Do I Kill That B? (2021),/734117-how-do-i-kill-that-b,2021,Movie,"Ha Yoon, who works as a maid in a mansion, is ...",Korean,"Thriller, Drama","Ha Yoon Kyung, Bang Hyo Rin",
4,Shiver (2021),/702319-shiver,2021,Movie,Taiko ensemble Kodo and composer Hino Koshiro ...,Japanese,Music,Shibukawa Kiyohiko,Toyoda Toshiaki


In [35]:
df = pd.read_csv('new_data.csv', encoding='utf-16')

In [36]:
df['type'] = df['type'].apply(str.lower)

In [37]:
df.director.unique()

array([nan, 'Nakayama Erika', 'Lee Ji Yeon', ...,
       'Hara Keinosuke, Yasoshima Miyako',
       'Higurashi Ken, Ochiai Masayuki', 'Han In Mi'], dtype=object)

In [40]:
df = df.fillna('')

In [41]:
features = ['genre', 'cast', 'director']
for feature in features:
  df[feature] = df[feature].apply(lambda x: x.split(','))

In [42]:
df.head()

Unnamed: 0,title,link,year,type,synopsis,lang,genre,cast,director
0,Falling in Love with A# (2021),/753257-falling-in-love-with-a,2021,movie,Akio visits the Nishimuro family of kimono dea...,Japanese,[Romance],[Kitaura Ayu],[]
1,Veils (2021),/748043-veils,2021,movie,Ayumi and Sayaka are a closed lesbian couple l...,Japanese,"[Romance, Drama]",[Nakayama Erika],[Nakayama Erika]
2,Blooming Love (2021),/720989-blooming-love,2021,movie,Ji An is burdened with the courtship of a youn...,Korean,[Romance],[Lee Ji Yeon],[Lee Ji Yeon]
3,How Do I Kill That B? (2021),/734117-how-do-i-kill-that-b,2021,movie,"Ha Yoon, who works as a maid in a mansion, is ...",Korean,"[Thriller, Drama]","[Ha Yoon Kyung, Bang Hyo Rin]",[]
4,Shiver (2021),/702319-shiver,2021,movie,Taiko ensemble Kodo and composer Hino Koshiro ...,Japanese,[Music],[Shibukawa Kiyohiko],[Toyoda Toshiaki]


In [46]:
# Get all rows where synopsis is an empty string
df = df[df['synopsis'] != ""]

In [48]:
def clean_data(x):
  if isinstance(x, list):
    return [str.lower(i.replace(" ", "")) for i in x]
  else:
    if isinstance(x, str):
      return str.lower(x.replace(" ", ""))
    else:
      return ''

In [49]:
features = ['cast', 'director', 'genre']
for f in features:
  df[f] = df[f].apply(clean_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f] = df[f].apply(clean_data)


In [50]:
df

Unnamed: 0,title,link,year,type,synopsis,lang,genre,cast,director
0,Falling in Love with A# (2021),/753257-falling-in-love-with-a,2021,movie,Akio visits the Nishimuro family of kimono dea...,Japanese,[romance],[kitauraayu],[]
1,Veils (2021),/748043-veils,2021,movie,Ayumi and Sayaka are a closed lesbian couple l...,Japanese,"[romance, drama]",[nakayamaerika],[nakayamaerika]
2,Blooming Love (2021),/720989-blooming-love,2021,movie,Ji An is burdened with the courtship of a youn...,Korean,[romance],[leejiyeon],[leejiyeon]
3,How Do I Kill That B? (2021),/734117-how-do-i-kill-that-b,2021,movie,"Ha Yoon, who works as a maid in a mansion, is ...",Korean,"[thriller, drama]","[hayoonkyung, banghyorin]",[]
4,Shiver (2021),/702319-shiver,2021,movie,Taiko ensemble Kodo and composer Hino Koshiro ...,Japanese,[music],[shibukawakiyohiko],[toyodatoshiaki]
...,...,...,...,...,...,...,...,...,...
2617,Kotodamasou (2021),https://mydramalist.com/709699-kotodama-so,2021,drama,Utagawa Kotoha is a bright 25-year-old woman. ...,Japanese,"[horror, supernatural]","[nishinonanase, nagayamakento, saitoyuki, miyo...","[higurashiken, ochiaimasayuki]"
2618,We Have to Love Each Other (2021),https://mydramalist.com/712005-we-have-to-love...,"Oct 9, 2021",movie,The only school in the riverside village is ab...,Korean,[drama],[],[]
2619,Nobody′s Lover (2021),https://mydramalist.com/712533-nobody-s-lover,"Oct 9, 2021",movie,Eighteen-year-old Yu Jin is leading a harsh li...,Korean,[drama],"[hwangboun, seoyounghee, kimdogyeong, parkjeon...",[haninmi]
2620,Real Siblings (2021),https://mydramalist.com/717013-real-siblings,2021,drama,A realistic story of a real brother and sister...,Korean,[comedy],[],[]


In [51]:
df['synopsis'] = df.synopsis.apply(str.lower)
df['lang'] = df['lang'].apply(str.lower)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['synopsis'] = df.synopsis.apply(str.lower)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lang'] = df['lang'].apply(str.lower)


In [54]:
def create_soup(x):
  return x['synopsis'] + ' ' + ' '.join(x['genre']) + ' ' + ' '.join(x['cast']) + ' ' + ' '.join(x['director'])

In [55]:
df['text'] = df.apply(create_soup, axis=1)

In [56]:
df = df.drop(['synopsis', 'cast', 'director', 'year'], axis=1)

In [57]:
df.head()

Unnamed: 0,title,link,type,lang,genre,text
0,Falling in Love with A# (2021),/753257-falling-in-love-with-a,movie,japanese,[romance],akio visits the nishimuro family of kimono dea...
1,Veils (2021),/748043-veils,movie,japanese,"[romance, drama]",ayumi and sayaka are a closed lesbian couple l...
2,Blooming Love (2021),/720989-blooming-love,movie,korean,[romance],ji an is burdened with the courtship of a youn...
3,How Do I Kill That B? (2021),/734117-how-do-i-kill-that-b,movie,korean,"[thriller, drama]","ha yoon, who works as a maid in a mansion, is ..."
4,Shiver (2021),/702319-shiver,movie,japanese,[music],taiko ensemble kodo and composer hino koshiro ...


In [58]:
orig_df = pd.read_csv('final_processed.csv', encoding='utf-16')

In [60]:
df['genre'] = df['genre'].apply(str)

In [62]:
orig_df.head()

Unnamed: 0,title,link,genre,lang,type,text
0,Move to Heaven,/49231-move-to-heaven,"['life', 'drama', 'family']",korean,drama,han geu roo is a 20-year-old with autism. he w...
1,Extraordinary Attorney Woo,/705723-strange-lawyer-woo-young-woo,"['law', 'romance', 'life', 'drama']",korean,drama,about a 27-year-old lawyer on the autism spect...
2,Flower of Evil,/54625-flower-of-evil,"['thriller', 'romance', 'crime', 'melodrama']",korean,drama,although baek hee sung is hiding a dark secret...
3,Nirvana in Fire,/9025-nirvana-in-fire,"['military', 'historical', 'drama', 'political']",chinese,drama,"in sixth-century china, the emperor of great l..."
4,Hospital Playlist,/36269-doctor-playbook,"['friendship', 'romance', 'life', 'medical']",korean,drama,the stories of people going through their days...


In [63]:
complete_df = pd.concat([orig_df, df])

In [65]:
complete_df.shape

(12354, 6)

In [66]:
complete_df = complete_df.drop_duplicates()

In [68]:
complete_df.to_csv('final_processed_full.csv', sep=',', index=False, encoding='utf-16')

In [69]:
complete_df.head()

Unnamed: 0,title,link,genre,lang,type,text
0,Move to Heaven,/49231-move-to-heaven,"['life', 'drama', 'family']",korean,drama,han geu roo is a 20-year-old with autism. he w...
1,Extraordinary Attorney Woo,/705723-strange-lawyer-woo-young-woo,"['law', 'romance', 'life', 'drama']",korean,drama,about a 27-year-old lawyer on the autism spect...
2,Flower of Evil,/54625-flower-of-evil,"['thriller', 'romance', 'crime', 'melodrama']",korean,drama,although baek hee sung is hiding a dark secret...
3,Nirvana in Fire,/9025-nirvana-in-fire,"['military', 'historical', 'drama', 'political']",chinese,drama,"in sixth-century china, the emperor of great l..."
4,Hospital Playlist,/36269-doctor-playbook,"['friendship', 'romance', 'life', 'medical']",korean,drama,the stories of people going through their days...


In [67]:
import pandas as pd
import swifter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('data/final_processed_full.csv', encoding='utf-16')

In [3]:
from PyMDL.Infopage import info as get_info_mdl

In [4]:
def get_image(df: pd.DataFrame):
    try:
        print(f"Processing {df['title']}")
        info = get_info_mdl(df['link'])
        if info is None:
            return None
        return info.thumbnail
    except Exception as e:
        print(e)
        return None

In [5]:
df['image'] = df.swifter.allow_dask_on_strings(enable=True).apply(get_image, axis=1)

Processing Dok Go Bin Is Updating
Processing Shaolin Temple 2: Kids from ShaolinProcessing Megalopolis Expressway Trial: Max

Processing Fearless Kungfu King
Processing Apat Dapat, Dapat Apat
Processing The Endless Love
Processing Shin Ultraman
Processing Phone Call To The Bar 2
Processing Bump Off Lover
Processing Mother
Processing Cherry Magic! Thirty Years of Virginity Can Make You a Wizard?!: The Movie
Processing Majurat See Nam Pueng
Processing Special Actors
Processing Miss Butcher
Processing Asuko March!
Processing A Man Called Pirate
Processing Brothel 8
Processing Rich Family's Son
Processing Girlfriend The Movie (Ex)
Processing The Professor's Beloved Equation
Processing Sud Sai Pan
Processing Kanon
Processing Sid and Aya: Not a Love Story
Processing Triple Fling
Processing Sunflower
Processing Woman in the Dunes
Processing Ruk Sutrit
Processing Leaving on the 15th Spring
Processing In Our Prime
Processing Liar
Processing Finding Mr. Right 2: Book of Love
Processing Battle of

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

Processing Gakko no Toshi Densetsu Toire no Hanako-san
Processing Stairway to Stardom
Processing Branded to Kill
Processing Before a Falling Star Fades Away
Processing S.W.A.T.
Processing 4 Reasons Why I Hate Christmas
Processing A Man Called Pirate
Processing Citizen Jake
Processing The Whirlwind Girl 2
Processing Restart: Runway-Episode Zero
Processing TRICK: The Movie 2
Processing Sennyuu Tantei Tokage
Processing Young Aunt
Processing Pushing Hands
Processing Samee
Processing Erotic Tutoring 2
Processing Garo And The Wailing Dragon
Processing The [email protected]
Processing Awaken
Processing DOG x POLICE: The K-9 Force
Processing Chained Flowers
Processing Wish Woosh 2
Processing Into the Faraway Sky
Processing Dipped in Gold
Processing Rak Kan Panlawan
Processing Begin Again
Processing Nezha Conquers the Dragon King
Processing Hi My Sweetheart
Processing An Actor's Revenge
Processing My sister, Sister, Sister...
Processing Love O'Clock
Processing Carrying Spring
Processing March G

Dask Apply:   6%|▋         | 1/16 [50:21<12:35:18, 3021.23s/it]

HTTPSConnectionPool(host='mydramalist.com', port=443): Max retries exceeded with url: /19556-the-legend-of-the-condor-heroes (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x15085a550>: Failed to resolve 'mydramalist.com' ([Errno 8] nodename nor servname provided, or not known)"))
Processing Ojakgyo Brothers
HTTPSConnectionPool(host='mydramalist.com', port=443): Max retries exceeded with url: /4635-deranged (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x165179910>: Failed to resolve 'mydramalist.com' ([Errno 8] nodename nor servname provided, or not known)"))
HTTPSConnectionPool(host='mydramalist.com', port=443): Max retries exceeded with url: /705663-sakeguse-50 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x16265fb10>: Failed to resolve 'mydramalist.com' ([Errno 8] nodename nor servname provided, or not known)"))
HTTPSConnectionPool(host='mydramalist.com', port=443): Max retries exceeded 

Dask Apply:  31%|███▏      | 5/16 [55:42<1:35:01, 518.29s/it]  

Processing Pornographer - Spring Life (2021)
Processing FAKE MOTION: Tatta Hitotsu no Negai (2021)
Processing Nijiiro Karute (2021)
Processing Nara's Marvelous Days (2021)
Processing A High School Girl Works Part-Time at a Convenience Store (2021)
Processing Ore no Ie no Hanashi (2021)
Processing To My Star (2021)
Processing The Night Beyond the Tricornered Window (2021)
Processing Dream Team (2021)
Processing Croissant (2021)
Processing True Scary Story -Accident property entertainer- (2021)
Processing Mokomi: Kanojo Chotto Hendakedo (2021)
Processing Red Eyes: Kanshi Sousa-han (2021)
Processing Convenience Store Fling (2021)
Processing Love (ft. Marriage and Divorce) (2021)
Processing Dekke Furoba de Mattemasu (2021)
Processing Anonymous: Keishicho ”Yubisatsujin” Taisakushitsu (2021)
Processing Replay: The Moment When It Starts Again (2021)
Processing Toy Soldiers: Fake Men 2 The Complete (2021)
Processing Three Sisters (2021)
Processing Rule of the Game: Human Hunting (2021)
Process

Dask Apply: 100%|██████████| 16/16 [56:17<00:00, 211.09s/it] 


In [7]:
df.to_csv('data/final_processed_full_with_image.csv', sep=',', index=False, encoding='utf-16')

In [1]:
import pandas as pd
df = pd.read_csv('data/final_processed_full_with_image.csv', encoding='utf-16')

In [2]:
df.image.isna().sum()

2737

In [3]:
df.image = df.image.fillna('https://via.placeholder.com/150')

In [4]:
df.image.isna().sum()

0

In [5]:
df.to_csv('data/final_processed_full_with_image.csv', sep=',', index=False, encoding='utf-16')

In [6]:
df = pd.read_csv('data/final_processed_full_with_image.csv', encoding='utf-16')

In [7]:
empty_images = df[df['image'] == "https://via.placeholder.com/150"]

In [8]:
empty_images.shape

(2737, 7)

In [29]:
import requests
from bs4 import BeautifulSoup

In [58]:
def get_image(df: pd.DataFrame):
    try:
        if str(df['link']).startswith("https"):
            url = df['link']
        else:
            url = f"https://mydramalist.com{df['link']}"
        print(f"Processing {df['title']}")
        data = requests.get(url)
        soup = BeautifulSoup(data.text, 'html.parser')
        image = soup.find_all('div', {'class': 'col-sm-4 film-cover cover'})
        image = image[0].find_all('img')
        print(image[0]['src'])
        return image[0]['src']
    except Exception as e:
        print(e)
        return None

In [62]:
slice = empty_images.iloc[0:2]

In [63]:
slice

Unnamed: 0,title,link,genre,lang,type,text,image
655,With You,/58585-together,"['drama', 'medical']",chinese,drama,with you tells the stories of the chinese mili...,https://via.placeholder.com/150
656,The Queen's Classroom,/490-the-queens-classroom,"['school', 'drama']",japanese,drama,a new school year begins at hanzaki elementary...,https://via.placeholder.com/150


In [64]:
res = slice.copy()

In [65]:
res['image'] = res.apply(get_image, axis=1)

Processing With You
https://i.mydramalist.com/vxlJW_4c.jpg?v=1
Processing The Queen's Classroom
https://i.mydramalist.com/vN35gc.jpg?v=1


In [66]:
res

Unnamed: 0,title,link,genre,lang,type,text,image
655,With You,/58585-together,"['drama', 'medical']",chinese,drama,with you tells the stories of the chinese mili...,https://i.mydramalist.com/vxlJW_4c.jpg?v=1
656,The Queen's Classroom,/490-the-queens-classroom,"['school', 'drama']",japanese,drama,a new school year begins at hanzaki elementary...,https://i.mydramalist.com/vN35gc.jpg?v=1


In [68]:
empty_images['image'] = empty_images.swifter.allow_dask_on_strings(enable=True).apply(get_image, axis=1)

Processing Ten Years of Love
Processing Soirée
Processing Mairunovich (2021)
https://i.mydramalist.com/oeEQzc.jpg?v=1
Processing The M Riders 5
https://i.mydramalist.com/0Rv2e_4c.jpg?v=1
Processing Chilsu and Mansu
https://i.mydramalist.com/67rx0_4c.jpg?v=1
Processing The Limit (2021)
https://i.mydramalist.com/akBZgc.jpg?v=1
Processing Awakened Demon
https://i.mydramalist.com/RYJgrc.jpg?v=1
Processing Paano ang Pasko?
https://i.mydramalist.com/XX3rg_4c.jpg?v=1
Processing Kanojo no Ura Sekai: Akiko Side (2021)
https://i.mydramalist.com/q67K5_4c.jpg?v=1
Processing Hanging Garden
https://i.mydramalist.com/EZx8O_4c.jpg?v=1
Processing Without Knowing Dawn Break in the East
https://i.mydramalist.com/QARe5_4c.jpg?v=1
Processing Scripting Your Destiny (2021)
https://i.mydramalist.com/xjP1Xc.jpg?v=1
Processing Mr. Socrates
https://i.mydramalist.com/p8RNyc.jpg?v=1
Processing Green Grass by the River
https://i.mydramalist.com/dkrdD_4c.jpg?v=1
Processing Dangerous Relationship: Professor and Femal

Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

Processing Pornographer: Continued Spring Life (2021)
Processing Her Legend
Processing Seiri-Chan
https://i.mydramalist.com/xO70j_4c.jpg?v=1
Processing Insa (2021)
https://i.mydramalist.com/X8gqnc.jpg?v=1
Processing I am...
https://i.mydramalist.com/XK4mpc.jpg?v=1
Processing Hanging Garden
https://i.mydramalist.com/1Xlw3_4c.jpg?v=1
Processing Don't Go Too Far (2021)
https://i.mydramalist.com/7nKlec.jpg?v=1
Processing Love Poison
https://i.mydramalist.com/vqmLpc.jpg?v=1
Processing Advice (2021)
https://i.mydramalist.com/wl2vkc.jpg?v=1
Processing Momo Salon
https://i.mydramalist.com/xjP1Xc.jpg?v=1
Processing Didi's Dreams
https://i.mydramalist.com/royA2_4c.jpg?v=1
Processing It's Okay, That's Friendship (2021)
https://i.mydramalist.com/8qr02c.jpg?v=1
Processing Sanaeha Sunya Kaen
https://i.mydramalist.com/X44Pdc.jpg?v=1
Processing Mr. Socrates
https://i.mydramalist.com/rl6Vp_4c.jpg?v=1
Processing To My Star (Movie) (2021)
https://i.mydramalist.com/4eWwQc.jpg?v=1
Processing HIStory1: My H

Dask Apply:   6%|▋         | 1/16 [09:54<2:28:38, 594.54s/it]

https://i.mydramalist.com/85qEbc.jpg?v=1
https://i.mydramalist.com/4kP2Kc.jpg?v=1
Processing Showa Genroku Rakugo Shinju
https://i.mydramalist.com/4Z03k_4c.jpg?v=1
Processing Hey! Our Dear Don-chan (2022)
https://i.mydramalist.com/qqYn5c.jpg?v=1
Processing Who is the Winner
https://i.mydramalist.com/RB0q5x_4c.jpg?v=1
Processing My Perfect Roommate (2022)
https://i.mydramalist.com/Md5bec.jpg?v=1
Processing Like a Flowing River
https://i.mydramalist.com/dgkWD_4c.jpg?v=1
Processing Move to Mind (2022)
https://i.mydramalist.com/bwny6c.jpg?v=1
Processing Poot Ratikarn
https://i.mydramalist.com/ZOjJk_4c.jpg?v=1
Processing Kyakuhon Geinin (2022)
https://i.mydramalist.com/QlmwQ_4c.jpg?v=1
Processing Animal Whisper
https://i.mydramalist.com/roeDp_4c.jpg?v=1
Processing Animals (2022)
https://i.mydramalist.com/exEDE_4c.jpg?v=1
Processing Viral Scandal
https://i.mydramalist.com/vpn0e_4c.jpg?v=1
Processing Just For Laughs! (2022)
https://i.mydramalist.com/ZLAvW_4c.jpg?v=1
Processing The Message
htt

Dask Apply:  31%|███▏      | 5/16 [14:47<27:38, 150.78s/it]  

https://i.mydramalist.com/0wmO66_4c.jpg?v=1
Processing Dr. Chocolate (2023)
https://i.mydramalist.com/jgbRd_4c.jpg?v=1
https://i.mydramalist.com/4eexyd_4c.jpg?v=1
Processing Tsukuyomi-kun no Kindan Oyashoku (2023)
https://i.mydramalist.com/eYyExd_4c.jpg?v=1
Processing Last Man: Zenmo no Sosakan (2023)
https://i.mydramalist.com/eYz3Dn_4c.jpg?v=1
Processing Fixer (2023)
https://i.mydramalist.com/Xdjx6q_4c.jpg?v=1
Processing Queen of Masks (2023)
https://i.mydramalist.com/QJ1l3g_4c.jpg?v=1
Processing Haru wa Mijikashi Koiseyo Danshi. (2023)
https://i.mydramalist.com/QJJLbW_4c.jpg?v=1
Processing The Good Bad Mother (2023)
https://i.mydramalist.com/E5x540_4c.jpg?v=1
Processing Dream (2023)
https://i.mydramalist.com/E5xv7b_4c.jpg?v=1
Processing Happy Merry Ending (2023)
https://i.mydramalist.com/d02VOb_4c.jpg?v=1
Processing Trap by Seo In Guk (2023)
https://i.mydramalist.com/d0yby0_4c.jpg?v=1
Processing Dr. Romantic Season 3 (2023)
https://i.mydramalist.com/d02LlA_4c.jpg?v=1
Processing Knigh

Dask Apply: 100%|██████████| 16/16 [15:04<00:00, 56.53s/it]

https://i.mydramalist.com/wJJnjb_4c.jpg?v=1



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty_images['image'] = empty_images.swifter.allow_dask_on_strings(enable=True).apply(get_image, axis=1)


In [71]:
# Replace all empty images with the new ones
df.loc[empty_images.index] = empty_images

In [73]:
df[df['image'] == "https://via.placeholder.com/150"]

Unnamed: 0,title,link,genre,lang,type,text,image


In [74]:
df.to_csv('data/final_processed_full_with_image.csv', sep=',', index=False, encoding='utf-16')