In [2]:
import numpy as np
import pandas as pd
import bs4
from bs4 import BeautifulSoup
import requests
import re
import os

from bsoup_scrape import bsoup_scrape_data
from selenium_scrape import scrape_page2

In [2]:
# Accumulators for the listing-page fields; these six lists are the ones handed to
# bsoup_scrape_data in Part 1. Each name is bound to its own, separate empty list.
titles, years, episodes, countries, scores, ranks = ([] for _ in range(6))

# Accumulators for the per-title fields.
# NOTE(review): presumably filled by the Selenium scrape - confirm; no visible cell
# in this notebook appends to them.
titles_p2, genres, network, watchers = ([] for _ in range(4))

# Part 1 - Beautiful Soup

This section uses Beautiful Soup to scrape everything shown on the first page (the one listing all the titles).

In [42]:
# URL to scrape: the listing URL is the base plus a page number kept as a string.
url_base = 'https://mydramalist.com/shows/top?page='
page_number = "27"
url = url_base+page_number

# Send a GET request to the URL. The timeout keeps a stalled connection from hanging
# the kernel indefinitely, and raise_for_status() stops the cell on an HTTP error
# instead of parsing an error page as if it were the listing.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Project helper: receives the six accumulator lists plus the parsed page.
# NOTE(review): presumably appends one entry per show to each list - confirm in
# bsoup_scrape.py. If so, re-running this cell for the same page adds the rows again.
bsoup_scrape_data(titles, years, episodes, countries, scores, ranks, soup)

# One column per accumulator list; pandas raises ValueError if the lists differ in length.
my_drama_list = pd.DataFrame({
    "title": titles,
    "year": years,
    "episodes": episodes,
    "country": countries,
    "viewer_score": scores,
    "rank": ranks
})
soup_scrape = pd.DataFrame(my_drama_list)

#if the csv file already exists, delete it
#if os.path.exists("msoup_scrapedl.csv"):
   #os.remove("soup_scrape.csv")

# commenting this out so I don't accidentally delete it again
# soup_scrape.to_csv("combined_soup.csv", index=False) 

# Part 2 - Selenium

This section uses Selenium to click through each title page and scrape additional elements.

See selenium_notes.ipynb for this code

In [1]:
import pandas as pd
import os
# The combined file written by earlier runs, plus the individual ss_df19 ... ss_df26 exports.
ss = pd.read_csv('combined_selenium.csv')
ss19 = pd.read_csv('ss_df19.csv')
ss20 = pd.read_csv('ss_df20.csv')
ss21 = pd.read_csv('ss_df21.csv')
ss22 = pd.read_csv('ss_df22.csv')
ss23 = pd.read_csv('ss_df23.csv')
ss24 = pd.read_csv('ss_df24.csv')
ss25 = pd.read_csv('ss_df25.csv')
ss26 = pd.read_csv('ss_df26.csv')

# Combine the files.
# This cell reads combined_selenium.csv and then writes the result back to the same
# file, so without de-duplication every re-run would append the ss_df19..ss_df26 rows
# again. Dropping fully identical rows makes the cell safe to re-run.
selenium_mdl = (
    pd.concat([ss, ss19, ss20, ss21, ss22, ss23, ss24, ss25, ss26], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Write the combined data back. to_csv opens the target in write mode and replaces any
# existing file, so deleting it first is unnecessary.
selenium_mdl.to_csv('combined_selenium.csv', index=False)

## Part 3 - Merging The DataFrames
This section merges the CSV files created in Part 1 and Part 2.

In [163]:
# Load the two intermediate CSV files that the merge combines.
soup = pd.read_csv('combined_soup.csv')
sel = pd.read_csv('combined_selenium.csv')

# Inner join on 'title': only titles present in both files are kept.
# NOTE(review): a title that occurs more than once on either side yields one output
# row per matching pair - confirm titles are unique in both inputs.
merged = soup.merge(sel, how='inner', on='title')

# Remove any previous output, then write the merged frame without the row index.
merged_csv = "combined_mdl.csv"
if os.path.exists(merged_csv):
    os.remove(merged_csv)
merged.to_csv(merged_csv, index=False)

## Polishing
This section puts the finishing touches on the final csv file :D

Currently, the genres and networks are each stored as one long comma-separated string. I am going to split them into one column per genre/network, holding 1 if it is present for that show and 0 otherwise.

In [3]:
# Load the merged dataset and clean a copy, so `mdl` keeps the values exactly as read
# from disk (a plain `test = mdl` would make both names refer to the same frame).
mdl = pd.read_csv('combined_mdl.csv')
test = mdl.copy()

# These are direct column assignments on `test` itself (not on a slice of another
# frame), so no chained-assignment warning needs to be suppressed.

# network: keep the second ': '-separated piece of each value; values without a ': '
# become NaN.
# NOTE(review): presumably this strips a label prefix written by the scraper - confirm.
test['network'] = test['network'].str.split(': ').str[1]

# genres: same prefix removal, plus ", " -> "," so a later split(',') gives clean tokens.
# The first astype(str) lets the .str accessor work whatever dtype was read; the second
# turns values that had no ': ' (NaN after the split) into the literal string 'nan'.
genres_text = test['genres'].astype(str).str.replace(", ", ",")
genres_text = genres_text.str.split(': ').str[1]
test['genres'] = genres_text.astype(str).str.replace(", ", ",")

In [4]:
# Collect all unique genres / networks present in the df before splitting the columns
# into separate indicator columns.

def _unique_tokens(values, placeholders=('', 'na', 'nan')):
    """Return the set of distinct comma-separated tokens found in `values`.

    Each token is stripped of surrounding whitespace, so "A" and " A" count as the
    same entry. Null values are skipped, as are tokens that are only a missing-value
    placeholder: 'na', the literal 'nan' that astype(str) produces for NaN, or the
    empty string left by a stray comma.
    """
    found = set()
    for value in values:
        if pd.isnull(value):
            continue
        for token in str(value).split(','):
            token = token.strip()
            if token not in placeholders:
                found.add(token)
    return found


# The loop variables live inside the helper, so the module-level `genres` and
# `network` lists are no longer overwritten by this cell.
unique_genres = _unique_tokens(test['genres'])
unique_network = _unique_tokens(test['network'])

In [5]:
# Creating columns for each unique genre/network: 1 if the row lists it, 0 otherwise
# (including rows whose value is null or a missing-value placeholder).

_MISSING_LABELS = {'', 'na', 'nan'}


def _split_labels(value):
    """Return the stripped comma-separated labels in `value`, or [] when it is missing."""
    if pd.isnull(value) or str(value).strip() in _MISSING_LABELS:
        return []
    return [label.strip() for label in str(value).split(',')]


def _add_indicator_columns(frame, source_column, names):
    """Add one 0/1 column to `frame` per name, set to 1 where `source_column` lists it.

    Matching is done on whole labels rather than substrings, so a label that happens
    to be contained in another label is not flagged by mistake. Names are stripped and
    de-duplicated first (two names differing only in whitespace would otherwise write
    the same column twice) and processed in sorted order so the column order is the
    same on every run. Placeholder names are skipped.
    """
    label_lists = frame[source_column].apply(_split_labels)
    for name in sorted({n.strip() for n in names} - _MISSING_LABELS):
        # name=name binds the current name into the lambda explicitly.
        frame[name] = label_lists.apply(lambda labels, name=name: int(name in labels))


# Loop variables stay inside the helper, so the module-level `network` list is not
# overwritten here.
_add_indicator_columns(test, 'genres', unique_genres)
_add_indicator_columns(test, 'network', unique_network)

My dataframe is unnecessarily large, so I am dropping genres and networks that don't have more than 15 occurrences.

In [7]:
# Column position from which the per-column totals are computed.
# NOTE(review): assumes every column from this position onward is a numeric indicator
# column - confirm against the merged schema.
first_summed_col = 8
# A column is kept only if its total reaches this value, i.e. more than 15 occurrences.
min_occurrences = 16

print(test.shape)

# Total of each column from `first_summed_col` onward.
sums = test.iloc[:, first_summed_col:].sum(axis=0)
# Columns whose total is below the threshold (less than 16).
columns_to_drop = sums.loc[sums.lt(min_occurrences)].index

# Drop those sparse columns; `test` itself is left unchanged.
test2 = test.drop(columns=columns_to_drop)
print(test2.shape)


(523, 94)
(523, 39)


Writing the official dataframe to a csv file:

In [204]:
# Write the trimmed frame to disk; the row index is left out of the file.
final_csv = "mdl_final.csv"
test2.to_csv(final_csv, index=False)

### Hooray we made it!