In [9]:
# pip install selenium
# pip install webdriver-manager
# pip install fake-useragent

In [13]:
import pandas as pd
import numpy as np
import re
import time
from tqdm import tqdm
from bs4 import BeautifulSoup

# Scrapping
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

# Error Handling
import socket
# import etcd

import urllib3
import urllib.request
from urllib.request import urlopen
from urllib.parse import quote_plus
from urllib.request import urlretrieve
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException, ElementClickInterceptedException

import warnings
warnings.filterwarnings("ignore")

# file writing
import json

In [55]:
class Audio:
    def __init__(self):
        self.missed_links = []
        self.df = pd.DataFrame(columns =  ["title", "audio", "tags"])
    
    def handleMissedLink(self):
        if self.missed_links:
            print(f"{self.missed_links} missed!")
        else:
            print("peeerfect")
    
    def scraping(self):
        options = Options()
        ua = UserAgent()
        userAgent = ua.chrome
        print(userAgent)
        options.add_argument('headless')
        options.add_argument('window-size=1920x1080')
        options.add_argument("disable-gpu")
        options.add_argument(f'user-agent={userAgent}')

        time.sleep(1)
        wd = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
        socket.setdefaulttimeout(30)

        last = False
        page = 1
        error_link = []
        df = pd.DataFrame(columns =  ["title", "audio", "tags"])

        while True: 
            print(f"*Page {page} started")
            url = f"https://mixkit.co/free-sound-effects/ambience/?page={page}"
            try:
                wd.get(url)
            except WebDriverException:
                error_link.append(url)
                wd.close()
                wd.quit()

            html = wd.page_source
            soup = BeautifulSoup(html,'html.parser')

            containers = soup.find_all("div", "item-grid-item")
            for container in containers:
                title = container.find("h2", class_="item-grid-card__title").text.replace("\n","")
                audio = container.find("div", attrs={"data-controller": "audio-player"})['data-audio-player-preview-url-value']
                tags = [el.text for el in container.find_all("a", class_="meta-links__link")]
                df = df.append({"title": title, "audio": audio, "tags": tags}, ignore_index=True)

            if not soup.find('a', class_="pagination__link--next"): # 다음페이지 이동 버튼이 없다면 마지막 페이지
                break
                
            page += 1

        wd.close()
        wd.quit()

        self.df = df
        self.missed_links = error_link
        
    def save_df(self):
        self.df.to_csv("df_ambient_sound.csv")
        
        tags = []
        for i, row in self.df.iterrows():
            tags += row['tags']
        tags = list(set(tags)) # extract unique values
        
        tag_values = []
        for tag in tags:
            tag_values.append({ "value": tag, "label": tag.capitalize() })

        tags_json = json.dumps(tag_values)
        tag_lst_json = json.dumps(tags)
        
        with open("tags.json", "w") as file:
            file.write(tags_json)
            
        with open("tags_lst.json", "w") as file:
            file.write(tag_lst_json)
    

In [56]:
# 스크래핑 시작
audio = Audio()
audio.scraping()

Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36
*Page 1 started
*Page 2 started
*Page 3 started
*Page 4 started
*Page 5 started


In [57]:
# 유실 데이터 확인
audio.handleMissedLink()

peeerfect


In [58]:
# 파일 저장
audio.save_df()

In [44]:
with open("df_ambient_sound.csv") as file:
    df = file
    print(df)

<_io.TextIOWrapper name='df_ambient_sound.csv' mode='r' encoding='UTF-8'>


In [59]:
with open("tags_lst.json") as tags:
    print(tags.readlines()[0])

["Stomp", "Phone Ring", "Alerts", "Horror", "Splash", "Traffic", "Win", "Drop", "Bathroom", "Heartbeat", "Cheer", "Terror", "Chicken", "Children", "Horse", "Volcano", "Movie", "Sports", "Cinematic", "Waves", "Drone", "Beach", "Laugh", "Animals", "Rivers", "Keyboard", "Lightning", "Morning", "Drum", "Nature", "Footsteps", "Public Places", "Supermarket", "Tools", "Happy", "Truck", "Train", "Night", "Rain", "Scary Woods", "Office", "Farm", "Forest", "Voices", "High Tech", "Choir", "Scream", "Crowd", "Applause", "Lifestyle", "Party", "Technology", "Storm", "Jungle", "Warfare", "Halloween", "Fire", "Walk", "Hit", "Human", "Bird", "Asteroid", "Crickets", "Bus", "Wild", "Magic", "Ambience", "Restaurant", "Cow", "Wind", "Bubbles", "Bedroom", "Thunder", "Sea", "Run", "Tram", "Suspense Music", "Water", "Hum", "Insect", "Orchestra", "Airport", "Construction", "Hall", "Bar", "Transport", "Church Bell", "Engine", "Wood", "Car", "Robot", "City", "Safari", "Dog", "War"]


In [47]:
with open("tags.json") as tags:
    print(tags.readlines())

['[{"value": "Stomp", "label": "Stomp"}, {"value": "Phone Ring", "label": "Phone ring"}, {"value": "Alerts", "label": "Alerts"}, {"value": "Horror", "label": "Horror"}, {"value": "Splash", "label": "Splash"}, {"value": "Traffic", "label": "Traffic"}, {"value": "Win", "label": "Win"}, {"value": "Drop", "label": "Drop"}, {"value": "Bathroom", "label": "Bathroom"}, {"value": "Heartbeat", "label": "Heartbeat"}, {"value": "Cheer", "label": "Cheer"}, {"value": "Terror", "label": "Terror"}, {"value": "Chicken", "label": "Chicken"}, {"value": "Children", "label": "Children"}, {"value": "Horse", "label": "Horse"}, {"value": "Volcano", "label": "Volcano"}, {"value": "Movie", "label": "Movie"}, {"value": "Sports", "label": "Sports"}, {"value": "Cinematic", "label": "Cinematic"}, {"value": "Waves", "label": "Waves"}, {"value": "Drone", "label": "Drone"}, {"value": "Beach", "label": "Beach"}, {"value": "Laugh", "label": "Laugh"}, {"value": "Animals", "label": "Animals"}, {"value": "Rivers", "label"