<a href="https://colab.research.google.com/github/aiyaszk/Pioneer24/blob/main/Data_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells write the dataset under this path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pytube ffmpeg-python

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: pytube, ffmpeg-python
Successfully installed ffmpeg-python-0.2.0 pytube-15.0.0


In [None]:
!pip install youtube_dl
!pip install moviepy

Collecting youtube_dl
  Downloading youtube_dl-2021.12.17-py2.py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube_dl
Successfully installed youtube_dl-2021.12.17


In [None]:
from bs4 import BeautifulSoup
import requests
from pytube import YouTube
import ffmpeg
import os
import shutil
from google.colab import drive

# Mount Google Drive so the dataset folders below are reachable under /content/drive
drive.mount('/content/drive')

# Global counter, incremented once for each new YouTube embed recorded in `matches`
iframe_counter = 0

# Maps match ID -> YouTube watch URL; membership is used to skip matches already seen
matches = {}

# Create directories for train and test datasets in Google Drive
# (exist_ok=True makes re-running this cell safe)
train_dir = '/content/drive/MyDrive/FRC_Scouter/train'
test_dir = '/content/drive/MyDrive/FRC_Scouter/test'
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Function to download YouTube video
def download_youtube_video(youtube_url, match_id):
    """Download the highest-resolution progressive MP4 stream of a video.

    Args:
        youtube_url: Watch URL of the video.
        match_id: Match identifier; the file is saved as /content/<match_id>.mp4.

    Returns:
        The path returned by the download call, or None when no progressive
        MP4 stream is available or the download fails. Failures are printed,
        not raised.
    """
    try:
        yt = YouTube(youtube_url)
        stream = (
            yt.streams
            .filter(progressive=True, file_extension='mp4')
            .order_by('resolution')
            .desc()
            .first()
        )
        if stream is None:
            print(f"No suitable stream found for URL {youtube_url}")
            return None

        # The target directory goes through output_path and filename carries
        # only the base name, instead of packing a full path into filename.
        video_path = stream.download(output_path="/content", filename=f"{match_id}.mp4")
        print(f"Downloaded video for match {match_id}")
        return video_path
    except Exception as e:
        print(f"Error downloading YouTube video {youtube_url}: {e}")
        return None

# Function to capture screenshots from video
def capture_screenshots(video_path, match_id, interval=5):
    """Save one PNG frame of a video every `interval` seconds.

    Frames are written to /content/<match_id>_<NN>.png, where NN is the
    1-based position of the sample (01 for second 0, 02 for the next, ...).

    Args:
        video_path: Path of the video file to sample.
        match_id: Match identifier used as the file-name prefix.
        interval: Seconds between sampled frames (default 5).

    Returns:
        List of the frame paths that were written, in time order. A frame
        ffmpeg fails on is reported and skipped. If an unexpected error
        occurs, it is printed and the frames captured so far are returned.
    """
    screenshots = []
    try:
        duration = int(float(ffmpeg.probe(video_path)['format']['duration']))
        for frame_no, second in enumerate(range(0, duration, interval), start=1):
            output_path = f"/content/{match_id}_{frame_no:02}.png"
            try:
                (
                    ffmpeg
                    .input(video_path, ss=second)
                    .output(output_path, vframes=1)
                    .overwrite_output()
                    .run(capture_stdout=True, capture_stderr=True)
                )
            except ffmpeg.Error as e:
                # Decode leniently: a strict decode that fails here would abort
                # the whole loop from inside the error handler.
                detail = e.stderr.decode('utf-8', errors='replace') if e.stderr else str(e)
                print(f"ffmpeg error for {video_path} at {second} seconds: {detail}")
                continue
            screenshots.append(output_path)
            print(f"Captured screenshot for match {match_id} at {second} seconds")
    except Exception as e:
        print(f"Error capturing screenshots from {video_path}: {e}")
    return screenshots

# Function to sort screenshots into train and test folders
def sort_screenshots(screenshots):
    """Move screenshots into the test and train folders.

    Items at positions 0, 5, 10, ... of `screenshots` are moved to `test_dir`;
    all others are moved to `train_dir`. Each file keeps its base name and
    every move is reported on stdout.
    """
    for position, shot_path in enumerate(screenshots):
        file_name = os.path.basename(shot_path)
        if position % 5:
            shutil.move(shot_path, os.path.join(train_dir, file_name))
            print(f"Moved {shot_path} to train folder")
            continue
        shutil.move(shot_path, os.path.join(test_dir, file_name))
        print(f"Moved {shot_path} to test folder")

# Function to get the title of a YouTube video
def get_youtube_title(youtube_url):
    """Return the title of a YouTube video, read from the page's <title> tag.

    Args:
        youtube_url: URL of the page to fetch.

    Returns:
        The <title> text with " - YouTube" removed and surrounding whitespace
        stripped, or "Unknown Title" when the request fails, the status is
        not 200, or the page has no <title>. Errors are printed, not raised.
    """
    try:
        # Timeout so one stalled connection cannot hang the whole crawl.
        response = requests.get(youtube_url, timeout=30)
        if response.status_code != 200:
            return "Unknown Title"
        title_tag = BeautifulSoup(response.text, 'html.parser').find("title")
        if title_tag is None:
            return "Unknown Title"
        return title_tag.text.replace(" - YouTube", "").strip()
    except Exception as e:
        print(f"Error fetching YouTube title for URL {youtube_url}: {e}")
        return "Unknown Title"

# Function to process match URLs
def process_match_url(base_url, match_type):
    """Scrape numbered match pages and turn each new match video into screenshots.

    Pages ``<base_url>_<match_type>1``, ``...2``, ... are requested in order
    until one answers 404. For each YouTube embed on a page whose match ID is
    not yet in ``matches``, the watch URL is stored in ``matches`` and
    printed; the video is then downloaded, sampled into screenshots, the
    screenshots are sorted into the train/test folders, and the video file is
    deleted.

    Args:
        base_url: Match URL prefix, without the ``_<type><n>`` suffix.
        match_type: Match-type code placed in the suffix.

    Errors are printed rather than raised. The walk stops after five
    consecutive failed pages so that a network outage cannot loop forever.
    """
    global iframe_counter
    max_consecutive_failures = 5
    consecutive_failures = 0
    i = 1
    while consecutive_failures < max_consecutive_failures:
        match_url = f"{base_url}_{match_type}{i}"
        i += 1
        try:
            # Timeout so a stalled connection cannot block the crawl.
            response = requests.get(match_url, timeout=30)
            if response.status_code == 404:
                break
            # Other HTTP errors count as a failed page instead of being parsed.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            for iframe in soup.find_all('iframe'):
                src = iframe.get('src')
                if not src or 'youtube.com/embed/' not in src:
                    continue
                match_id = match_url.split('/')[-1]
                if match_id in matches:
                    continue
                iframe_counter += 1
                youtube_url = src.replace('/embed/', '/watch?v=')
                matches[match_id] = youtube_url
                print(f"{match_id}: {youtube_url}")

                # Download the YouTube video
                video_path = download_youtube_video(youtube_url, match_id)
                if video_path:
                    try:
                        screenshots = capture_screenshots(video_path, match_id)
                        sort_screenshots(screenshots)
                    finally:
                        # Always free the local disk, even if sorting failed.
                        os.remove(video_path)
                        print(f"Removed downloaded video for match {match_id}")
            consecutive_failures = 0
        except Exception as e:
            consecutive_failures += 1
            print(f"Error processing URL {match_url}: {e}")
    else:
        print(f"Stopping {match_type} crawl for {base_url} after {consecutive_failures} consecutive errors")

# Function to process playoff match URLs (semifinals and finals)
def process_playoff_match_url(base_url, match_type):
    """Scrape playoff match pages and turn each new match video into screenshots.

    URLs are generated with a counter n = 1, 2, 3, ... until a page answers 404:
      * "sf": ``<base_url>_sf<n>m1``
      * "f":  ``<base_url>_f1m<n>``
    Any other ``match_type`` does nothing.

    For each YouTube embed on a page whose match ID is not yet in ``matches``,
    the watch URL is stored in ``matches`` and printed; the video is then
    downloaded, sampled into screenshots, the screenshots are sorted into the
    train/test folders, and the video file is deleted.

    Args:
        base_url: Match URL prefix, without the playoff suffix.
        match_type: "sf" or "f".

    Errors are printed rather than raised. The walk stops after five
    consecutive failed pages so that a network outage cannot loop forever.
    """
    global iframe_counter

    if match_type == "sf":
        url_pattern = "{base}_sf{n}m1"
    elif match_type == "f":
        url_pattern = "{base}_f1m{n}"
    else:
        return

    max_consecutive_failures = 5
    consecutive_failures = 0
    n = 1
    while consecutive_failures < max_consecutive_failures:
        match_url = url_pattern.format(base=base_url, n=n)
        # Advance exactly once per page; a second increment per iteration
        # would skip every even-numbered semifinal.
        n += 1
        try:
            # Timeout so a stalled connection cannot block the crawl.
            response = requests.get(match_url, timeout=30)
            if response.status_code == 404:
                break
            # Other HTTP errors count as a failed page instead of being parsed.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            for iframe in soup.find_all('iframe'):
                src = iframe.get('src')
                if not src or 'youtube.com/embed/' not in src:
                    continue
                match_id = match_url.split('/')[-1]
                if match_id in matches:
                    continue
                iframe_counter += 1
                youtube_url = src.replace('/embed/', '/watch?v=')
                matches[match_id] = youtube_url
                print(f"{match_id}: {youtube_url}")

                # Download the YouTube video
                video_path = download_youtube_video(youtube_url, match_id)
                if video_path:
                    try:
                        screenshots = capture_screenshots(video_path, match_id)
                        sort_screenshots(screenshots)
                    finally:
                        # Always free the local disk, even if sorting failed.
                        os.remove(video_path)
                        print(f"Removed downloaded video for match {match_id}")
            consecutive_failures = 0
        except Exception as e:
            consecutive_failures += 1
            print(f"Error processing URL {match_url}: {e}")
    else:
        print(f"Stopping {match_type} crawl for {base_url} after {consecutive_failures} consecutive errors")

# Function to process event URLs
def process_event_url(event_url):
    """Run the match scrapers for one event.

    The event URL is turned into the match URL prefix by swapping '/event/'
    for '/match/'; qualification ("qm"), semifinal ("sf") and final ("f")
    pages are then processed in that order.
    """
    match_prefix = event_url.replace('/event/', '/match/')
    process_match_url(match_prefix, "qm")
    for playoff_code in ("sf", "f"):
        process_playoff_match_url(match_prefix, playoff_code)

# Main function to start the process
def main(url):
    """Process every event linked from a listing page.

    Links starting with '/' are resolved against ``url``. Each distinct link
    that starts with 'http', contains 'thebluealliance.com/event/' and does
    not end in '/feed' is passed to ``process_event_url`` once, in page order.

    Args:
        url: URL of the page that lists the events.

    Errors are printed rather than raised.
    """
    try:
        # Timeout so a stalled connection cannot hang the cell.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"Failed to retrieve main URL: {url} with status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        # A dict serves as an insertion-ordered set, so an event linked more
        # than once on the page is only processed once.
        event_urls = {}
        for link in soup.find_all('a', href=True):
            sub_url = link['href']
            if sub_url.startswith('/'):
                sub_url = requests.compat.urljoin(url, sub_url)
            if (
                sub_url.startswith('http')
                and 'thebluealliance.com/event/' in sub_url
                and not sub_url.endswith('/feed')
            ):
                event_urls[sub_url] = None

        # Process each event URL in sequence
        for event_url in event_urls:
            process_event_url(event_url)

    except Exception as e:
        print(f"Error fetching main URL {url}: {e}")

# Start the crawl from the events listing page.
# NOTE: calling main() here runs the full crawl (video downloads and screenshot
# capture included) as soon as this cell executes.
url = "https://www.thebluealliance.com/events"
main(url)


[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
Moved /content/2024inmis_sf11m1_17.png to train folder
Moved /content/2024inmis_sf11m1_18.png to train folder
Moved /content/2024inmis_sf11m1_19.png to train folder
Moved /content/2024inmis_sf11m1_20.png to train folder
Moved /content/2024inmis_sf11m1_21.png to test folder
Moved /content/2024inmis_sf11m1_22.png to train folder
Moved /content/2024inmis_sf11m1_23.png to train folder
Moved /content/2024inmis_sf11m1_24.png to train folder
Moved /content/2024inmis_sf11m1_25.png to train folder
Moved /content/2024inmis_sf11m1_26.png to test folder
Moved /content/2024inmis_sf11m1_27.png to train folder
Moved /content/2024inmis_sf11m1_28.png to train folder
Moved /content/2024inmis_sf11m1_29.png to train folder
Moved /content/2024inmis_sf11m1_30.png to train folder
Moved /content/2024inmis_sf11m1_31.png to test folder
Moved /content/2024inmis_sf11m1_32.png to train folder
Moved /content/2024inmis_sf11m1_33.png to train folder
Moved /

In [None]:
#ALL DATA IN THE WANTED FORMAT

from bs4 import BeautifulSoup
import requests

# Global counter, incremented once for each new YouTube embed recorded in `matches`
iframe_counter = 0

# Maps match ID -> YouTube watch URL; membership is used to skip matches already seen
matches = {}

# Function to get the title of a YouTube video
def get_youtube_title(youtube_url):
    """Return the title of a YouTube video, read from the page's <title> tag.

    Args:
        youtube_url: URL of the page to fetch.

    Returns:
        The <title> text with " - YouTube" removed and surrounding whitespace
        stripped, or "Unknown Title" when the request fails, the status is
        not 200, or the page has no <title>. Errors are printed, not raised.
    """
    try:
        # Timeout so one stalled connection cannot hang the whole crawl.
        response = requests.get(youtube_url, timeout=30)
        if response.status_code != 200:
            return "Unknown Title"
        title_tag = BeautifulSoup(response.text, 'html.parser').find("title")
        if title_tag is None:
            return "Unknown Title"
        return title_tag.text.replace(" - YouTube", "").strip()
    except Exception as e:
        print(f"Error fetching YouTube title for URL {youtube_url}: {e}")
        return "Unknown Title"

# Function to process match URLs
def process_match_url(base_url, match_type):
    """Record and print the YouTube link embedded on each numbered match page.

    Pages ``<base_url>_<match_type>1``, ``...2``, ... are requested in order
    until one answers 404. For each YouTube embed on a page whose match ID is
    not yet in ``matches``, the watch URL is stored in ``matches`` and printed
    as ``<match_id>: <url>``.

    Args:
        base_url: Match URL prefix, without the ``_<type><n>`` suffix.
        match_type: Match-type code placed in the suffix.

    Errors are printed rather than raised. The walk stops after five
    consecutive failed pages so that a network outage cannot loop forever.
    """
    global iframe_counter
    max_consecutive_failures = 5
    consecutive_failures = 0
    i = 1
    while consecutive_failures < max_consecutive_failures:
        match_url = f"{base_url}_{match_type}{i}"
        i += 1
        try:
            # Timeout so a stalled connection cannot block the crawl.
            response = requests.get(match_url, timeout=30)
            if response.status_code == 404:
                break
            # Other HTTP errors count as a failed page instead of being parsed.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            for iframe in soup.find_all('iframe'):
                src = iframe.get('src')
                if not src or 'youtube.com/embed/' not in src:
                    continue
                match_id = match_url.split('/')[-1]
                if match_id in matches:
                    continue
                iframe_counter += 1
                youtube_url = src.replace('/embed/', '/watch?v=')
                matches[match_id] = youtube_url
                print(f"{match_id}: {youtube_url}")
            consecutive_failures = 0
        except Exception as e:
            consecutive_failures += 1
            print(f"Error processing URL {match_url}: {e}")
    else:
        print(f"Stopping {match_type} crawl for {base_url} after {consecutive_failures} consecutive errors")

# Function to process playoff match URLs (semifinals and finals)
def process_playoff_match_url(base_url, match_type):
    """Record and print the YouTube link embedded on each playoff match page.

    URLs are generated with a counter n = 1, 2, 3, ... until a page answers 404:
      * "sf": ``<base_url>_sf<n>m1``
      * "f":  ``<base_url>_f1m<n>``
    Any other ``match_type`` does nothing.

    For each YouTube embed on a page whose match ID is not yet in ``matches``,
    the watch URL is stored in ``matches`` and printed as ``<match_id>: <url>``.

    Args:
        base_url: Match URL prefix, without the playoff suffix.
        match_type: "sf" or "f".

    Errors are printed rather than raised. The walk stops after five
    consecutive failed pages so that a network outage cannot loop forever.
    """
    global iframe_counter

    if match_type == "sf":
        url_pattern = "{base}_sf{n}m1"
    elif match_type == "f":
        url_pattern = "{base}_f1m{n}"
    else:
        return

    max_consecutive_failures = 5
    consecutive_failures = 0
    n = 1
    while consecutive_failures < max_consecutive_failures:
        match_url = url_pattern.format(base=base_url, n=n)
        # Advance exactly once per page; a second increment per iteration
        # would skip every even-numbered semifinal.
        n += 1
        try:
            # Timeout so a stalled connection cannot block the crawl.
            response = requests.get(match_url, timeout=30)
            if response.status_code == 404:
                break
            # Other HTTP errors count as a failed page instead of being parsed.
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            for iframe in soup.find_all('iframe'):
                src = iframe.get('src')
                if not src or 'youtube.com/embed/' not in src:
                    continue
                match_id = match_url.split('/')[-1]
                if match_id in matches:
                    continue
                iframe_counter += 1
                youtube_url = src.replace('/embed/', '/watch?v=')
                matches[match_id] = youtube_url
                print(f"{match_id}: {youtube_url}")
            consecutive_failures = 0
        except Exception as e:
            consecutive_failures += 1
            print(f"Error processing URL {match_url}: {e}")
    else:
        print(f"Stopping {match_type} crawl for {base_url} after {consecutive_failures} consecutive errors")

# Function to process event URLs
def process_event_url(event_url):
    """Run the match scrapers for one event.

    The event URL is turned into the match URL prefix by swapping '/event/'
    for '/match/'; qualification ("qm"), semifinal ("sf") and final ("f")
    pages are then processed in that order.
    """
    match_prefix = event_url.replace('/event/', '/match/')
    process_match_url(match_prefix, "qm")
    for playoff_code in ("sf", "f"):
        process_playoff_match_url(match_prefix, playoff_code)

# Main function to start the process
def main(url):
    """Process every event linked from a listing page.

    Links starting with '/' are resolved against ``url``. Each distinct link
    that starts with 'http', contains 'thebluealliance.com/event/' and does
    not end in '/feed' is passed to ``process_event_url`` once, in page order.

    Args:
        url: URL of the page that lists the events.

    Errors are printed rather than raised.
    """
    try:
        # Timeout so a stalled connection cannot hang the cell.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"Failed to retrieve main URL: {url} with status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        # A dict serves as an insertion-ordered set, so an event linked more
        # than once on the page is only processed once.
        event_urls = {}
        for link in soup.find_all('a', href=True):
            sub_url = link['href']
            if sub_url.startswith('/'):
                sub_url = requests.compat.urljoin(url, sub_url)
            if (
                sub_url.startswith('http')
                and 'thebluealliance.com/event/' in sub_url
                and not sub_url.endswith('/feed')
            ):
                event_urls[sub_url] = None

        # Process each event URL in sequence
        for event_url in event_urls:
            process_event_url(event_url)

    except Exception as e:
        print(f"Error fetching main URL {url}: {e}")

# Start the crawl from the events listing page.
# NOTE: calling main() here runs the full crawl as soon as this cell executes.
url = "https://www.thebluealliance.com/events"
main(url)


[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
2024caoc_qm57: https://www.youtube.com/watch?v=3KUN0YX0UCE
2024caoc_qm58: https://www.youtube.com/watch?v=B2FhkA1uGxI
2024caoc_qm59: https://www.youtube.com/watch?v=5PAL0p5_Z0E
2024caoc_qm60: https://www.youtube.com/watch?v=QHkSJrzNQnI
2024caoc_qm61: https://www.youtube.com/watch?v=tTqUxfqu0ew
2024caoc_qm62: https://www.youtube.com/watch?v=6xPiSJfAu0w
2024caoc_qm63: https://www.youtube.com/watch?v=a0Ad7GrV5iw
2024caoc_qm64: https://www.youtube.com/watch?v=Ql7tDVKpgV4
2024caoc_qm66: https://www.youtube.com/watch?v=ylRKpg2sBU4
2024caoc_qm67: https://www.youtube.com/watch?v=amkGpq9SQuM
2024caoc_qm68: https://www.youtube.com/watch?v=VDfPMhMgT6g
2024caoc_qm69: https://www.youtube.com/watch?v=O6AiwSytSiU
2024caoc_qm70: https://www.youtube.com/watch?v=VWaG5jsY3rw
2024caoc_qm71: https://www.youtube.com/watch?v=bW7TCws2IEI
2024caoc_qm72: https://www.youtube.com/watch?v=7-P3d3D1Rhc
2024caoc_sf1m1: https://www.youtube.com/watch?v=wYC5lS

In [None]:
# ALL YOUTUBE VIDEO LINKS FOR THE 2024 SEASON ARE PRINTED. MATCH LINKS ARE NOT DISCOVERED BY CRAWLING; THEY ARE BUILT BY APPENDING _qm1 ETC. TO THE EVENT URL.
# FINAL MATCHES HAD AN ISSUE; ALL PROCESSING LINKS WERE USELESS.

from bs4 import BeautifulSoup
import requests

# Global counter of YouTube embeds found so far; used to number the "Frame N" output lines
iframe_counter = 0

# Function to get the title of a YouTube video
def get_youtube_title(youtube_url):
    """Return the title of a YouTube video, read from the page's <title> tag.

    Args:
        youtube_url: URL of the page to fetch.

    Returns:
        The <title> text with " - YouTube" removed and surrounding whitespace
        stripped, or "Unknown Title" when the request fails, the status is
        not 200, or the page has no <title>. Errors are printed, not raised.
    """
    try:
        # Timeout so one stalled connection cannot hang the whole crawl.
        response = requests.get(youtube_url, timeout=30)
        if response.status_code != 200:
            return "Unknown Title"
        title_tag = BeautifulSoup(response.text, 'html.parser').find("title")
        if title_tag is None:
            return "Unknown Title"
        return title_tag.text.replace(" - YouTube", "").strip()
    except Exception as e:
        print(f"Error fetching YouTube title for URL {youtube_url}: {e}")
        return "Unknown Title"

# Function to process match URLs
def process_match_url(base_url, match_type, max_num):
    """Print the YouTube link embedded on each numbered match page.

    Up to ``max_num`` pages are requested for i = 1, 2, ..., stopping early
    at the first 404. The URL suffix depends on ``match_type``:
      * "qm": ``_qm<i>``
      * "f":  ``_f1m<i>``
      * anything else: ``_<match_type><i>m1``

    Every YouTube embed found increments the global ``iframe_counter`` and is
    printed together with the page URL.

    Args:
        base_url: Match URL prefix, without the match suffix.
        match_type: Match-type code ("qm", "sf" or "f" in this notebook).
        max_num: Highest page number to try.

    Errors are printed rather than raised.
    """
    global iframe_counter
    for i in range(1, max_num + 1):
        if match_type == "qm":
            match_url = f"{base_url}_{match_type}{i}"
        elif match_type == "f":
            # For finals the number after "m" advances; counting the number
            # after "f" instead hits a 404 right after f1m1.
            match_url = f"{base_url}_{match_type}1m{i}"
        else:
            match_url = f"{base_url}_{match_type}{i}m1"
        try:
            # Timeout so a stalled connection cannot block the crawl.
            response = requests.get(match_url, timeout=30)
            if response.status_code == 404:
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            for iframe in soup.find_all('iframe'):
                src = iframe.get('src')
                if src and 'youtube.com/embed/' in src:
                    iframe_counter += 1
                    youtube_url = src.replace('/embed/', '/watch?v=')
                    # The video title is not fetched: it was never printed
                    # and cost one extra HTTP request per video.
                    print(f"Frame {iframe_counter}: {match_url}\nYouTube link: {youtube_url}")

        except Exception as e:
            print(f"Error processing URL {match_url}: {e}")

# Function to process event URLs
def process_event_url(event_url):
    """Print the event URL, then list match videos for each match type.

    The match URL prefix is the event URL with '/event/' swapped for
    '/match/'. Qualification pages are tried up to 100, semifinals up to 20
    and finals up to 10, in that order.
    """
    print(f"Processing Event URL: {event_url}")
    match_prefix = event_url.replace('/event/', '/match/')
    for type_code, page_limit in (("qm", 100), ("sf", 20), ("f", 10)):
        process_match_url(match_prefix, type_code, page_limit)

# Main function to start the process
def main(url):
    """Process every event linked from a listing page.

    Links starting with '/' are resolved against ``url``. Each distinct link
    that starts with 'http', contains 'thebluealliance.com/event/' and does
    not end in '/feed' is passed to ``process_event_url`` once, in page order.

    Args:
        url: URL of the page that lists the events.

    Errors are printed rather than raised.
    """
    try:
        # Timeout so a stalled connection cannot hang the cell.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"Failed to retrieve main URL: {url} with status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        # A dict serves as an insertion-ordered set, so an event linked more
        # than once on the page is only processed once.
        event_urls = {}
        for link in soup.find_all('a', href=True):
            sub_url = link['href']
            if sub_url.startswith('/'):
                sub_url = requests.compat.urljoin(url, sub_url)
            if (
                sub_url.startswith('http')
                and 'thebluealliance.com/event/' in sub_url
                and not sub_url.endswith('/feed')
            ):
                event_urls[sub_url] = None

        # Process each event URL in sequence
        for event_url in event_urls:
            process_event_url(event_url)

    except Exception as e:
        print(f"Error fetching main URL {url}: {e}")

# Start the crawl from the events listing page.
# NOTE: calling main() here runs the full crawl as soon as this cell executes.
url = "https://www.thebluealliance.com/events"
main(url)


[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
Frame 16098: https://www.thebluealliance.com/match/2024wila_f1m1
YouTube link: https://www.youtube.com/watch?v=aBOQVN7HFMI
Processing Event URL: https://www.thebluealliance.com/event/2024caav
Frame 16099: https://www.thebluealliance.com/match/2024caav_qm2
YouTube link: https://www.youtube.com/watch?v=m8Kz4tlmoUQ
Frame 16100: https://www.thebluealliance.com/match/2024caav_qm3
YouTube link: https://www.youtube.com/watch?v=bnqlSGu85IU
Frame 16101: https://www.thebluealliance.com/match/2024caav_qm4
YouTube link: https://www.youtube.com/watch?v=_2u2MiimQoA
Frame 16102: https://www.thebluealliance.com/match/2024caav_qm5
YouTube link: https://www.youtube.com/watch?v=GrTBL9gHzpg
Frame 16103: https://www.thebluealliance.com/match/2024caav_qm6
YouTube link: https://www.youtube.com/watch?v=fLd7Ko0KeAg
Frame 16104: https://www.thebluealliance.com/match/2024caav_qm7
YouTube link: https://www.youtube.com/watch?v=52oyH4lhZXo
Frame 16105: ht

In [None]:
# THIS CODE WAS USED TO ACCESS ALL LINKS ON THE EVENTS PAGE AND THE LINKS INSIDE EACH SPECIFIC EVENT PAGE


from bs4 import BeautifulSoup
import requests

# Recursive function to process URLs
def process_url(url, depth=0, max_depth=2, visited=None):
    """Fetch a page, report its iframes and links, and recurse into its links.

    Only links that start with 'http' and contain 'thebluealliance.com'
    (after resolving links that start with '/') are listed and followed.
    Output is indented by two spaces per depth level.

    Args:
        url: Page to fetch.
        depth: Current recursion depth; nothing is done when it exceeds
            ``max_depth``.
        max_depth: Deepest level that is still fetched.
        visited: Optional dict mapping URL -> shallowest depth at which it
            has been fetched. It is shared across the recursion so a page is
            not fetched again at the same or a deeper level. A new dict is
            created when omitted.

    Errors are printed rather than raised.
    """
    if depth > max_depth:
        return
    if visited is None:
        visited = {}
    visited[url] = depth
    indent = '  ' * depth
    print(f"{indent}Fetching content from URL: {url}")
    try:
        # Timeout so a stalled connection cannot block the crawl.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"{indent}Failed to retrieve URL: {url} with status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')

        # Check for iframes
        iframes = soup.find_all('iframe')
        print(f"{indent}Found iframes: {len(iframes)}")

        # Find all links
        links = soup.find_all('a', href=True)
        print(f"{indent}Found links: {len(links)}")

        for index, link in enumerate(links):
            sub_url = link['href']
            if sub_url.startswith('/'):
                sub_url = requests.compat.urljoin(url, sub_url)
            if sub_url.startswith('http') and 'thebluealliance.com' in sub_url:
                print(f"{indent}Processing sub-URL {index + 1}: {sub_url}")
                # Skip the fetch when this page was already fetched at this
                # level or shallower; a shallower revisit is still allowed so
                # its own links get followed.
                seen_depth = visited.get(sub_url)
                if seen_depth is not None and seen_depth <= depth + 1:
                    continue
                process_url(sub_url, depth + 1, max_depth, visited)

    except Exception as e:
        print(f"Error processing URL {url}: {e}")

# MAIN FUNCTION START
def main(url):
    """Fetch the start page and crawl each thebluealliance.com link on it.

    Links starting with '/' are resolved against ``url``. Every distinct link
    that starts with 'http' and contains 'thebluealliance.com' is passed to
    ``process_url`` at depth 1; a link repeated on the page is crawled once.

    Args:
        url: URL of the start page.

    Errors are printed rather than raised.
    """
    print(f"Fetching content from URL: {url}")
    try:
        # Timeout so a stalled connection cannot hang the cell.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print(f"Failed to retrieve main URL: {url} with status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a', href=True)
        print(f"Found {len(links)} links on the main URL")

        crawled = set()
        for index, link in enumerate(links):
            sub_url = link['href']
            if sub_url.startswith('/'):
                sub_url = requests.compat.urljoin(url, sub_url)
            if sub_url in crawled:
                # Same link repeated on the page; its subtree was already crawled.
                continue
            if sub_url.startswith('http') and 'thebluealliance.com' in sub_url:
                crawled.add(sub_url)
                print(f"Processing Events Link {index + 1}: {sub_url}")
                process_url(sub_url, 1)
    except Exception as e:
        print(f"Error fetching main URL {url}: {e}")

# Start the recursive link crawl from the events listing page.
# NOTE: calling main() here runs the full crawl as soon as this cell executes.
url = "https://www.thebluealliance.com/events"
main(url)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    Processing sub-URL 329: https://www.thebluealliance.com/team/1923/2024
    Processing sub-URL 330: https://www.thebluealliance.com/team/9424/2024
    Processing sub-URL 331: https://www.thebluealliance.com/match/2024njski_qm19
    Processing sub-URL 332: https://www.thebluealliance.com/team/5490/2024
    Processing sub-URL 333: https://www.thebluealliance.com/team/6860/2024
    Processing sub-URL 334: https://www.thebluealliance.com/team/8714/2024
    Processing sub-URL 335: https://www.thebluealliance.com/team/223/2024
    Processing sub-URL 336: https://www.thebluealliance.com/team/1923/2024
    Processing sub-URL 337: https://www.thebluealliance.com/team/9424/2024
    Processing sub-URL 338: https://www.thebluealliance.com/match/2024njski_qm20
    Processing sub-URL 339: https://www.thebluealliance.com/match/2024njski_qm20
    Processing sub-URL 340: https://www.thebluealliance.com/team/4750/2024
    Processing sub