Импорты

In [159]:
import os
import requests
import re
import csv
import pandas as pd

from bs4 import BeautifulSoup
from datetime import datetime
from google.colab import drive

In [160]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Константы

In [161]:
CURR_DIR = '/content/drive/MyDrive/lab2_ntrps'

Функции

Проверка наличия репозиториев датасета

In [162]:
def check_repository(dir: str, name: str) -> None:
    dataset_directory = os.path.join(dir, name)
    if not os.path.exists(dataset_directory):
        os.makedirs(dataset_directory)
    return dataset_directory

Парсер ссылки на картинку

In [163]:
def parser_url(url: str) -> str:
    pattern = r'img_url=([^&]+)&text='
    match = re.search(pattern, url)

    if match:
        img_url_encoded = match.group(1)
        img_url_decoded = img_url_encoded.replace('%2F', '/').replace('%3A', ':')
        return img_url_decoded
    else:
        print('Ошибка: Ссылка после img_url не найдена в URL')

Вычисления необходимого количества страниц для скачивания (1 страница = 30 картинкам)

In [164]:
def calc_pages(num_images: int) -> int:
    return num_images // 30 + (num_images % 30 > 0) if num_images > 30 else 1

Получение HTML тегов

In [165]:
def get_html_tags(mini_images: bool) -> tuple[str, str, str]:
    if mini_images:
        return 'img', 'serp-item__thumb', 'src'
    else:
        return 'a', 'serp-item__link', 'href'

Запись в CSV

In [166]:
def write_csv_file(path: str, data: list) -> None:
    mode = 'w' if not os.path.exists(path) else 'a'
    with open(path, mode, newline='') as csv_file:
      csv_writer = csv.writer(csv_file)
      csv_writer.writerow(data)

Закачка картинок

In [167]:
def download_image(url: str, save_path: str) -> bool:
    try:
        response = requests.get(url, headers={'User-Agent':'Mozilla/5.0'}, stream=True)
        if(response.status_code == 200):
            with open(save_path, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            return True
        else:
            print(f'Ошибка: Не удалось загрузить изображение: {url}')
            return False
    except Exception as e:
        print(f'Ошибка при загрузке изображения: {url}')
        return False

In [168]:
def download_images(query: str, num_images: int, mini_images: bool = False) -> None:
    pages = calc_pages(num_images)
    class_folder = check_repository(CURR_DIR, f'dataset/{query}')

    downloaded_count = 0

    base_url = 'https:'

    csv_file_path = CURR_DIR + '/csv/' + f'{query}_dataset.csv'

    #а вот это чтобы без движков было, грузим странички
    for page in range(0, pages):
        search_url = f'https://yandex.ru/images/search?text={query}&p={page}'

        write_csv_file(csv_file_path, ['date', 'file_name', 'url'])

        #сделал с with для автоматического закрытия соединения
        with requests.get(search_url, headers={'User-Agent':'Mozilla/5.0'}) as response:
            soup = BeautifulSoup(response.text, 'html.parser')

            tag, tag_class, tag_source = get_html_tags(mini_images)

            for a in soup.find_all(tag, class_=tag_class):
                img_url = a[tag_source]
                # получаем полный URL изображения
                if mini_images and not img_url.startswith('http'):
                    img_url = base_url + img_url
                elif img_url.startswith('/images'):
                    img_url = parser_url(img_url)

                #csv_image_filename = image_filename
                image_filename = f'{downloaded_count:04d}.jpg'
                image_path = os.path.join(class_folder, image_filename)
                if(download_image(img_url, image_path)):
                    downloaded_count += 1
                    print(f"Загружено изображений для {query}: {downloaded_count}/{num_images}")

                    write_csv_file(csv_file_path, [datetime.now().strftime('%Y-%m-%d'), image_filename, img_url])

                if(downloaded_count >= num_images):
                    break

Pandas()

In [169]:
def check_dataset(df: pd.DataFrame, required_fields: list) -> bool:
    for field in required_fields:
        if field not in df.columns:
            return False
    return True

In [170]:
def create_dataset_from_files(files: list) -> pd.DataFrame:
    df = pd.DataFrame()
    for file in files:
        data = pd.read_csv(file)
        if check_dataset(data, ['date', 'file_name', 'url']):
            data['date'] = pd.to_datetime(data['date'])
            df = df.append(data, ignore_index=True)
        else:
            print(f"Ошибка: Файл {file} не содержит необходимых полей.")
        #FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
        #df = df.append(data, ignore_index=True) --> менять на df = df.append(data)
        #df = pd.concat(df_list, ignore_index=True) добавлять в данную строчку (просто раскомменитить)
    return df

Разделяет даты и данные

In [171]:
def separation_date_by_data(df: pd.DataFrame) -> None:
    df_date = df['date']
    df_data = df.drop('date', axis=1)

    df_date.to_csv(CURR_DIR + '/X.csv', index=False)
    df_data.to_csv(CURR_DIR + '/Y.csv', index=False)

Разделение по годам

In [172]:
def separation_by_years(df: pd.DataFrame) -> None:
    df['date'] = pd.to_datetime(df['date'])

    for year, group in df.groupby(df['date'].dt.year):
        start_date = group['date'].min().strftime('%Y%m%d')
        end_date = group['date'].max().strftime('%Y%m%d')
        filename = f'{start_date}_{end_date}.csv'
        group.to_csv(CURR_DIR + '/' + filename, index=False)

Разделение по неделям

In [173]:
def separation_by_weeks(df: pd.DataFrame) -> None:
    df['date'] = pd.to_datetime(df['date'])

    #вылазиет предупреждение в мейне вот как советует использовать данный цикл сам пандас (из-за устаревшей версии)
    #FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.
    #for (year, week), group in df.groupby([df['date'].dt.isocalendar().year, df['date'].dt.isocalendar().week]):
    for (year, week), group in df.groupby([df['date'].dt.year, df['date'].dt.week]):
        start_date = group['date'].min().strftime('%Y%m%d')
        end_date = group['date'].max().strftime('%Y%m%d')
        filename = f'{start_date}_{end_date}.csv'
        group.to_csv(CURR_DIR + '/' + filename, index=False)

4 пункт

In [174]:
def get_data(df: pd.DataFrame, date: datetime) -> None | pd.DataFrame:
    data = df[df['date'] == date]
    if data.empty:
        return None
    else:
        return data.drop(columns=['date'])

In [175]:
def next(df: pd.DataFrame, index: int) -> None | tuple[str, str, str]:
    if index < len(df):
        return tuple(df.loc[index, ['date', 'file_name', 'url']])
    return None

Итератор

In [176]:
class DataIterator:
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.counter = 0

    def __iter__(self):
        return self

    def __next__(self):
        if self.counter < len(self.df):
            result = tuple(self.df.loc[self.counter, ['date', 'file_name', 'url']])
            self.counter += 1
            return result
        else:
            raise StopIteration

main()

In [177]:
def main():
    #check_repository(CURR_DIR, 'dataset')
    #check_repository(CURR_DIR, 'csv')
    #download_images('polar bear', 5, False)
    #download_images('brown bear', 5, False)

    #c(CURR_DIR + '/csv/brown bear_dataset.csv')
    #f(CURR_DIR + '/csv/test_dataset.csv')
    #h(CURR_DIR + '/csv/test_weeks_dataset.csv')

    #df = pd.read_csv(CURR_DIR + '/csv/test_weeks_dataset.csv', parse_dates=['date'])
    df = create_dataset_from_files([CURR_DIR + '/csv/test_weeks_dataset.csv'])
    print(get_data(df, datetime(2023, 1, 24)))

    #for index in range(0, len(df)):
    #    print(next(df, index))

    #iterator = DataIterator(df)
    #for item in iterator:
    #    print(item)

In [178]:
if __name__ == '__main__':
    main()

  file_name                                                url
7  0002.jpg  https://wallpapersgood.ru/wallpapers/main2/201...


  df = df.append(data, ignore_index=True)
