In [1]:
import os
import re
import json
import gzip
import time
import calendar
from datetime import datetime
import zipfile
import requests
import numpy as np
import pandas as pd
from datetime import date
from decimal import Decimal
CACHE = {} # global cache; do not update it while scraping is in progress!
SETS = './sets/' # directory that holds the dataset files

In [2]:
def default_serializer(obj):
    """Fallback encoder for ``json.dumps``: render date objects in ISO format.

    Args:
        obj: the value that ``json`` could not encode on its own.

    Returns:
        ``obj.isoformat()`` when ``obj`` is a ``date`` instance (this includes
        subclasses such as ``datetime``).

    Raises:
        TypeError: for every other type.
    """
    if not isinstance(obj, date):
        raise TypeError(f'Object of type {obj.__class__.__name__} is not JSON serializable')
    return obj.isoformat()
    

def load_cache(filename='cache.gz'):
    """Fill the global ``CACHE`` from a gzip-compressed JSON file.

    Nothing happens when ``filename`` does not exist or when ``CACHE`` already
    holds entries, so a populated cache is never overwritten.

    Args:
        filename: path of the gzip file to read.
    """
    global CACHE
    if not os.path.exists(filename):
        return
    if len(CACHE) != 0:
        return
    with gzip.open(filename, 'rb') as gzip_ref:
        CACHE = json.load(gzip_ref)
    print(f'Загружено {len(CACHE)} записей из {filename} в кэш')


def save_cache(filename='cache.gz'):
    """Write the global ``CACHE`` to ``filename`` as gzip-compressed UTF-8 JSON.

    Args:
        filename: path of the gzip file to (over)write.

    Raises:
        TypeError: if ``CACHE`` holds a value that neither ``json`` nor
            ``default_serializer`` can encode. The existing file is left
            untouched in that case.
    """
    # Serialize before opening: 'wb' truncates the file immediately, so an
    # encoding error raised afterwards would wipe the previously saved cache.
    json_data = json.dumps(CACHE, ensure_ascii=False, default=default_serializer)
    with gzip.open(filename, 'wb') as gzip_file:
        gzip_file.write(json_data.encode('utf-8'))
    print(f'Сохранено {len(CACHE)} записей в {filename}')
    
    
def load_gzon(filename):
    """Read ``SETS + filename + '.gz'`` as gzip-compressed JSON.

    Args:
        filename: base name of the set, without directory or ``.gz`` suffix.

    Returns:
        The decoded JSON value, or an empty list when the file is absent.
    """
    path = SETS + filename + '.gz'
    if not os.path.exists(path):
        return []
    with gzip.open(path, 'rb') as gzip_ref:
        return json.load(gzip_ref)
    

def save_gzon(filename):
    """Save the notebook-level variable named ``filename`` to ``SETS + filename + '.gz'``.

    The variable is looked up in ``globals()`` and written as gzip-compressed
    UTF-8 JSON. When no such variable exists (or it is ``None``) a message is
    printed and nothing is written.

    Args:
        filename: name of the global variable; also used as the file's base name.
    """
    data = globals().get(filename)
    if data is None:
        print(f'Переменная {filename} не найдена')
        return
    json_data = json.dumps(data, ensure_ascii=False, default=default_serializer)
    with gzip.open(SETS + filename + '.gz', 'wb') as gzip_file:
        gzip_file.write(json_data.encode('utf-8'))
    # Report how many records were saved. The previous message printed
    # len(json_data), i.e. the number of characters in the JSON text.
    record_count = len(data) if hasattr(data, '__len__') else 1
    print(f'Сохранено {record_count} записей в {filename}.gz')
        
     
def load_dataset(filename):
    """Load ``SETS + filename + '.gz'`` (gzip-compressed JSON) into a DataFrame.

    Args:
        filename: base name of the dataset, without directory or ``.gz`` suffix.

    Returns:
        A ``pandas.DataFrame`` built from the decoded JSON, or ``None`` when
        the file does not exist.
    """
    path = SETS + filename + '.gz'
    if not os.path.exists(path):
        return None
    with gzip.open(path, 'rb') as gzip_ref:
        records = json.load(gzip_ref)
        return pd.DataFrame(records)
        
    
def save_dataset(filename):
    """Save the global DataFrame named ``filename`` to ``SETS + filename + '.gz'``.

    The frame is looked up in ``globals()``, converted to a list of row
    records and written as gzip-compressed UTF-8 JSON. When no such variable
    exists (or it is ``None``) a message is printed and nothing is written,
    matching ``save_gzon``.

    Args:
        filename: name of the global variable; also used as the file's base name.
    """
    data = globals().get(filename)
    if data is None:
        # Previously this case returned silently, so a mistyped name looked
        # like a successful save.
        print(f'Переменная {filename} не найдена')
        return
    records = data.to_dict(orient='records')
    json_data = json.dumps(records, ensure_ascii=False, default=default_serializer)
    with gzip.open(SETS + filename + '.gz', 'wb') as gzip_file:
        gzip_file.write(json_data.encode('utf-8'))
    print(f'Сохранено {len(records)} записей в {filename}.gz')

In [3]:
# Load the dataset first; the display options below only affect how frames
# are rendered (no column-count or column-width truncation).
ecology = load_dataset('ecology')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [4]:
# Print the column list with non-null counts and dtypes of the loaded frame.
ecology.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189165 entries, 0 to 189164
Data columns (total 23 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   start            189165 non-null  object
 1   end              189165 non-null  object
 2   air              291 non-null     object
 3   mean             903 non-null     object
 4   noise            3418 non-null    object
 5   dirt             124 non-null     object
 6   digging          6740 non-null    object
 7   capital          3490 non-null    object
 8   house_lines      31582 non-null   object
 9   clinics          280 non-null     object
 10  roads            3842 non-null    object
 11  parks            37 non-null      object
 12  transports       108 non-null     object
 13  city_places      3093 non-null    object
 14  live_area        103108 non-null  object
 15  streets          24 non-null      object
 16  trashes          26820 non-null   object
 17  factories 

In [5]:
# Look for records in which every field after the first two columns is empty.
ecology.loc[ecology.iloc[:, 2:].isna().all(axis=1)]

Unnamed: 0,start,end,air,mean,noise,dirt,digging,capital,house_lines,clinics,roads,parks,transports,city_places,live_area,streets,trashes,factories,markets,autostations,gasstations,bad_gasstations,stroi


In [6]:
# Look for records in which every field after the first two columns, once
# converted to a string, is shorter than 4 characters (for example '[]').
# Per-column `.str.len()` replaces the deprecated `DataFrame.applymap(len)`.
ecology[
    (ecology[ecology.columns[2:]].astype(str).apply(lambda column: column.str.len()) < 4).all(axis=1)
]

  ecology[(ecology[ecology.columns[2:]].astype(str).applymap(len) < 4).all(axis=1)]


Unnamed: 0,start,end,air,mean,noise,dirt,digging,capital,house_lines,clinics,roads,parks,transports,city_places,live_area,streets,trashes,factories,markets,autostations,gasstations,bad_gasstations,stroi
56730,26.05.2017,25.08.2017,,,,,,,,,,,,,[],,,,,,,,
96152,17.06.2019,11.07.2019,,,,,,,,,,,,,[],,,,,,,,
111588,15.04.2021,03.05.2021,,,,,,,,,,,,,[],,,,,,,,


In [7]:
# Drop the records in which every field after the first two columns is shorter
# than 4 characters once converted to a string.
# Per-column `.str.len()` replaces the deprecated `DataFrame.applymap(len)`.
# `.copy()` makes the result an independent frame, so the column assignments
# in later cells do not write into a slice of the original.
ecology = ecology[
    ~(ecology[ecology.columns[2:]].astype(str).apply(lambda column: column.str.len()) < 4).all(axis=1)
].copy()

  ecology = ecology[~(ecology[ecology.columns[2:]].astype(str).applymap(len) < 4).all(axis=1)]


In [8]:
# Show the rows whose 'start' value is shorter than 5 characters.
ecology.loc[ecology['start'].str.len().lt(5)]

Unnamed: 0,start,end,air,mean,noise,dirt,digging,capital,house_lines,clinics,roads,parks,transports,city_places,live_area,streets,trashes,factories,markets,autostations,gasstations,bad_gasstations,stroi
4612,2016,2016,,,,"{'coordinates': [37.663743194, 55.762913736], 'type': 'Point'}",,,,,,,,,,,,,,,,,
4613,2016,2016,,,,"{'coordinates': [37.680028763, 55.770580184], 'type': 'Point'}",,,,,,,,,,,,,,,,,
4614,2016,2016,,,,"{'coordinates': [37.629370991, 55.745521064], 'type': 'Point'}",,,,,,,,,,,,,,,,,
4615,2016,2016,,,,"{'coordinates': [37.63306719, 55.734627231], 'type': 'Point'}",,,,,,,,,,,,,,,,,
4616,2016,2016,,,,"{'coordinates': [37.626954395, 55.765694099], 'type': 'Point'}",,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4731,2023,2023,,,,"{'coordinates': [37.643729417, 55.783498251], 'type': 'Point'}",,,,,,,,,,,,,,,,,
4732,2023,2023,,,,"{'coordinates': [37.55605141, 55.719241571], 'type': 'Point'}",,,,,,,,,,,,,,,,,
4733,2023,2023,,,,"{'coordinates': [37.664467302, 55.725918447], 'type': 'Point'}",,,,,,,,,,,,,,,,,
4734,2023,2023,,,,"{'coordinates': [37.716451279, 55.733854245], 'type': 'Point'}",,,,,,,,,,,,,,,,,


In [9]:
# 'start' values shorter than 5 characters get the prefix "01.01."
# (presumably year-only values become 1 January of that year — confirm).
# Series.map works on the single column instead of materialising every row
# with DataFrame.apply(axis=1); the resulting values are the same.
ecology['start'] = ecology['start'].map(
    lambda value: "01.01." + value if len(value) < 5 else value
)

In [10]:
# 'start' values shorter than 8 characters get the prefix "01."
# (presumably month.year values become the first day of the month — confirm).
# Series.map works on the single column instead of materialising every row
# with DataFrame.apply(axis=1); the resulting values are the same.
ecology['start'] = ecology['start'].map(
    lambda value: "01." + value if len(value) < 8 else value
)

In [11]:
# 'end' values shorter than 5 characters get the prefix "31.12."
# (presumably year-only values become 31 December of that year — confirm).
# Series.map works on the single column instead of materialising every row
# with DataFrame.apply(axis=1); the resulting values are the same.
ecology['end'] = ecology['end'].map(
    lambda value: "31.12." + value if len(value) < 5 else value
)

In [12]:
# 'end' values shorter than 8 characters get the prefix "28."
# NOTE(review): day 28 exists in every month, but it is not the last day of
# most months — confirm that ending these periods on the 28th is intended.
# Series.map works on the single column instead of materialising every row
# with DataFrame.apply(axis=1); the resulting values are the same.
ecology['end'] = ecology['end'].map(
    lambda value: "28." + value if len(value) < 8 else value
)

In [13]:
# Parse 'start' from day.month.year strings into datetime values.
ecology['start'] = pd.to_datetime(ecology['start'], format='%d.%m.%Y')

In [14]:
# Parse 'end' in a single vectorised pass. Values that are not valid calendar
# dates come back as NaT and are repaired one by one below; this replaces a
# per-row iterrows loop that wrote each value back with a scalar .loc.
end_raw = ecology['end']
end_parsed = pd.to_datetime(end_raw, format='%d.%m.%Y', errors='coerce')
for index in end_raw.index[end_parsed.isna() & end_raw.notna()]:
    print(f"Ошибка на строке {index}: {ecology.loc[index]}")
    day, month, year = (int(part) for part in end_raw.loc[index].split('.'))
    # Clamp the day to the real length of the month. The previous
    # min(monthrange(...)[1], 28) always evaluated to 28 because no month is
    # shorter than 28 days, so e.g. 31.06 was moved to 28.06 instead of 30.06.
    last_day = calendar.monthrange(year, month)[1]
    end_parsed.loc[index] = pd.Timestamp(year=year, month=month, day=max(1, min(day, last_day)))
# Assign the whole column at once so it gets a datetime dtype instead of
# staying an object column filled with individual timestamps.
ecology['end'] = end_parsed

In [15]:
# Load the 'exams' dataset with the notebook's load_dataset helper.
exams = load_dataset('exams')

In [16]:
# Build 'start' as 1 March of the part of YEAR that follows the '-'
# (presumably YEAR looks like '2016-2017' — confirm against the dataset).
# Series.map reads only the YEAR column instead of materialising every row
# with DataFrame.apply(axis=1); the resulting values are the same.
exams['start'] = exams['YEAR'].map(lambda year: '01.03.' + year.split('-')[1])

In [17]:
# Build 'end' as 30 September of the part of YEAR that follows the '-'
# (presumably YEAR looks like '2016-2017' — confirm against the dataset).
# Series.map reads only the YEAR column instead of materialising every row
# with DataFrame.apply(axis=1); the resulting values are the same.
exams['end'] = exams['YEAR'].map(lambda year: '30.09.' + year.split('-')[1])

In [18]:
# Count how often each distinct start / end date occurs in the exams frame.
start_list, end_list = (
    exams[column].value_counts() for column in ('start', 'end')
)

In [30]:
# Take each distinct (start, end) pair from the same rows of `exams`.
# Zipping the indexes of two separate value_counts() results only pairs the
# dates correctly while both happen to come out in the same order.
exams_ranges = list(
    exams[['start', 'end']].drop_duplicates().itertuples(index=False, name=None)
)
# Collect every calendar day covered by any of the exam periods.
exams_set = set()
for range_start, range_end in exams_ranges:
    start_date = datetime.strptime(range_start, '%d.%m.%Y')
    end_date = datetime.strptime(range_end, '%d.%m.%Y')
    date_range = pd.date_range(start=start_date, end=end_date)
    exams_set |= set(date_range)

len(exams_set)

1712

In [20]:
# Report whether the two sets share at least one element.
# NOTE(review): set1 and set2 are not defined in the cells visible here —
# confirm they exist when the notebook is run on a fresh kernel.
print("Множества пересекаются" if set1.intersection(set2) else "Множества не пересекаются")

[('01.03.2017', '30.09.2017'),
 ('01.03.2018', '30.09.2018'),
 ('01.03.2019', '30.09.2019'),
 ('01.03.2022', '30.09.2022'),
 ('01.03.2021', '30.09.2021'),
 ('01.03.2020', '30.09.2020'),
 ('01.03.2023', '30.09.2023'),
 ('01.03.2024', '30.09.2024')]

In [21]:
# Daily timestamps from 1 January 2017 through 1 January 2025, both included;
# the cell displays how many distinct days that is.
date_list1 = pd.date_range(
    end='01.01.2025',
    start='01.01.2017',
)
len(set(date_list1))

2923