In [1]:
import os
import re
import json
import gzip
import time
import zipfile
import requests
import numpy as np
import pandas as pd
from datetime import date
from decimal import Decimal
CACHE = {} # global cache; do not update it while scraping!
SETS = './sets/' # folder holding the data-set files

In [2]:
def default_serializer(obj):
    """JSON ``default`` hook that renders dates in ISO format.

    Args:
        obj: A value the ``json`` encoder could not serialize by itself.

    Returns:
        ``obj.isoformat()`` when ``obj`` is a ``date`` instance
        (``datetime`` objects qualify too, being a ``date`` subclass).

    Raises:
        TypeError: If ``obj`` is not a ``date`` instance.
    """
    if not isinstance(obj, date):
        raise TypeError(f'Object of type {obj.__class__.__name__} is not JSON serializable')
    return obj.isoformat()
    

def load_cache(filename='cache.gz'):
    """Fill the global ``CACHE`` from a gzip-compressed JSON file.

    Nothing happens when ``filename`` does not exist or when ``CACHE``
    already holds entries, so an in-memory cache is never replaced.

    Args:
        filename: Path of the gzip-compressed JSON cache file.
    """
    global CACHE
    if not os.path.exists(filename):
        return
    if len(CACHE) != 0:
        return
    with gzip.open(filename, 'rb') as packed:
        CACHE = json.load(packed)
    print(f'Загружено {len(CACHE)} записей из {filename} в кэш')


def save_cache(filename='cache.gz'):
    """Write the global ``CACHE`` to ``filename`` as gzip-compressed UTF-8 JSON.

    The cache is serialized *before* the target file is opened: opening with
    ``'wb'`` truncates the file, so serializing afterwards would wipe the
    previous cache file whenever ``json.dumps`` fails.

    Args:
        filename: Path of the gzip file to (over)write.

    Raises:
        TypeError: If ``CACHE`` holds a value that neither ``json`` nor
            ``default_serializer`` can encode; the existing file is left
            untouched in that case.
    """
    global CACHE
    json_data = json.dumps(CACHE, ensure_ascii=False, default=default_serializer)
    with gzip.open(filename, 'wb') as gzip_file:
        gzip_file.write(json_data.encode('utf-8'))
    print(f'Сохранено {len(CACHE)} записей в {filename}')
    
    
def load_gzon(filename):
    """Read the gzip-compressed JSON file ``SETS + filename + '.gz'``.

    Args:
        filename: Base name of the set, without folder or ``.gz`` suffix.

    Returns:
        The decoded JSON value, or an empty list when the file is missing.
    """
    path = SETS + filename + '.gz'
    if not os.path.exists(path):
        return []
    with gzip.open(path, 'rb') as packed:
        return json.load(packed)
    

def save_gzon(filename):
    """Save the global variable named ``filename`` as gzip-compressed JSON.

    The value is looked up by name in the module globals and written to
    ``SETS + filename + '.gz'`` as UTF-8 JSON.

    Args:
        filename: Name of the global variable, also used as the base name of
            the output file.

    Raises:
        TypeError: If the value contains something that neither ``json`` nor
            ``default_serializer`` can encode.
    """
    data = globals().get(filename)
    if data is None:
        print(f'Переменная {filename} не найдена')
        return
    json_data = json.dumps(data, ensure_ascii=False, default=default_serializer)
    with gzip.open(SETS + filename + '.gz', 'wb') as gzip_file:
        gzip_file.write(json_data.encode('utf-8'))
    # Report how many items were saved, not the length of the JSON text;
    # a value without a length (e.g. a bare number) counts as one record.
    count = len(data) if hasattr(data, '__len__') else 1
    print(f'Сохранено {count} записей в {filename}.gz')
        
     
def load_dataset(filename):
    """Load ``SETS + filename + '.gz'`` (gzip-compressed JSON) into a DataFrame.

    Args:
        filename: Base name of the set, without folder or ``.gz`` suffix.

    Returns:
        A ``pandas.DataFrame`` built from the decoded JSON, or ``None`` when
        the file does not exist.
    """
    path = SETS + filename + '.gz'
    if not os.path.exists(path):
        return None
    with gzip.open(path, 'rb') as packed:
        records = json.load(packed)
    return pd.DataFrame(records)
        
    
def save_dataset(filename):
    """Save the global DataFrame named ``filename`` as gzip-compressed JSON.

    The frame is looked up by name in the module globals, converted to a list
    of row dicts and written to ``SETS + filename + '.gz'`` as UTF-8 JSON.

    Args:
        filename: Name of the global variable holding the DataFrame, also
            used as the base name of the output file.

    Raises:
        TypeError: If a cell value cannot be encoded by ``json`` or
            ``default_serializer``.
    """
    frame = globals().get(filename)
    if frame is None:
        # Say so explicitly instead of silently doing nothing, so that a
        # mistyped variable name is not mistaken for a successful save.
        print(f'Переменная {filename} не найдена')
        return
    records = frame.to_dict(orient='records')
    json_data = json.dumps(records, ensure_ascii=False, default=default_serializer)
    with gzip.open(SETS + filename + '.gz', 'wb') as gzip_file:
        gzip_file.write(json_data.encode('utf-8'))
    print(f'Сохранено {len(records)} записей в {filename}.gz')

In [3]:
# Lift pandas' display limits on column width and column count.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
# Load the 'ecology' set from the SETS folder; load_dataset yields None if the file is missing.
ecology = load_dataset('ecology')

In [5]:
# Per-column overview of the loaded frame: non-null counts and dtypes.
ecology.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189214 entries, 0 to 189213
Data columns (total 23 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   start            189214 non-null  object
 1   end              189214 non-null  object
 2   air              291 non-null     object
 3   mean             903 non-null     object
 4   noise            3418 non-null    object
 5   dirt             124 non-null     object
 6   digging          6740 non-null    object
 7   capital          3490 non-null    object
 8   house_lines      31582 non-null   object
 9   clinics          280 non-null     object
 10  roads            3842 non-null    object
 11  parks            37 non-null      object
 12  transports       108 non-null     object
 13  city_places      3093 non-null    object
 14  live_area        103108 non-null  object
 15  streets          24 non-null      object
 16  trashes          26820 non-null   object
 17  factories 