### Установка libmagic (системная библиотека, необходимая для работы модуля `magic`)

In [None]:
!brew install libmagic

In [None]:
import os
import magic
import chardet
import lm_dataformat as lmd
from tqdm.notebook import tqdm
from joblib import Parallel, delayed


# Shared python-magic detector; with mime=True its results are MIME type
# strings, which the archive loop checks for a 'text' prefix.
mime = magic.Magic(mime=True)

In [None]:
# Root directory that is walked (and pruned in place) for source files.
source_path = 'github-data'
# Name handed to lmd.Archive as the output location of the packed dataset.
dataset = 'github-data-lm'

In [None]:
def simple_filter(content):
    """Return True when *content* looks like ordinary text worth keeping.

    The content is rejected (False) when any of the following holds:

    * it is empty (nothing to keep, and the ratios below are undefined);
    * more than 80% of its characters are digits;
    * its length divided by its newline count exceeds 200, i.e. the lines
      are very long on average.

    Note that non-empty content without a single newline is always
    rejected by the last rule, because the denominator is then only the
    0.001 epsilon.

    Args:
        content: Decoded text of one file.

    Returns:
        bool: True to keep the content, False to drop it.
    """
    if not content:
        # Guard against ZeroDivisionError on empty input.
        return False

    total = len(content)

    num_digits = sum(1 for ch in content if ch.isdigit())
    if num_digits / total > 0.8:
        return False

    # The epsilon keeps the division defined when there are no newlines.
    num_newlines = content.count('\n')
    if total / (num_newlines + .001) > 200:
        return False

    return True

In [None]:
def remove_not_py(dirname: str, filenames: list[str]) -> None:
    """Delete every entry of *filenames* in *dirname* that is not a ``.py`` file.

    Args:
        dirname: Directory that contains the files.
        filenames: Bare file names (no directory part) located in *dirname*.
    """
    unwanted = [name for name in filenames if not name.endswith('.py')]
    for name in unwanted:
        os.remove(os.path.join(dirname, name))

In [None]:
# Prune the tree in place: one remove_not_py job per directory yielded by
# os.walk, spread over 4 joblib workers, with tqdm tracking the walk.
Parallel(n_jobs=4)(
    delayed(remove_not_py)(root, names)
    for root, _subdirs, names in tqdm(os.walk(source_path))
)

In [None]:
# Pack every usable .py file found under source_path into an lm_dataformat
# archive, storing the decoded text together with its file name and MIME type.
ar = lmd.Archive(dataset)
for dirname, _, filenames in tqdm(os.walk(source_path)):
    for filename in filenames:
        if not filename.endswith('.py'):
            continue

        file = os.path.join(dirname, filename)

        try:
            _type = mime.from_file(file)
        except FileNotFoundError:
            # The walk listed a name that cannot be opened (presumably a
            # dangling symlink or a concurrently removed file) - skip it.
            continue

        # Only files that libmagic classifies as some kind of text are kept.
        if not _type.startswith('text'):
            continue

        with open(file, 'rb') as f:
            buf = f.read()

        try:
            buf = buf.decode('UTF-8')
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to chardet's best guess.
            enc = chardet.detect(buf)

            if enc['encoding'] is None:
                continue

            try:
                buf = buf.decode(enc['encoding'])
            except (UnicodeDecodeError, LookupError):
                # LookupError: chardet named a codec this Python build does
                # not provide; treat it like any other undecodable file
                # instead of aborting the whole run.
                continue

        if not simple_filter(buf):
            continue

        meta = {'file_name': filename, 'mime_type': _type}
        ar.add_data(buf, meta=meta)
        # NOTE(review): commit() runs after every single document; in
        # lm_dataformat this presumably finalizes one chunk file per source
        # file - confirm this is intended rather than committing once at the
        # end or every N documents.
        ar.commit()