In [None]:
import math
import numpy as np
import csv
import os
import matplotlib
from tqdm import tqdm

# Старый метод генерации

In [None]:
def randomize(rMin, rMax, fiMin, fiMax, gMin, gMax):
    """Draw one random object placement.

    Three independent uniform samples are taken, in this order: a radius from
    [rMin, rMax), an angle from [fiMin, fiMax) and a size from [gMin, gMax).
    The radius/angle pair is also converted to Cartesian coordinates.

    Returns:
        Tuple ``(x, y, g, fi, r)``: Cartesian coordinates, size, angle, radius.
    """
    radius = np.random.uniform(rMin, rMax)
    angle = np.random.uniform(fiMin, fiMax)
    size = np.random.uniform(gMin, gMax)
    # Polar -> Cartesian conversion of the sampled position.
    return radius * math.cos(angle), radius * math.sin(angle), size, angle, radius


def valid(xObj, yObj, gObj, h, l):
    """Tell whether an object placement satisfies all four geometric inequalities.

    With ``t = sqrt(4*h**2 - l**2) * yObj`` the placement is accepted only if
        t + l*xObj > l*h,
        t - l*xObj > l*h,
        |-l*xObj - t + l*h| > 2*h*gObj,
        | l*xObj - t + l*h| > 2*h*gObj.

    Raises:
        ValueError: from ``math.sqrt`` when ``4*h**2 < l**2``.
    """
    lx = l * xObj
    lh = l * h
    t = math.sqrt(4 * h ** 2 - l ** 2) * yObj
    size_limit = 2 * h * gObj
    return (
        t + lx > lh
        and t - lx > lh
        and abs(-lx - t + lh) > size_limit
        and abs(lx - t + lh) > size_limit
    )


def F_a(x, y, r, h):
    """Return an angle in [0, 2*pi) built from ``asin(|y| / r)``.

    The quadrant is chosen from the signs of ``y`` and ``x + h``. When ``r`` is
    ``sqrt((x + h)**2 + y**2)`` (which is what ``count`` passes), the result is
    the polar angle of the point ``(x + h, y)``.

    Returns ``None`` if none of the sign tests holds (e.g. NaN input).
    """
    shifted_x = x + h
    if y >= 0:
        # Upper half-plane: first or second quadrant.
        if shifted_x >= 0:
            return math.asin(y / r)
        if shifted_x < 0:
            return math.pi - math.asin(y / r)
    elif y < 0:
        # Lower half-plane: third or fourth quadrant.
        if shifted_x < 0:
            return math.pi + math.asin(abs(y) / r)
        if shifted_x >= 0:
            return 2 * math.pi - math.asin(abs(y) / r)


def F_b(x, y, r, h):
    """Return an angle in [0, 2*pi) built from ``asin(|y| / r)``.

    The quadrant is chosen from the signs of ``y`` and ``x - h``. When ``r`` is
    ``sqrt((x - h)**2 + y**2)`` (which is what ``count`` passes), the result is
    the polar angle of the point ``(x - h, y)``.

    Returns ``None`` if none of the sign tests holds (e.g. NaN input).
    """
    shifted_x = x - h
    if y >= 0:
        # Upper half-plane: first or second quadrant.
        if shifted_x >= 0:
            return math.asin(y / r)
        if shifted_x < 0:
            return math.pi - math.asin(y / r)
    elif y < 0:
        # Lower half-plane: third or fourth quadrant.
        if shifted_x < 0:
            return math.pi + math.asin(abs(y) / r)
        if shifted_x >= 0:
            return 2 * math.pi - math.asin(abs(y) / r)


def count(xObj, yObj, h, gObj, m):
    """Compute the first and last sector index covered by the object for A and B.

    For each of the two reference points ``(-h, 0)`` (A) and ``(h, 0)`` (B) the
    distance to ``(xObj, yObj)`` and the angle given by ``F_a`` / ``F_b`` are
    computed. The angular interval ``angle ± asin(gObj / distance)`` is then
    mapped onto ``m`` equal sectors of the full circle and floored.

    Returns:
        Tuple ``(L_a, R_a, L_b, R_b)`` of integer sector indices.
    """
    sectors_per_radian = m / (2 * math.pi)

    dist_a = math.sqrt((xObj + h) ** 2 + yObj ** 2)
    angle_a = F_a(xObj, yObj, dist_a, h)
    dist_b = math.sqrt((xObj - h) ** 2 + yObj ** 2)
    angle_b = F_b(xObj, yObj, dist_b, h)

    def _span(centre, distance):
        # Half of the angular width under which the object is seen.
        half_width = math.asin(gObj / distance)
        return (math.floor(sectors_per_radian * (centre - half_width)),
                math.floor(sectors_per_radian * (centre + half_width)))

    left_a, right_a = _span(angle_a, dist_a)
    left_b, right_b = _span(angle_b, dist_b)
    return left_a, right_a, left_b, right_b

In [None]:
def generate_dataset(h, l, m, n, rMin, rMax, fiMin, fiMax, gMin, gMax, task_id):
    """Generate ``n`` samples with the old rejection-sampling method and save them.

    Placements are drawn with ``randomize`` until one passes ``valid`` and
    covers more than one sector for A (``L_a != R_a``). Each accepted placement
    becomes two 0/1 lists of length ``m`` (``beta_A`` and ``beta_B``) plus its
    parameters. The result is written by ``save_to_file`` to
    ``dataset_<task_id>.csv`` as rows of ``beta_A + beta_B + [r, fi, g]``.

    Args:
        h, l: Geometry parameters forwarded to ``valid`` / ``count``.
        m: Number of sectors in each mask.
        n: Number of samples to generate.
        rMin, rMax, fiMin, fiMax, gMin, gMax: Sampling ranges for ``randomize``.
        task_id: Suffix used in the output file name.
    """
    M = []
    while len(M) < n:
        # Rejection sampling: redraw until the placement is usable.
        while True:
            xObj, yObj, gObj, fiObj, rObj = randomize(rMin, rMax, fiMin, fiMax, gMin, gMax)
            if not valid(xObj, yObj, gObj, h, l):
                continue
            L_a, R_a, L_b, R_b = count(xObj, yObj, h, gObj, m)
            if L_a != R_a:
                break

        beta_A = [0] * m
        beta_B = [0] * m
        # The m sectors cover the full circle, so indices are taken modulo m:
        # identical to the old negative-index wrap-around, and an index >= m no
        # longer raises IndexError.
        for j in range(L_a, R_a + 1):
            beta_A[j % m] = 1
        for j in range(L_b, R_b + 1):
            beta_B[j % m] = 1
        M.append({'beta_A': beta_A, 'beta_B': beta_B, 'rObj': rObj, 'fiObj': fiObj,
                  'gObj': gObj, 'xObj': xObj, 'yObj': yObj, 'h': h, 'l': l})

    # csv.writer would write only the keys of a dict row, so flatten each
    # record into "mask columns followed by r, fi, g" before saving.
    rows = [rec['beta_A'] + rec['beta_B'] + [rec['rObj'], rec['fiObj'], rec['gObj']]
            for rec in M]
    # Pass only the keywords save_to_file actually accepts; the previous call
    # used names it does not define and therefore raised TypeError.
    save_to_file(dataset_name=f'dataset_{task_id}', dataset=rows, h=h, l=l, m=m,
                 gMax=gMax, rMin=rMin, rMax=rMax, n=n)

# Сохранение в файл, анализ и очищение

In [None]:
def save_to_file(dataset_name, dataset, h, l, m, gMax, rMin, rMax, n):
    """Write a dataset to a ';'-delimited CSV file with a leading metadata row.

    Args:
        dataset_name: Output path; ``'.csv'`` is appended unless the name
            already ends with it.
        dataset: Iterable of rows. A row is either a flat sequence, written
            as is, or a dict with keys ``beta_A``, ``beta_B``, ``rObj``,
            ``fiObj``, ``gObj`` (the records built by ``generate_dataset``),
            written as ``beta_A + beta_B + [rObj, fiObj, gObj]``.
        h, l, m, gMax, rMin, rMax, n: Generation parameters stored, in this
            order, as the first row of the file.
    """
    # Test the suffix, not a substring: a name such as 'my.csv_backup' still
    # needs the extension.
    if not dataset_name.endswith('.csv'):
        dataset_name = dataset_name + '.csv'
    with open(dataset_name, 'w', newline='') as output:
        writer = csv.writer(output, delimiter=';')
        writer.writerow([h, l, m, gMax, rMin, rMax, n])
        for row in dataset:
            if isinstance(row, dict):
                # Iterating a dict yields its keys, so writerow(row) would
                # emit column names instead of data; flatten it explicitly.
                row = (list(row['beta_A']) + list(row['beta_B'])
                       + [row['rObj'], row['fiObj'], row['gObj']])
            writer.writerow(row)

def count_repeats(dataset_path):
    """Count how many rows of a saved dataset duplicate an earlier mask.

    The first row of the ';'-delimited file is metadata; its third field is
    read as ``m``. The mask of every following row is its first
    ``2 * (m // 2)`` fields.

    Returns:
        Number of data rows minus the number of distinct masks.
    """
    occurrences = {}
    with open(dataset_path) as file:
        rows = csv.reader(file, delimiter=';')
        header = next(rows)
        mask_width = 2 * (int(header[2]) // 2)
        for record in rows:
            key = tuple(record[:mask_width])
            occurrences[key] = occurrences.get(key, 0) + 1
    return sum(occurrences.values()) - len(occurrences)


def clean_dataset(filepath, output_path='clear_dataset.csv'):
    """Copy a saved dataset, keeping only the first row for each distinct mask.

    The first row of the ';'-delimited input is metadata and is copied
    unchanged; its third field is read as ``m``. The mask of every following
    row is its first ``2 * m`` fields.

    NOTE(review): ``count_repeats`` derives the mask width as ``2 * (m // 2)``
    while this function uses ``2 * m`` — confirm which one matches the layout
    of the files being cleaned.

    Args:
        filepath: Path of the dataset to clean.
        output_path: Where to write the result (defaults to the previously
            hard-coded ``'clear_dataset.csv'``).
    """
    seen_masks = set()
    kept_rows = []
    with open(filepath, newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=';')
        meta_info = next(reader)
        kept_rows.append(meta_info)
        m = int(meta_info[2])
        for row in reader:
            mask = tuple(row[:2 * m])
            if mask not in seen_masks:
                seen_masks.add(mask)
                kept_rows.append(row)
    # newline='' is required for csv writers; without it every line ends in
    # '\r\r\n' on platforms that translate '\n'.
    with open(output_path, 'w', newline='') as file:
        csv.writer(file, delimiter=';').writerows(kept_rows)

In [None]:
# Persist the in-memory `dataset` to dataset.csv together with its generation parameters.
# NOTE(review): in the visible notebook `dataset` is first assigned in a later cell
# (the one calling generate(...)), so on a fresh kernel this cell raises NameError
# unless that cell has been run first. The metadata passed here (rMax=1000, gMax=100,
# n=10000) also differs from the arguments of that generate(...) call — confirm.
save_to_file('dataset.csv', dataset, h=40, l=20, m=720, rMin=50, rMax=1000, gMax=100, n=10000)

# Обновленный алгоритм

In [None]:
def generate(h, l, m, n, rMin, rMax, gMax):
    """Generate ``n`` samples with pairwise distinct sector masks.

    Each sample is a position in polar coordinates ``(r, fi)`` and a size
    ``g``. ``r_a`` / ``r_b`` are the distances from the point
    ``(r*cos(fi), r*sin(fi))`` to ``(-h, 0)`` and ``(h, 0)``. For each of the
    two, the angular interval ``angle ± asin(g / distance)`` is mapped onto
    sector indices of width ``2*pi / m`` and marked with ones in a mask of
    ``m // 2`` entries. Draws that do not fit are rejected and redrawn.

    Args:
        h, l: Geometry parameters; ``asin(l / h)`` is the angular margin kept
            free at both ends of the ``fi`` range.
        m: Number of sectors in the full circle; each mask has ``m // 2`` entries.
        n: Number of distinct masks to collect.
        rMin, rMax: Bounds for ``r``, drawn as ``rMin + sqrt(U) * (rMax - rMin)``.
        gMax: Upper bound for ``g``; the lower bound is ``r * sin(pi / m)``.

    Returns:
        Dict mapping ``tuple(beta_A + beta_B)`` (length ``2 * (m // 2)``) to
        ``[r, fi, g]``. If the same mask is produced again, the later draw
        overwrites the stored parameters.

    Raises:
        ValueError: from ``math.asin`` when ``abs(l / h) > 1``.
    """
    def _unit(value):
        # Keep acos arguments inside [-1, 1] despite floating-point rounding.
        return max(-1.0, min(1.0, value))

    gamma = (2 * math.pi) / m           # angular width of one sector
    sectors_per_radian = m / (2 * math.pi)
    psi = math.asin(l / h)
    half = m // 2
    M = {}
    while len(M) < n:
        # Scale by (rMax - rMin): scaling by rMax alone would let r reach
        # rMin + rMax and exceed the requested upper bound.
        r = rMin + math.sqrt(np.random.uniform(0, 1)) * (rMax - rMin)
        gMin = r * math.sin(gamma / 2)
        if gMin > gMax:
            continue
        g = np.random.uniform(gMin, gMax)
        if g > r:
            continue

        fi_low = psi + math.asin(g / r)
        fi_high = math.pi - fi_low
        if fi_low > fi_high:
            # No admissible angle for this r and g; np.random.uniform would
            # still return a value from the inverted interval.
            continue
        fi = np.random.uniform(fi_low, fi_high)

        r_b = math.sqrt(h * h + r * r - 2 * h * r * math.cos(fi))
        if g > r_b:
            continue
        r_a = math.sqrt(h * h + r * r + 2 * h * r * math.cos(fi))
        if g > r_a:
            continue

        fi_b = math.pi - math.acos(_unit((h - r * math.cos(fi)) / r_b))
        L_b = math.floor(sectors_per_radian * (fi_b - math.asin(g / r_b)))
        R_b = math.floor(sectors_per_radian * (fi_b + math.asin(g / r_b)))
        fi_a = math.acos(_unit((h + r * math.cos(fi)) / r_a))
        L_a = math.floor(sectors_per_radian * (fi_a - math.asin(g / r_a)))
        R_a = math.floor(sectors_per_radian * (fi_a + math.asin(g / r_a)))

        # Reject anything that does not fit in the half-circle masks. A
        # negative start would silently wrap to the end of the list and an end
        # >= m // 2 would raise IndexError, so both ends are checked for both masks.
        if L_a < 0 or L_b < 0 or R_a >= half or R_b >= half:
            continue

        beta_A = [0] * half
        beta_B = [0] * half
        for j in range(L_a, R_a + 1):
            beta_A[j] = 1
        for j in range(L_b, R_b + 1):
            beta_B[j] = 1
        M[tuple(beta_A + beta_B)] = [r, fi, g]
    return M

In [None]:
# Generate the samples and flatten each (mask, [r, fi, g]) entry into a single
# row: the mask values followed by r, fi and g.
dataset = [
    list(mask) + params
    for mask, params in generate(h=40, l=20, m=720, rMin=50, rMax=5000, gMax=500, n=100000).items()
]
print(len(dataset))

100000


# Создание артефактов

In [None]:
!pip install wandb -qqq
!apt install tree

[K     |████████████████████████████████| 1.7 MB 11.5 MB/s 
[K     |████████████████████████████████| 180 kB 69.1 MB/s 
[K     |████████████████████████████████| 97 kB 5.1 MB/s 
[K     |████████████████████████████████| 139 kB 68.5 MB/s 
[K     |████████████████████████████████| 63 kB 1.4 MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 40.7 kB of archives.
After this operation, 105 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tree amd64 1.7.0-5 [40.7 kB]
Fetched 40.7 kB in 0s (751 kB/s)
Selecting previously unselected package tree.
(Reading database ... 155219 files and directories currently installed.)
Preparin

In [None]:
import os
import wandb

In [None]:
# Number of rows in `dataset` before it is split and uploaded.
# NOTE(review): the saved output of this cell (250000) does not match the 100000
# printed by the generation cell, so the stored outputs presumably come from
# different runs.
len(dataset)

250000

In [None]:
from collections import namedtuple

# One split of the data: mask rows, r targets and fi targets, index-aligned.
Dataset = namedtuple("Dataset", ["x", "r", "fi"])


def load(dataset, mask_len=720, test_every=5):
    """Split flat rows into a train and a test ``Dataset``.

    Every row is ``mask_len`` mask values followed by ``r`` and ``fi`` (any
    further columns are ignored). Rows whose index is a multiple of
    ``test_every`` go to the test split, all others to the train split.

    Args:
        dataset: Iterable of indexable rows.
        mask_len: Number of leading mask columns per row (default 720, the
            value that used to be hard-coded).
        test_every: Every ``test_every``-th row, starting with row 0, is put
            into the test split (default 5).

    Returns:
        ``[train, test]`` — two ``Dataset`` tuples with list fields ``x``,
        ``r`` and ``fi``.
    """
    train = Dataset([], [], [])
    test = Dataset([], [], [])
    for i, row in enumerate(dataset):
        split = test if i % test_every == 0 else train
        split.x.append(row[:mask_len])
        split.r.append(row[mask_len])
        split.fi.append(row[mask_len + 1])
    return [train, test]

In [None]:
# Log in to Weights & Biases so the following cell can create a run
# (the saved output shows the API key being appended to the netrc file).
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Open a W&B run in the project, labelled as a data-loading job.
with wandb.init(project="flatfasetgen", job_type="load-data") as run:
    # `load` returns the train split followed by the test split.
    datasets = load(dataset)
    names = ["train_set", "test_set"]

    # Describe the artifact; the size of every split is stored in its metadata.
    split_sizes = [len(split.x) for split in datasets]
    raw_data = wandb.Artifact(
        "Clear_datasets",
        type="dataset",
        description="Generated dataset with latest algo. Cleared. Sqrt distribution. 250k version. Linear distributed by distance",
        metadata={"source": "Manually generated", "sizes": split_sizes},
    )

    # Write one .npz file per split directly into the artifact.
    for split_name, split in zip(names, datasets):
        with raw_data.new_file(f"{split_name}.npz", mode="wb") as npz_file:
            np.savez(npz_file, x=split.x, r=split.r, fi=split.fi)

    # Upload the finished artifact as an output of this run.
    run.log_artifact(raw_data)

[34m[1mwandb[0m: Currently logged in as: [33martem-starkov[0m (use `wandb login --relogin` to force relogin)


VBox(children=(Label(value=' 1377.11MB of 1377.11MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, ma…