# 1. Generate Dataset
Install the frozen (pinned) requirements before running this notebook:
* ```pip install -r requirements.txt```

### 1-1. Setting up

In [1]:
import os
import json
import time
from dotenv import load_dotenv
from ai_translator_mod import AITranslator
from utils_mod import process_translation_term_data, save_json_file

In [2]:
load_dotenv()



True

### 1-2. Grouping terms

In [3]:
# Terms to generate data for: 253 groups of three (759 terms once flattened).
# NOTE: order is load-bearing. The flattened `terms` list is paired with the
# entries of `summaries` purely by position later in this notebook, so do not
# reorder, insert, or de-duplicate entries without regenerating that file.
# NOTE(review): a few entries differ only by letter case (e.g.
# 'bayesian optimization' / 'Bayesian optimization',
# 'hidden Markov models' / 'hidden markov models') — confirm this is intended.
grouped_terms = [
    ['adversarial training', 'recurrent neural architectures', 'bayesian optimization'],
    ['adaptive neural frameworks', 'stochastic neural embeddings', 'gibbs sampling'],
    ['contextual embeddings', 'causal modeling', 'probabilistic neural computation'],
    ['adaptive neural optimization', 'bayesian inference', 'adaptive learning algorithms'],
    ['out-of-distribution generalization', 'self-supervised learning frameworks', 'modular neural networks'],
    ['model compression', 'large language models', 'asynchronous advantage actor-critic'],
    ['sparse Bayesian learning', 'self-supervised sequence labeling', 'probabilistic neural processes'],
    ['dynamic text generation', 'markov chains', 'Gaussian processes'],
    ['support vector machines', 'regression discontinuity', 'neural program induction'],
    ['causal neural embeddings', 'Bayes factors', 'causal probabilistic scene segmentation'],
    ['multilinear algebra', 'sparse coding', 'hierarchical representations'],
    ['probabilistic neural inference', 'variational autoencoders', 'neural information retrieval'],
    ['stochastic neural optimization', 'knowledge-based systems', 'neural density estimation'],
    ['task-driven neural learning', 'gpt', 'dynamic neural programming'],
    ['UMAP (Uniform Manifold Approximation and Projection)', 'bootstrap aggregating', 'probabilistic neural text embeddings'],
    ['Dirichlet processes', 'glove', 'dynamic text embeddings'],
    ['context-aware text generation', 'collaborative filtering', 'statistical relational learning'],
    ['context-aware neural networks', 'cloud computing', 'graph attention networks'],
    ['self-attention mechanisms', 'genetic algorithms', 'semantic text models'],
    ['dynamic scene models', 'residual networks', 'adaptive text models'],
    ['pattern recognition', 'deep probabilistic models', 'transfer learning frameworks'],
    ['semi-supervised embeddings', 'signal processing', 'inverse graphics'],
    ['meta-learning for RL', 'dynamic computational graphs', 'unsupervised learning'],
    ['neural reinforcement learning policies', 'neural probabilistic processes', 'multi-agent systems'],
    ['echo state networks', 'adversarial examples', 'neural probabilistic inference'],
    ['dynamic embedding models', 'multi-task learning', 'differential privacy'],
    ['submodular optimization', 'fine-tuning', 'probabilistic text inference'],
    ['k-nearest neighbors', 'sequence-to-sequence neural models', 'stochastic processes'],
    ['deep structured prediction', 'deep equilibrium models', 'semantic neural scene processes'],
    ['dynamic scene learning', 'multi-view learning', 'neurally plausible models'],
    ['few-shot learning', 'deep structured learning', 'question answering'],
    ['deep generative adversarial networks', 'adaptive probabilistic text learning', 'context-aware task learning'],
    ['machine learning workflows', 'DeBERTa', 'self-attention'],
    ['next-token prediction', 'neural probabilistic scene segmentation', 'automated machine learning'],
    ['semantic probabilistic optimization', 'semantic neural processes', 'latent variable inference'],
    ['certifiable robustness', 'autonomous systems', 'multimodal embeddings'],
    ['neural scene processes', 'style transfer', 'neural sequence generation'],
    ['dynamic neural text models', 'nonparametric Bayes', 'subgraph matching'],
    ['shrinkage estimation', 'neural contextual models', 'random feature expansion'],
    ['graph convolutional networks', 'tensor factorization', 'evolution strategies'],
    ['neural dynamic programming', 'gradient boosting', 'policy optimization'],
    ['deep Boltzmann machines', 'symbolic reasoning', 'stochastic scene models'],
    ['semantic text learning', 'causal representation learning', 'collapsibility'],
    ['neural differential equations', 'instrumental variables', 'natural language processing'],
    ['neural adaptive processes', 'deep task-specific learning', 'internet of things'],
    ['differentiable physics', 'deep graph learning', 'stochastic optimization methods'],
    ['neural program synthesis', 'neural dynamic sequence modeling', 'exchangeable sequences'],
    ['self-attention models', 'doc2vec', 'dynamic scene processes'],
    ['deep ensembles', 'backpropagation', 'time series analysis'],
    ['latent space modeling', 'precision', 'multi-modal models'],
    ['variational inference', 'masked language modeling', 'transformer models'],
    ['semantic neural text segmentation', 'differentiable reasoning', 'adaptive neural text models'],
    ['semantic probabilistic scene segmentation', 'contrastive predictive coding', 'low-rank approximations'],
    ['disentangled representations', 'representation learning', 'stochastic text learning'],
    ['Bayesian nonparametrics', 'graph neural architectures', 'machine translation'],
    ['task-agnostic text generation', 'contextual word embeddings', 'graph-based neural computation'],
    ['clustering', 'neural probabilistic learning', 'deep feature learning'],
    ['semi-supervised machine learning', 'non-parametric methods', 'differentiable optimization'],
    ['dense layers', 'semantic neural text learning', 'neural probabilistic scene models'],
    ['iterative back-translation', 'conditional random fields', 'albert'],
    ['neural style transfer', 'joint embedding architectures', 'game theory'],
    ['graph neural message passing', 'neural tangent kernels', 'xlnet'],
    ['differentiable neural programming', 'neural probabilistic scene processes', 'Wasserstein GAN'],
    ['Bayesian optimization', 'random forests', 'neural dynamic modeling'],
    ['convolutional neural networks', 'meta-learning', 'probabilistic graphical models'],
    ['adaptive probabilistic scene models', 'latent diffusion models', 'annealed importance sampling'],
    ['multi-task reinforcement learning', 'nested models', 'Gaussian mixture models'],
    ['curriculum learning', 'independence of irrelevant alternatives', 'neural scene learning'],
    ['semantic text inference', 'dynamic probabilistic text models', 'probabilistic topic models'],
    ['probabilistic task-driven modeling', 'dynamic scene segmentation', 'deep learning'],
    ['evolutionary algorithms', 'normalizing flows', 'tensor decomposition'],
    ['adaptive neural text processes', 'Indian buffet process', 'latent dirichlet allocation'],
    ['confusion matrix', 'reinforcement learning policies', 'posterior sampling'],
    ['neural reinforcement learning algorithms', 'probabilistic neural scene processes', 'stochastic variational inference'],
    ['neural encoder-decoder models', 'partial pooling', 'symbolic regression'],
    ['deep sequential models', 'autoregressive models', 'deep probabilistic learning'],
    ['neural scene embeddings', 'generative adversarial networks', 'dynamic routing'],
    ['natural language generation', 'latent space models', 'deep bayesian networks'],
    ['contextual bandits', 'probabilistic task-driven learning', 'neural dynamic text models'],
    ['probabilistic neural text processes', 'auto-regressive models', 'dynamic graph networks'],
    ['task-specific learning', 'transformer networks', 'algorithmic fairness'],
    ['neural network pruning', 'neural structured learning', 'probabilistic text generation'],
    ['hybrid models', 'sequential neural models', 'probabilistic learning algorithms'],
    ['semantic dynamic models', 'low-shot learning', 'ensemble methods'],
    ['memory-augmented networks', 'semantic scene understanding', 'causal discovery methods'],
    ['dropout', 'few-shot object detection', 'differentially private learning'],
    ['Chinese restaurant process', 'deep probabilistic scene segmentation', 'neural stochastic processes'],
    ['deep contextual learning', 'task-driven sequence learning', 'Monte Carlo dropout'],
    ['differentiable neural computation', 'neural knowledge graph completion', 'self-training'],
    ['temporal convolutional networks', 'federated learning frameworks', 'deep kernel learning'],
    ['marginal independence', 'adaptive neural embeddings', 'robustness to distributional shift'],
    ['hierarchical Bayesian models', 'context-aware reinforcement learning', 'causal effect estimation'],
    ['neural task-specific learning', 'generative text models', 'conformal prediction'],
    ['hidden Markov models', 'partially observable Markov decision processes', 'adaptive probabilistic text segmentation'],
    ['probabilistic programming frameworks', 'dynamic scene understanding', 'autoregressive text generation'],
    ['semantic segmentation', 'meta-learning algorithms', 'message passing neural networks'],
    ['self-organizing maps', 'Hamiltonian Monte Carlo', 'distilbert'],
    ['probabilistic scene learning', 'pre-trained models', 'neural autoregressive models'],
    ['policy gradient methods', 'causal inference', 'spectral clustering'],
    ['contrastive divergence', 'dictionary learning', 'exchangeable models'],
    ['adaptive probabilistic text models', 'task-specific neural models', 'stochastic scene segmentation'],
    ['causal learning networks', 'predictive distributions', 'neural graph-based learning'],
    ['causal probabilistic networks', 'stochastic text models', 'Bayesian belief networks'],
    ['dynamic convolution', 'topic modeling', 'Bayesian model selection'],
    ['neural scene segmentation', 'attention is all you need', 'Bayesian neural networks'],
    ['neural variational inference', 'skip connections', 'secure multi-party computation'],
    ['interactive learning', 'machine learning', 'zero-inflated models'],
    ['stochastic text embeddings', 'multi-modal deep learning', 'causal discovery'],
    ['contextual representation models', 'long-range dependencies', 'bayesian optimization techniques'],
    ['probabilistic text optimization', 'optimization-based meta-learning', 'neural text segmentation'],
    ['batch normalization', 'stochastic scene embeddings', 'multi-agent reinforcement learning'],
    ['neural architecture search', 'text-to-speech', 'hyperparameter optimization'],
    ['dynamic text models', 'implicit models', 'dynamic neural learning'],
    ['factor graphs', 'quantum machine learning', 'reinforcement learning'],
    ['self-supervised text classification', 'generative modeling', 'self-supervised learning'],
    ['adaptive neural networks', 'semantic neural optimization', 'probabilistic text processes'],
    ['semantic probabilistic text learning', 'dynamic neural processes', 'probabilistic neural text models'],
    ['approximate inference', 'adaptive neural scene segmentation', 'neural radiance fields'],
    ['k-means', 'autoregressive flows', 'semantic scene models'],
    ['semantic text processes', 'dynamic scene embeddings', 'deep neural text models'],
    ['cutsets', 'infinite hidden Markov models', 'stochastic neural frameworks'],
    ['causal neural processes', 'audio-visual speech recognition', 'probabilistic scene models'],
    ['neural semantic segmentation', 'task-agnostic learning', 'deep generative models'],
    ['recommendation systems', 'deep reinforcement learning policies', 'automatic differentiation'],
    ['unsupervised representation learning', 'deep policy networks', 'task-specific contextual learning'],
    ['named entity recognition', 'approximate bayesian computation', 'probabilistic language modeling'],
    ['speech recognition', 'exchangeability', 'differentially private training'],
    ['attention mechanisms', 'adaptive text segmentation', 'dynamic task-driven learning'],
    ['end-to-end reinforcement learning', 'dynamic sequence modeling', 'adversarial robustness'],
    ['posterior predictive checks', 'dynamic neural scene processes', 'evolutionary strategies'],
    ['causal generative models', 'neural probabilistic scene learning', 'inverse reinforcement learning'],
    ['multi-head attention', 'information bottleneck method', 'contrastive learning'],
    ['stick-breaking processes', 'self-normalizing networks', 'probabilistic neural scene segmentation'],
    ['meta-reinforcement learning', 'neural context-aware learning', 'probabilistic neural scene learning'],
    ['text-to-image synthesis', 'probabilistic generative models', 'Polya trees'],
    ['relational models', 'dynamic text processes', 'policy search'],
    ['program synthesis', 'probabilistic neural text learning', 'task-driven learning'],
    ['probabilistic neural learning', 'anomaly detection', 'multimodal learning'],
    ['restricted Boltzmann machines', 'principal component analysis', 'calibrated classifiers'],
    ['sentiment analysis', 'byte-pair encoding', 'graph isomorphism networks'],
    ['matrix factorization', 'autonomous driving', 'artificial intelligence'],
    ['deep probabilistic optimization', 'continual learning', 'privacy-preserving machine learning'],
    ['transformer-XL', 'neural probabilistic scene embeddings', 'adaptive scene learning'],
    ['image super-resolution', 'algorithmic bias', 'dynamic neural attention'],
    ['optimization algorithms', 'learning to optimize', 'graph neural modeling'],
    ['deep metric learning', 'neural probabilistic models', 'adaptive text processes'],
    ['structured prediction', 'interpretability', 'neural relation extraction'],
    ['exchangeable arrays', 'neural adaptive optimization', 'boosting'],
    ['neural ordinary differential equations', 'soft attention', 'dbscan'],
    ['markov decision processes', 'graphical model structure learning', 'graph-based learning'],
    ['stochastic neural learning', 'neural logic networks', 'hybrid reinforcement learning'],
    ['stochastic learning processes', 'loss functions', 'few-shot learning frameworks'],
    ['probabilistic neural text segmentation', 'gradient descent', 'energy-based models'],
    ['dynamic probabilistic learning', 'probabilistic text models', 'adaptive learning processes'],
    ['empirical Bayes', 'knowledge graph embeddings', 'distillation'],
    ['manifold learning', 'hierarchical clustering', 'hmm'],
    ['neural text classification', 'deep density estimation', 'supervised learning'],
    ['neural probabilistic programming', 'spectral normalization', 'simultaneous machine translation'],
    ['task-driven text models', 'fasttext', 'contextual sequence modeling'],
    ['neural text models', 'deep probabilistic modeling', 'recurrent neural networks'],
    ['graph-based neural networks', 'uncertainty quantification', 'probabilistic neural embeddings'],
    ['transformer-based architectures', 'policy gradients', 'graph-based reinforcement learning'],
    ['hybrid neural networks', 'generative pretraining', 'semantic text segmentation'],
    ['hypernetworks', 'adaptive semantic text models', 'word2vec'],
    ['edge computing', 'dynamic generative models', 'approximate Bayesian computation'],
    ['Markov Chain Monte Carlo', 'graph embedding', 'deep generative modeling'],
    ['hyperparameter tuning', 'graph-based machine learning', 'neural probabilistic text processes'],
    ['structured sparsity', 'long short-term memory', 'mixture models'],
    ['deep semantic segmentation', 'quantization', 'nonparametric Bayesian models'],
    ['neural sequence labeling', 'autoencoders', 'probabilistic programming'],
    ['neural latent variable models', 'probabilistic deep learning', 'neural adaptive text learning'],
    ['latent variable models', 'dynamic reinforcement learning', 'semantic probabilistic embeddings'],
    ['tokenization', 'adaptive neural text embeddings', 'graph neural networks'],
    ['differentiable rendering', 'difference-in-differences', 'masked language models'],
    ['neural density models', 'dynamic probabilistic neural models', 'task-agnostic reinforcement learning'],
    ['graph representation learning', 'image recognition', 'semi-parametric models'],
    ['contrastive learning frameworks', 'homomorphic encryption', 'self-supervised pretraining'],
    ['machine unlearning', 'multi-head attention networks', 'adaptive text learning'],
    ['data augmentation', 'pose estimation', 't5'],
    ['semantic image segmentation', 'neural ODEs', 'multi-hop reasoning'],
    ['differentiable neural processes', 'monte carlo methods', 'probabilistic neural networks'],
    ['semi-supervised text classification', 'neural processes', 'conditional independence'],
    ['neural context-aware models', 'contextual representation learning', 'neurosymbolic AI'],
    ['causal graph networks', 'semantic neural embeddings', 'differentiable neural computer'],
    ['structured neural networks', 'neural task-driven learning', 'active learning'],
    ['convolutional layers', 'learning to search', 'interpretable machine learning'],
    ['semantic adaptive learning', 'knowledge distillation', 'dynamic neural networks'],
    ['non-negative matrix factorization', 'neural probabilistic sequence models', 'adaptive boosting'],
    ['data-driven decision making', 'semantic probabilistic learning', 'neural networks'],
    ['ensemble neural networks', 'contrastive loss', 'learned optimizers'],
    ['metric learning', 'ensemble learning', 'information bottleneck'],
    ['Markov random fields', 'memory-augmented neural networks', 'neural sequence prediction'],
    ['zero-shot learning', 'sequence modeling', 'Riemannian manifolds'],
    ['natural language understanding', 'neural generative inference', 'dimensionality reduction'],
    ['probabilistic sequence generation', 'neural information processing', 'spiking neural networks'],
    ['decision trees', 'AI alignment', 'deep reinforcement learning'],
    ['neural rendering', 'semantic probabilistic scene models', 'sequence tagging'],
    ['unsupervised text generation', 'neural scene models', 'neural probabilistic text learning'],
    ['domain generalization', 'adaptive neural learning', 'contextual neural modeling'],
    ['dynamic neural inference', 'hidden markov models', 'capsule networks'],
    ['neural adaptive computation', 'meta-gradient learning', 'computer vision'],
    ['neural context-aware generation', 'context-aware probabilistic learning', 'word embeddings'],
    ['dynamic probabilistic modeling', 'multitask learning', 'deep probabilistic forecasting'],
    ['neural search', 'attention networks', 'deep probabilistic inference'],
    ['semantic neural learning', 'transformer-based models', 'basis pursuit'],
    ['multilevel models', 'maximum likelihood estimation', 't-SNE (t-Distributed Stochastic Neighbor Embedding)'],
    ['symbolic AI', 'neural sequence-to-sequence learning', 'explainable AI'],
    ['recall', 'roberta', 'stochastic text segmentation'],
    ['probabilistic dynamic learning', 'semantic probabilistic text models', 'graph structure learning'],
    ['end-to-end learning', 'bert', 'semi-supervised learning'],
    ['adaptive computation time', 'group sparsity', 'semantic neural inference'],
    ['scalable neural networks', 'mcmc', 'neural dynamic sequence learning'],
    ['reinforcement learning frameworks', 'contextual text generation', 'neural probabilistic text models'],
    ['context-aware sequence learning', 'non-Euclidean domains', 'continuous normalizing flows'],
    ['deep q-networks', 'sequence-to-sequence models', 'neural dynamic learning'],
    ['pairwise Markov networks', 'object detection', 'feature engineering'],
    ['stochastic neural processes', 'semantic adaptive processes', 'graph-based neural modeling'],
    ['importance weighted autoencoders', 'dynamic attention models', 'cross-modal learning'],
    ['transfer learning', 'sequential decision making', 'neural spline flows'],
    ['dynamic neural scene learning', 'propensity score matching', 'neural architecture search techniques'],
    ['probabilistic neural optimization', 'context-aware sequence generation', 'neural dynamic optimization'],
    ['q-learning', 'probabilistic text learning', 'differentiable programming'],
    ['adaptive scene segmentation', 'neural scene understanding', 'bayesian networks'],
    ['prompt engineering', 'unsupervised language models', 'domain adaptation'],
    ['lightgbm', 'sampling methods', 'neural adaptive learning'],
    ['xgboost', 'probabilistic text segmentation', 'neural-symbolic learning'],
    ['subword tokenization', 'semantic neural models', 'encoder-decoder architecture'],
    ['low-dimensional embeddings', 'posterior predictive distributions', 'neural task adaptation'],
    ['neural-symbolic integration', 'model evaluation', 'neural architecture optimization'],
    ['neural adaptive learning processes', 'context-aware learning', 'deep probabilistic embeddings'],
    ['knowledge graph completion', 'latent Dirichlet allocation', 'graph matching networks'],
    ['actor-critic methods', 'dynamic Bayesian networks', 'deep task-driven learning'],
    ['neural Turing machines', 'contextual neural attention', 'deep reinforcement learning frameworks'],
    ['gradient penalty', 'neural knowledge graphs', 'causal inference algorithms'],
    ['differentiable programming frameworks', 'neural logic programming', 'neural task-specific models'],
    ['probabilistic relational models', 'deep relational networks', 'contextual learning'],
    ['semantic probabilistic scene learning', 'generative text modeling', 'dynamic neural optimization'],
    ['heterogeneous graphs', 'machine reasoning', 'neural probabilistic text segmentation'],
    ['temporal difference learning', 'inverse graphical models', 'contextual learning frameworks'],
    ['graph spectral methods', 'liquid state machines', 'dynamic text modeling'],
    ['adaptive neural processes', 'high-dimensional statistics', 'adaptive probabilistic models'],
    ['deep uncertainty estimation', 'sequential Monte Carlo', 'task-driven reinforcement learning'],
    ['adaptive probabilistic scene learning', 'content-based filtering', 'relational inductive biases'],
    ['semantic scene learning', 'face recognition', 'dynamic representation learning'],
    ['self-supervised image classification', 'bagging', 'federated learning'],
    ['dynamic neural embeddings', 'recurrent convolutional networks', 'context-aware sequence models'],
    ['cross-validation', 'neural generative programming', 'probabilistic sequence modeling'],
    ['causal reinforcement learning', 'semantic probabilistic text processes', 'stochastic gradient Langevin dynamics'],
    ['wordpiece tokenization', 'activation functions', 'residual connections'],
    ['self-supervised task learning', 'pruning', 'low-rank approximation'],
    ['context-aware deep learning', 'neural memory networks', 'conversational agents'],
    ['multi-scale models', 'context-aware text models', 'deep probabilistic scene models'],
    ['neural task-driven modeling', 'speech-to-text', "de Finetti's theorem"]
]

len(grouped_terms)

253

In [4]:
# Flatten the groups into one list, keeping group order and in-group order.
terms = []
for group in grouped_terms:
    terms.extend(group)
len(terms)

759

In [5]:
# with open("./dataset_new_turn_2/dataset_new_turn_2_1.json", "r", encoding= 'UTF8') as json_file:
#     dataset_new_turn_2_1 = json.load(json_file)
# print(len(dataset_new_turn_2_1))
# dataset_new_turn_2_1[-1]

In [6]:
terms[344:350]

['self-supervised learning',
 'adaptive neural networks',
 'semantic neural optimization',
 'probabilistic text processes',
 'semantic probabilistic text learning',
 'dynamic neural processes']

### 1-3. Get arXiv Summaries

In [7]:
# Read the pre-fetched arXiv search results from disk and parse them as JSON.
with open("./arxiv_list/arxiv_total.json", mode="r", encoding='UTF8') as json_file:
    summaries = json.loads(json_file.read())

In [8]:
summaries[0]

{'term': 'adversarial training',
 'response': [{'title': 'HOLMES: to Detect Adversarial Examples with Multiple Detectors',
   'summary': '  Deep neural networks (DNNs) can easily be cheated by some imperceptible but\npurposeful noise added to images, and erroneously classify them. Previous\ndefensive work mostly focused on retraining the models or detecting the noise,\nbut has either shown limited success rates or been attacked by new adversarial\nexamples. Instead of focusing on adversarial images or the interior of DNN\nmodels, we observed that adversarial examples generated by different algorithms\ncan be identified based on the output of DNNs (logits). Logit can serve as an\nexterior feature to train detectors. Then, we propose HOLMES (Hierarchically\nOrganized Light-weight Multiple dEtector System) to reinforce DNNs by detecting\npotential adversarial examples to minimize the threats they may bring in\npractical. HOLMES is able to distinguish \\textit{unseen} adversarial examples\

In [9]:
summaries[0]['response'][0]['summary']

'  Deep neural networks (DNNs) can easily be cheated by some imperceptible but\npurposeful noise added to images, and erroneously classify them. Previous\ndefensive work mostly focused on retraining the models or detecting the noise,\nbut has either shown limited success rates or been attacked by new adversarial\nexamples. Instead of focusing on adversarial images or the interior of DNN\nmodels, we observed that adversarial examples generated by different algorithms\ncan be identified based on the output of DNNs (logits). Logit can serve as an\nexterior feature to train detectors. Then, we propose HOLMES (Hierarchically\nOrganized Light-weight Multiple dEtector System) to reinforce DNNs by detecting\npotential adversarial examples to minimize the threats they may bring in\npractical. HOLMES is able to distinguish \\textit{unseen} adversarial examples\nfrom multiple attacks with high accuracy and low false positive rates than\nsingle detector systems even in an adaptive model. To ensure

In [10]:
len(summaries)

759

In [11]:
def get_summary_safely(summaries, idx, paper_idx=2):
    """Return one paper abstract stored for ``summaries[idx]``, or ``" "`` if unavailable.

    Args:
        summaries: Sequence of entries, each expected to look like
            ``{'term': ..., 'response': [{'title': ..., 'summary': ...}, ...]}``.
        idx: Position of the entry to read.
        paper_idx: Which paper inside the entry's ``'response'`` list to use.
            Defaults to ``2`` (the third paper), matching the previous
            hard-coded behaviour.
            NOTE(review): the reason for preferring the third paper is not
            visible in this notebook — confirm.

    Returns:
        The ``'summary'`` string of the selected paper, or a single space
        ``" "`` when the entry, the paper, or one of the keys is missing, or
        when ``'response'`` is not an indexable list of dicts (for example
        ``None`` or an error string).
    """
    try:
        return summaries[idx]['response'][paper_idx]['summary']
    except (IndexError, KeyError, TypeError):
        # TypeError covers a malformed 'response' (None, str, ...); without it
        # this "safe" accessor would still crash on such entries.
        return " "

# One summary per term, paired purely by position in `terms` / `summaries`.
arxiv_summaries = [
    get_summary_safely(summaries, term_idx)
    for term_idx, _ in enumerate(terms)
]

In [12]:
len(arxiv_summaries)

759

### 1-4. Agentic Data generation

In [13]:
# import time
# import traceback
# import json
# import random
# import requests

# model = AITranslator()

# error_indices = []  # List to store indices with repeated errors
# train_data = []

# base_delay = 3  # 기본 대기 시간 (초)
# max_retries = 5  # 최대 재시도 횟수 증가

# def add_train_data(idx, summary):
#     term = terms[idx]
#     turn_index = 2
#     retry = True
#     retry_count = 0
#     while retry and retry_count < max_retries:
#         try:
#             print(f"{idx}번째 그룹")
#             print(term)
#             _, sentences = model.gen_translate_sentences(term, summary)
#             result = process_translation_term_data(
#                 turn_index=turn_index,
#                 data=sentences.chat_history[-2]["content"], 
#                 domain="cs.AI",
#                 term=term,
#                 summary=summary
#             )
#             print(json.dumps(result, ensure_ascii=False, indent=4))
#             train_data.append(result)   # Add results to train_data
#             time.sleep(base_delay)
            
#             retry = False  # Set retry to False to exit the loop on success
        
#         except (requests.exceptions.ConnectionError, ConnectionResetError) as e:
#             print(f"Network error at index {idx}: {e}")
#             retry_count += 1
#             if retry_count >= max_retries:
#                 print(f"Max retries reached for index {idx}. Moving to next.")
#                 error_indices.append(idx)  # Add index to error list
#             else:
#                 delay = base_delay * (2 ** retry_count) + random.uniform(0, 1)
#                 print(f"Retrying in {delay:.2f} seconds...")
#                 time.sleep(delay)
        
#         except IndexError:
#             print(f"IndexError at index {idx}. Passing.")
#             error_indices.append(idx)
#             retry = False
        
#         except Exception as e:
#             print(f"Unexpected error at index {idx}: {e}")
#             traceback.print_exc()
#             retry_count += 1
#             if retry_count >= max_retries:
#                 print(f"Max retries reached for index {idx}. Moving to next.")
#                 error_indices.append(idx)  # Add index to error list
#                 retry = False
#             else:
#                 delay = base_delay * (2 ** retry_count) + random.uniform(0, 1)
#                 print(f"Retrying in {delay:.2f} seconds...")
#                 time.sleep(delay)
#     return result


In [14]:
# from concurrent.futures import ThreadPoolExecutor

# with ThreadPoolExecutor(max_workers=20) as executor:
#     responses = list(executor.map(add_train_data, range(len(terms)), arxiv_summaries))


In [19]:
len(terms[344:])

415

In [22]:
import time
import traceback
import json
import random
import requests
from threading import Lock
from concurrent.futures import ThreadPoolExecutor

model = AITranslator()

# Retry / back-off settings read by add_train_data.
max_retries = 5  # maximum number of attempts per term
base_delay = 20  # base wait time, in seconds

# Results shared between worker threads, each guarded by its own lock.
train_data = []
train_data_lock = Lock()     # guards train_data
error_indices = []           # indices that ended in an error
error_indices_lock = Lock()  # guards error_indices

def _error_record(turn_index, term, domain, summary, error):
    """Build the placeholder dict returned when generation fails for a term."""
    return {
        "turn_index": turn_index,
        "term": term,
        "domain": domain,
        "summary": summary,
        "error": error,
    }


def _mark_failed(idx):
    """Append ``idx`` to the shared ``error_indices`` list while holding its lock."""
    with error_indices_lock:
        error_indices.append(idx)


def add_train_data(idx, summary, turn_index=2, domain="cs.AI"):
    """Generate one training record for ``terms[idx]`` with retries.

    Calls ``model.gen_translate_sentences(term, summary)``, passes the content
    of the second-to-last chat message to ``process_translation_term_data`` and,
    on success, appends the processed record to the shared ``train_data`` list
    (under ``train_data_lock``) and then sleeps ``base_delay`` seconds.

    Failure handling:
      * ``requests`` connection errors / ``ConnectionResetError`` and any other
        ``Exception`` are retried up to ``max_retries`` attempts, waiting
        ``base_delay * 2**retry_count`` seconds plus up to one second of random
        jitter between attempts.
      * ``IndexError`` is not retried.
      * Every index that ends in failure is appended to ``error_indices``.

    Args:
        idx: Index into the module-level ``terms`` list.
        summary: Summary text passed along with the term.
        turn_index: Value stored in the record's ``turn_index`` field
            (default ``2``, the previously hard-coded value).
        domain: Value stored in the record's ``domain`` field
            (default ``"cs.AI"``, the previously hard-coded value).

    Returns:
        The record returned by ``process_translation_term_data`` on success;
        otherwise a dict with ``turn_index``, ``term``, ``domain``, ``summary``
        and an ``error`` message. ``None`` only if ``max_retries`` is not
        positive, in which case no attempt is made.
    """
    term = terms[idx]
    retry_count = 0

    while retry_count < max_retries:
        try:
            print(f"{idx}번째 그룹")
            print(term)
            _, sentences = model.gen_translate_sentences(term, summary)
            result = process_translation_term_data(
                turn_index=turn_index,
                data=sentences.chat_history[-2]["content"],
                domain=domain,
                term=term,
                summary=summary,
            )
            print(json.dumps(result, ensure_ascii=False, indent=4))

            # Thread-safe append: several workers share train_data.
            with train_data_lock:
                train_data.append(result)

            # Pause after a successful call before this worker takes more work.
            time.sleep(base_delay)
            return result

        except (requests.exceptions.ConnectionError, ConnectionResetError) as e:
            print(f"Network error at index {idx}: {e}")
            last_error = e

        except IndexError:
            # Not retried: record the failure and give up on this term.
            print(f"IndexError at index {idx}. Passing.")
            _mark_failed(idx)
            return _error_record(turn_index, term, domain, summary, "IndexError")

        except Exception as e:
            print(f"Unexpected error at index {idx}: {e}")
            traceback.print_exc()
            last_error = e

        # Shared retry bookkeeping for the two retryable branches above.
        retry_count += 1
        if retry_count >= max_retries:
            print(f"Max retries reached for index {idx}. Moving to next.")
            _mark_failed(idx)
            return _error_record(turn_index, term, domain, summary, str(last_error))

        # Exponential back-off with jitter before the next attempt.
        delay = base_delay * (2 ** retry_count) + random.uniform(0, 1)
        print(f"Retrying in {delay:.2f} seconds...")
        time.sleep(delay)

    return None

# Main execution: generate records for one slice of `terms` in parallel.
BATCH_START, BATCH_END = 100, 344  # half-open index range handled by this run

responses = []
try:
    with ThreadPoolExecutor(max_workers=10) as executor:
        # executor.map yields results in input order, not completion order.
        responses = list(
            executor.map(
                add_train_data,
                range(BATCH_START, BATCH_END),
                arxiv_summaries[BATCH_START:BATCH_END],
            )
        )
except Exception as e:
    print(f"Error in thread pool execution: {e}")
    traceback.print_exc()

# Save after the pool has shut down and regardless of whether collecting the
# results failed, so records that were already generated are not thrown away.
# NOTE(review): add_train_data defaults to turn_index 2, yet the output goes
# under "dataset_new_turn_1" — confirm the directory/file name is intended.
try:
    save_json_file(train_data, directory_name="dataset_new_turn_1", file_name="dataset_new_turn_1_2.json")
except Exception as e:
    print(f"Error saving generated data: {e}")
    traceback.print_exc()

# Filter results: drop None and records that carry an 'error' key.
valid_responses = [r for r in responses if r is not None and 'error' not in r]

100번째 그룹
neural probabilistic scene segmentation
101번째 그룹
automated machine learning
102번째 그룹
semantic probabilistic optimization
103번째 그룹
semantic neural processes
104번째 그룹
latent variable inference
105번째 그룹
certifiable robustness
106번째 그룹
autonomous systems
107번째 그룹
multimodal embeddings
108번째 그룹
neural scene processes
109번째 그룹
style transfer
[33mInit[0m (to chat_manager):

Topic: Generating professional English sentences.

--------------------------------------------------------------------------------
[33mInit[0m (to chat_manager):

Topic: Generating professional English sentences.

--------------------------------------------------------------------------------
[33mInit[0m (to chat_manager):

Topic: Generating professional English sentences.

--------------------------------------------------------------------------------
[33mInit[0m (to chat_manager):

Topic: Generating professional English sentences.

----------------------------------------------------------------------

In [23]:
len(train_data)

244

In [18]:
train_data

[{'turn_index': 2,
  'term': 'probabilistic text processes',
  'domain': 'cs.AI',
  'summary': '  Process mining is a subfield of process science that analyzes event data\ncollected in databases called event logs. Recently, novel types of event data\nhave become of interest due to the wide industrial application of process\nmining analyses. In this paper, we examine uncertain event data. Such data\ncontain meta-attributes describing the amount of imprecision tied with\nattributes recorded in an event log. We provide examples of uncertain event\ndata, present the state of the art in regard of uncertainty in process mining,\nand illustrate open challenges related to this research direction.\n',
  'english': 'Probabilistic text processes, akin to process mining, involve the analysis of event data collected in databases, specifically focusing on the imprecision tied with recorded attributes in an event log. The examination of uncertain event data, which includes meta-attributes describing 