In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from rdkit import Chem
from rdkit.Chem import Crippen
from rdkit.Chem import Descriptors

# Coefficients of the synthetic linear target a * M.W. + b.
# The random draw below is disabled; fixed values are used instead.
#   a = random.uniform(0, 10)
#   b = random.uniform(-200, 200)
a = 6.462356980390821
b = -162.75140504630065

# Load the dataset.
esol = pd.read_csv('../delaney-processed.csv')

# SMILES string length and the linear function of the 'Molecular Weight' column.
esol['smiles_len'] = esol['smiles'].apply(len)
esol['aM_w+b'] = a * esol['Molecular Weight'] + b

# Parse every SMILES string exactly once and reuse the molecule objects for
# all descriptors, instead of re-parsing the same string for each column.
_mols = [Chem.MolFromSmiles(smi) for smi in esol['smiles']]

# RDKit descriptor used for each new column (insertion order = column order).
_descriptor_fns = {
    'LogP': Descriptors.MolLogP,
    'TPSA': Descriptors.TPSA,
    'sp3': Descriptors.FractionCSP3,
    'MolMR': Descriptors.MolMR,
    'BJ': Descriptors.BalabanJ,
    'Chi': Descriptors.Chi1v,
    'HKA': Descriptors.HallKierAlpha,
}
for _column, _descriptor in _descriptor_fns.items():
    esol[_column] = [_descriptor(mol) for mol in _mols]

print(f"a = {a}, b = {b}")

# Robust z-score normalization with an optional non-linear transform.
def robust_zscore(values, mode):
    """Scale values by their median and MAD, then apply the transform for mode.

    The robust z-score is ``z = (x - median) / (1.4826 * MAD)``, where the
    median and the MAD (median absolute deviation) are computed over the
    non-NaN entries only. The factor 1.4826 makes ``1.4826 * MAD`` comparable
    to the standard deviation for normally distributed data.

    Args:
        values: Array-like of numbers (list, ndarray, pandas Series). It is
            copied into a float ndarray; NaN entries are excluded from the
            statistics.
        mode: Selects the transform applied to ``z``:
            0 -> z
            1 -> z * 1000
            2 -> z ** 2
            3 -> log(|z + 1e-3|) + 500
            4 -> sin(10 * z)
            5 -> exp(z - 10)

    Returns:
        A float ndarray with the same shape as the input. Two special cases
        skip the transform: if every entry is NaN the (copied) input is
        returned unchanged, and if the MAD is zero an all-zero array is
        returned.

    Raises:
        ValueError: If mode is not one of the supported values 0-5.
    """
    transforms = {
        0: lambda z: z,
        1: lambda z: z * 1000,
        2: lambda z: z ** 2,
        # NOTE(review): the epsilon is added before abs(), so z == -1e-3 still
        # yields log(0) = -inf. Left as-is to keep existing targets unchanged.
        3: lambda z: np.log(np.abs(z + 1e-3)) + 500,
        4: lambda z: np.sin(10 * z),
        5: lambda z: np.exp(z - 10),
    }
    # Fail loudly instead of silently returning None for an unknown mode.
    if mode not in transforms:
        raise ValueError(
            f"Unsupported mode {mode!r}; expected one of {sorted(transforms)}"
        )

    # Float copy: np.isnan is then well defined for int/object input as well.
    values = np.array(values, dtype=float)

    valid_mask = ~np.isnan(values)
    if not valid_mask.any():
        return values  # empty input or all entries NaN

    valid_values = values[valid_mask]
    median = np.median(valid_values)
    mad = np.median(np.abs(valid_values - median))

    # A zero MAD would make the scale zero; return zeros instead of dividing.
    if mad == 0:
        return np.zeros_like(values)

    robust_z = (values - median) / (1.4826 * mad)
    return transforms[mode](robust_z)
      
# List of properties to normalize

properties_to_normalize = ['Molecular Weight', 'LogP', 'TPSA', 'sp3', 'MolMR', 'BJ', 'Chi', 'HKA']

# Summary statistics per property, before and after normalization.
original_stats = {}
normalized_stats = {}


def _describe(arr):
    """Return mean, median, std, MAD, min and max of arr as a dict."""
    centre = np.median(arr)
    return {
        'mean': np.mean(arr),
        'median': centre,
        'std': np.std(arr),
        'mad': np.median(np.abs(arr - centre)),
        'min': np.min(arr),
        'max': np.max(arr),
    }


# Derived columns: (column suffix, transform mode, re-standardize afterwards).
# Non-linear transforms (modes 2-5) are passed through robust_zscore again
# with mode 0; the plain and x1000 variants are stored as they come out.
_derived_columns = (
    ('robust_zscore', 0, False),
    ('lt', 1, False),
    ('sq', 2, True),
    ('lg', 3, True),
    ('sn', 4, True),
    ('ex', 5, True),
)

print("\n=== 원본 데이터 통계량 ===")
for prop in properties_to_normalize:
    raw = esol[prop].values

    # Statistics of the untouched column, plus the share of points lying
    # more than three standard deviations away from the mean.
    stats = _describe(raw)
    far_from_mean = np.abs(raw - stats['mean']) > 3 * stats['std']
    stats['outlier_ratio'] = np.sum(far_from_mean) / len(raw)
    original_stats[prop] = stats

    print(f"\n{prop}:")
    print(f"  Mean: {stats['mean']:.3f}")
    print(f"  Median: {stats['median']:.3f}")
    print(f"  Std: {stats['std']:.3f}")
    print(f"  MAD: {stats['mad']:.3f}")
    print(f"  Range: [{stats['min']:.3f}, {stats['max']:.3f}]")
    print(f"  Outlier ratio (>3σ): {stats['outlier_ratio']:.3f}")

print("\n=== Robust Z-score 정규화 수행 ===")

# Add the normalized / transformed columns for every property.
for prop in properties_to_normalize:
    for suffix, mode, restandardize in _derived_columns:
        column_values = robust_zscore(esol[prop], mode)
        if restandardize:
            column_values = robust_zscore(column_values, 0)
        esol[f"{prop}_{suffix}"] = column_values

    # Statistics of the plain robust z-score column.
    normalized_col = f"{prop}_robust_zscore"
    after = _describe(esol[normalized_col].values)
    normalized_stats[prop] = after

    print(f"\n{prop} -> {normalized_col}:")
    print(f"  정규화 후 Mean: {after['mean']:.6f}")
    print(f"  정규화 후 Median: {after['median']:.6f}")
    print(f"  정규화 후 Std: {after['std']:.3f}")
    print(f"  정규화 후 MAD: {after['mad']:.6f}")
    print(f"  정규화 후 Range: [{after['min']:.3f}, {after['max']:.3f}]")

# Sanity check: compare the measured median / MAD with the expected values.
print("\n=== 정규화 검증 ===")
print("Robust Z-score의 특성:")
print("- Median = 0 (정확히 0)")
print("- MAD = 1/1.4826 ≈ 0.6745 (이론값)")
print("\n실제 값들:")
theoretical_mad = 1 / 1.4826
for prop in properties_to_normalize:
    after = normalized_stats[prop]
    print(f"{prop}:")
    print(f"  Median: {after['median']:.6f} (이론값: 0)")
    print(f"  MAD: {after['mad']:.6f} (이론값: {theoretical_mad:.6f})")

# Overview of the columns that were added.
zscore_columns = [f"{prop}_robust_zscore" for prop in properties_to_normalize]
print("\n=== 최종 데이터프레임 정보 ===")
print(f"전체 컬럼 수: {len(esol.columns)}")
print("정규화된 컬럼들:")
for column in zscore_columns:
    print(f"  {column}")

# First rows of the original and normalized columns side by side.
print("\n=== 샘플 데이터 (첫 5행) ===")
cols_to_show = properties_to_normalize + zscore_columns
print(esol[cols_to_show].head())

# Optional: compare a property before and after normalization as histograms.
def plot_before_after_histograms(prop_name, save_plots=False):
    """Show side-by-side histograms of a property and its robust z-score.

    Args:
        prop_name: Name of a column in the module-level ``esol`` DataFrame;
            the column ``f"{prop_name}_robust_zscore"`` is plotted next to it.
        save_plots: When true, the figure is also written to
            ``robust_zscore_<prop_name>.png`` (spaces replaced by
            underscores) at 300 dpi before being shown.
    """
    # One entry per panel: (column, bar colour, title, x-axis label).
    panels = (
        (prop_name, 'blue', f'Original {prop_name}', prop_name),
        (
            f"{prop_name}_robust_zscore",
            'red',
            f'Robust Z-score Normalized {prop_name}',
            f'{prop_name} (Robust Z-score)',
        ),
    )

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for axis, (column, colour, title, xlabel) in zip(axes, panels):
        axis.hist(esol[column], bins=50, alpha=0.7, color=colour, edgecolor='black')
        axis.set_title(title)
        axis.set_xlabel(xlabel)
        axis.set_ylabel('Frequency')
        axis.grid(True, alpha=0.3)

    plt.tight_layout()

    if save_plots:
        safe_name = prop_name.replace(" ", "_")
        plt.savefig(f'robust_zscore_{safe_name}.png', dpi=300, bbox_inches='tight')

    plt.show()

# Usage hint: the histograms are not drawn automatically, so print example
# calls the user can run by hand.
print(
    "\n히스토그램을 확인하려면 다음 함수들을 실행하세요:",
    "plot_before_after_histograms('Molecular Weight')",
    "plot_before_after_histograms('LogP')",
    "plot_before_after_histograms('sp3')",
    sep="\n",
)

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from xai_sdk import Client
from xai_sdk.chat import user
import pandas as pd
import random

# Initialize the xAI client. The key is read from the XAI_API_KEY environment
# variable when it is set, so the secret does not have to be written into the
# notebook; otherwise the placeholder below must be replaced with a real key.
client = Client(api_key=os.environ.get("XAI_API_KEY", "Your API Key"))

# Build the few-shot prompt for one experiment from freshly sampled examples.
def create_prompt(smiles, example_data):
    """Return the few-shot prompt text for predicting a property of smiles.

    Args:
        smiles: The query molecule, inserted into the prompt as-is.
        example_data: Iterable of ``(smiles, value)`` pairs; each becomes one
            ``"<smiles>, <value rounded to 8 decimals>"`` line.

    Returns:
        The prompt string (the "no hint" wording is the active variant).
    """
    example_lines = (f"{mol}, {round(value, 8)}" for mol, value in example_data)
    example_str = "\n".join(example_lines)

    # Active variant: no hint about what the property is.
    intro = (
        "You are an experienced chemist with expertise in molecular structures. "
        "Using only your knowledge and without employing any external tools or code, "
        "predict the property for the following molecules. "
        "Below are examples of molecules and known property value:"
    )
    question = "Now, based on these examples, predict the property for the following molecule:"
    closing = "Please provide the predicted specific property value!"
    prompt = f"{intro}\n\n{example_str}\n\n{question}\n\n{smiles}\n\n{closing}"

    # Alternative wordings, kept for reference (assign one to `prompt` to use it).
    # Label hint
    #prompt = f"""You are an experienced chemist with expertise in molecular structures. Using only your knowledge and without employing any external tools or code, predict the molecular weight for the following molecules. Below are examples of molecules and known molecular weights:\n\n{example_str}\n\nNow, based on these examples, predict the molecular weight for the following molecule:\n\n{smiles}\n\nPlease provide the predicted specific molecular weight value!"""
    # All hint
    #prompt = f"""You are an experienced chemist with expertise in molecular structures. Using only your knowledge and without employing any external tools or code, predict the molecular weight for the following molecules. Below are examples of molecules in SMILES format and their known molecular weights:\n\n{example_str}\n\nNow, based on these examples, predict the molecular weight for the following molecule given in SMILES format:\n\n{smiles}\n\nPlease provide the predicted specific molecular weight value!"""
    # SMILES hint
    #prompt = f"""You are an experienced chemist with expertise in molecular structures. Using only your knowledge and without employing any external tools or code, predict the property for the following molecules. Below are examples of molecules in SMILES format and known property value:\n\n{example_str}\n\nNow, based on these examples, predict the property for the following molecule given in SMILES format:\n\n{smiles}\n\nPlease provide the predicted specific property value!"""
    # SMILES & f(M.W.) hint
    #prompt = f"""You are an experienced chemist with expertise in molecular structures. Using only your knowledge and without employing any external tools or code, predict the property for the following molecules. Below are examples of molecules in SMILES format and known property values. Note that the property is a function of molecular weight, f(M.W.):\n\n{example_str}\n\nNow, based on these examples, predict the property for the following molecule given in SMILES format:\n\n{smiles}\n\nPlease provide the predicted specific property value!"""
    # SMILES & a*M.W.+b hint
    #prompt = f"""You are an experienced chemist with expertise in molecular structures. Using only your knowledge and without employing any external tools or code, predict the property for the following molecules. Below are examples of molecules in SMILES format and known property values. Note that the property is represented as a linear function of molecular weight, specifically a * M.W. + b:\n\n{example_str}\n\nNow, based on these examples, predict the property for the following molecule given in SMILES format:\n\n{smiles}\n\nPlease provide the predicted specific property value!"""
    return prompt

# Preview the prompts that an experiment would send, without any API call.
def run_prompt_preview(esol_df, property):
    """Print the prompt generated for each seed from 1 to 100.

    For every seed, one row is sampled as the query molecule and 50 of the
    remaining rows are sampled as few-shot examples, both with
    ``random_state=seed``.

    Args:
        esol_df: DataFrame with a 'smiles' column and the target column.
        property: Name of the column whose values are shown in the examples.
    """
    # Print interval; 1 means the prompt of every seed is printed.
    print_every = 1

    for seed in range(1, 101):
        held_out = esol_df.sample(n=1, random_state=seed)
        remaining = esol_df.drop(held_out.index)
        shots = remaining.sample(n=50, random_state=seed, replace=False)

        query_smiles = held_out['smiles'].values[0]
        example_data = shots[['smiles', property]].values.tolist()

        # Debug aid: report the first example that is not a (smiles, value) pair.
        malformed = next((row for row in example_data if len(row) != 2), None)
        if malformed is not None:
            print(f"Error in example_data formatting at seed {seed}: {malformed}")

        if seed % print_every == 0:
            print(f"\n--- Prompt for iteration {seed} ---\n{create_prompt(query_smiles, example_data)}\n")

# Query Grok through the official xAI SDK for a single prediction.
def grok_predict(smiles, example_data):
    """Send the few-shot prompt for smiles to Grok and return the reply text.

    Args:
        smiles: Query molecule passed on to ``create_prompt``.
        example_data: Few-shot examples passed on to ``create_prompt``.

    Returns:
        The stripped response content, or the string ``"Error: <exception>"``
        if anything inside the API interaction raises.
    """
    prompt = create_prompt(smiles, example_data)

    try:
        # Sampling settings, per the original notes chosen to match the
        # GPT/Claude runs. max_tokens and seed are deliberately not passed.
        sampling = {
            "temperature": 0.0,
            "top_p": 1.0,
            "frequency_penalty": 0.0,
            "presence_penalty": 0.0,
        }
        # Other model names such as "grok-4" or "grok-3-mini" can be used here.
        chat = client.chat.create(
            model="grok-3",
            messages=[user(prompt)],
            **sampling,
        )

        # sample() takes no arguments and yields the response object.
        reply = chat.sample()

        print('it is okay')
        answer = reply.content.strip()
    except Exception as err:
        print(f"Error in Grok API call: {err}")
        return f"Error: {err}"
    return answer

# Run the experiment and append every result to a text file as it is produced.
def run_grok_experiment_to_txt(esol_df, filename, property, seeds=(26, 34)):
    """Run one Grok prediction per seed and append the results to filename.

    For every seed, one row is sampled as the test molecule and 50 of the
    remaining rows as few-shot examples (both with ``random_state=seed``).
    The prediction returned by ``grok_predict`` is appended to the file
    together with the seed, the SMILES string and the true value.

    Args:
        esol_df: DataFrame with a 'smiles' column and the target column.
        filename: Output text file, opened in append mode. Its parent
            directory is created if the path has one and it does not exist.
        property: Name of the target column.
        seeds: Iterable of integer seeds to run. Defaults to ``(26, 34)``,
            the seeds that were previously hard-coded.
    """
    # os.makedirs("") raises, so only create a directory when the path has one.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(filename, 'a', encoding='utf-8') as f:
        for seed in seeds:
            # Held-out test molecule and few-shot examples for this seed.
            test_set = esol_df.sample(n=1, random_state=seed)
            train_set = esol_df.drop(test_set.index).sample(n=50, random_state=seed, replace=False)

            smiles_test = test_set['smiles'].values[0]
            true_value = test_set[property].values[0]
            example_data = train_set[['smiles', property]].values.tolist()

            predicted_property = grok_predict(smiles_test, example_data)

            record = (
                f"Iteration: {seed}\n"
                f"SMILES: {smiles_test}\n"
                f"True Property: {true_value}\n"
                "Predicted Property:\n"
                f"{predicted_property}\n"
                + "=" * 50 + "\n"  # separator line between records
            )
            f.write(record)
            # Push the record to disk now so finished iterations survive an
            # interruption of a later one.
            f.flush()

    print(f"Data successfully saved to {filename}")

# 실행
# 1. 프롬프트 미리보기 (API 호출 없이)
# run_prompt_preview(esol, 'LogP')

# 2. 실제 실험 수행 (프롬프트 확인 후 실행)
# task = 'sq'
# run_grok_experiment_to_txt(esol, filename=f"../Grok_Responses/grok_{task}mw.txt", property=f'Molecular Weight_{task}')
# task = 'lg'
# run_grok_experiment_to_txt(esol, filename=f"../Grok_Responses/grok_{task}mw.txt", property=f'Molecular Weight_{task}')
# task = 'sn'
# run_grok_experiment_to_txt(esol, filename=f"../Grok_Responses/grok_{task}mw.txt", property=f'Molecular Weight_{task}')
# task = 'ex'
# run_grok_experiment_to_txt(esol, filename=f"../Grok_Responses/grok_{task}mw.txt", property=f'Molecular Weight_{task}')

# Before use:
# 1. Get an API key from the xAI Console: https://console.x.ai
# 2. Install the xAI SDK: pip install xai-sdk
# 3. Replace the "Your API Key" placeholder passed to Client(...) with your real xAI API key
# 4. Alternatively, set the XAI_API_KEY environment variable and call Client() with no arguments
# 5. Make sure the esol DataFrame is defined
#
# 사용 가능한 모델들:
# - "grok-3" (권장) - 최신 Grok 3 모델
# - "grok-3-mini" - 더 빠르고 저렴한 버전
# - "grok-4" - 가장 강력한 모델 (더 비쌈)
# - "grok-beta" - 베타 모델
#
# 환경변수 사용 시:
# export XAI_API_KEY="your_api_key_here"
# client = Client()  # API 키 자동 인식

In [None]:
# Target variants to run for every property; 'raw' is the untransformed column,
# the others are the derived columns named "<property>_<task>".
tasks = ['raw', 'robust_zscore', 'lt', 'sq', 'lg', 'sn', 'ex']
# Property columns and the short tag used for each one in the output file name.
prop_name = ['Molecular Weight', 'LogP', 'TPSA', 'sp3', 'MolMR', 'BJ', 'Chi', 'HKA']
file_name = ['mw', 'logp', 'tpsa', 'sp3', 'mr', 'bj', 'chi', 'hka']
for column, tag in zip(prop_name, file_name):
    for task in tasks:
        target_column = column if task == 'raw' else f'{column}_{task}'
        run_grok_experiment_to_txt(
            esol,
            filename=f"../Grok3_Responses/grok_{task}{tag}.txt",
            property=target_column,
        )
print("All tasks completed successfully!")