In [None]:
import requests
import pandas as pd
# Endpoint and query for the archive's "cumulative" exoplanet table
url = "https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI"
params = {
    "table": "cumulative",  # Use the correct table name
    "select": "*",          # Select all columns
    "format": "csv"         # Retrieve the data in CSV format
}
# Request the data; the timeout keeps the cell from hanging on a stalled connection
response = requests.get(url, params=params, timeout=60)
# Fail loudly on an HTTP error instead of saving the error body as if it were CSV
response.raise_for_status()
# Save the raw bytes to disk so the download can be re-read without another request
with open("exoplanets_cumulative.csv", "wb") as file:
    file.write(response.content)
# Load the CSV data into a DataFrame for further processing
df = pd.read_csv("exoplanets_cumulative.csv")
# Display the first few rows of the data
print(df.head())

      kepid kepoi_name   kepler_name         ra  ra_err        ra_str  \
0  10797460  K00752.01  Kepler-227 b  291.93423     0.0  19h27m44.22s   
1  10797460  K00752.02  Kepler-227 c  291.93423     0.0  19h27m44.22s   
2  10811496  K00753.01           NaN  297.00482     0.0  19h48m01.16s   
3  10848459  K00754.01           NaN  285.53461     0.0  19h02m08.31s   
4  10854555  K00755.01  Kepler-664 b  288.75488     0.0  19h15m01.17s   

         dec  dec_err       dec_str  koi_gmag  ...  koi_fpflag_co  \
0  48.141651      0.0  +48d08m29.9s    15.890  ...              0   
1  48.141651      0.0  +48d08m29.9s    15.890  ...              0   
2  48.134129      0.0  +48d08m02.9s    15.943  ...              0   
3  48.285210      0.0  +48d17m06.8s    16.100  ...              0   
4  48.226200      0.0  +48d13m34.3s    16.015  ...              0   

   koi_fpflag_ec  koi_insol  koi_insol_err1  koi_insol_err2  koi_srho  \
0              0      93.59           29.45          -16.65   3.20796   


In [None]:
# Define a function to convert each row into a textual description
def row_to_text(row):
    """Render one catalogue row as a short English description.

    Args:
        row: Mapping-like row (for example a pandas Series) providing the keys
            ``kepler_name``, ``kepoi_name``, ``ra_str``, ``dec_str``,
            ``koi_gmag``, ``koi_insol``, ``koi_insol_err1``,
            ``koi_insol_err2``, ``koi_srho``, ``koi_srho_err1`` and
            ``koi_srho_err2``.

    Returns:
        str: Two sentences describing the position, magnitude, insolation
        flux and stellar density stored in the row.
    """
    # Rows without a Kepler name hold NaN in that column; fall back to the KOI
    # identifier so the text never reads "Exoplanet nan".
    kepler_name = row['kepler_name']
    name = kepler_name if pd.notna(kepler_name) else row['kepoi_name']

    # The lower-error columns can already be negative (koi_insol_err2 is
    # -16.65 in the data preview), so print magnitudes after the literal "-"
    # to avoid a doubled sign such as "--16.65".
    insol_lower = abs(row['koi_insol_err2'])
    srho_lower = abs(row['koi_srho_err2'])

    return f"Exoplanet {name} (KOI: {row['kepoi_name']}) is located at RA: {row['ra_str']} and Dec: {row['dec_str']}. " \
           f"It has a gmag of {row['koi_gmag']}, an insolation flux of {row['koi_insol']} (error: +{row['koi_insol_err1']}, -{insol_lower}), " \
           f"and a stellar density of {row['koi_srho']} (error: +{row['koi_srho_err1']}, -{srho_lower})."
# Build one description per row: row_to_text receives each row in turn
df['text_description'] = df.apply(row_to_text, axis='columns')
# Show the leading entries of the new column
print(df['text_description'].head(5))

0    Exoplanet Kepler-227 b (KOI: K00752.01) is loc...
1    Exoplanet Kepler-227 c (KOI: K00752.02) is loc...
2    Exoplanet nan (KOI: K00753.01) is located at R...
3    Exoplanet nan (KOI: K00754.01) is located at R...
4    Exoplanet Kepler-664 b (KOI: K00755.01) is loc...
Name: text_description, dtype: object


In [None]:
from datasets import Dataset
# Wrap the descriptions in a single-column ("text") Hugging Face dataset
train_data = dict(text=df['text_description'].tolist())
train_dataset = Dataset.from_dict(train_data)
# Print the dataset summary (features and row count)
print(train_dataset)

Dataset({
    features: ['text'],
    num_rows: 9564
})


In [None]:
# 1. Prompt template shared by the generation helper below
def format_prompt(question):
    """Wrap ``question`` in the "Question: ...\\nAnswer:" template.

    Args:
        question: Text to place after "Question: ".

    Returns:
        str: The question line followed by a newline and the "Answer:" cue.
    """
    prompt = f"Question: {question}\nAnswer:"
    return prompt

# 2. Improved generation function with adjusted parameters
def generate_exoplanet_text(prompt, model, tokenizer, max_length=200):
    """Generate the model's continuation for a question-style prompt.

    The prompt is wrapped by ``format_prompt`` before tokenization and only the
    text generated after the prompt is returned.

    NOTE(review): ``torch`` is imported in a later cell of this notebook, so
    this function only works once that cell has been executed.

    Args:
        prompt: Question text; it is passed through ``format_prompt``.
        model: Causal language model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer matching ``model``; its EOS id is used for padding.
        max_length: Value forwarded to ``generate`` as ``max_length``.

    Returns:
        str: The decoded continuation with surrounding whitespace stripped.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # Set the model to evaluation mode

    formatted_prompt = format_prompt(prompt)
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
    # Remember how many tokens belong to the prompt so they can be cut off below
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.9,        # Slightly increased for more variety
        top_p=0.95,            # Slightly increased
        top_k=50,              # Added top_k parameter
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=2,  # Prevent repetition of 2-grams
        num_beams=5             # Beams combined with sampling (do_sample=True)
    )

    # Decode only the tokens that follow the prompt. Stripping the prompt with
    # str.replace breaks when the decoded text does not reproduce the prompt
    # character-for-character, and it would also delete any later repetition.
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return response

# 3. Prompts used for the smoke test
test_prompts = [
    "Tell me about Kepler-22b",
    "What are the characteristics of hot Jupiters?",
    "Describe the most Earth-like exoplanet",
    "List some interesting facts about exoplanets",
    "What is the largest known exoplanet?"
]

# 4. Run every prompt through the model; a failure is reported and the loop continues
print("Testing the model with various prompts:\n")
for prompt in test_prompts:
    print(f"Prompt: {prompt}")
    try:
        response = generate_exoplanet_text(prompt, model, tokenizer)
        print(f"Generated Response: {response}\n" if response else "No response generated.\n")
    except Exception as e:
        print(f"Error generating response: {str(e)}\n")

# 5. Optional: show a training sample, then generate from a fixed prompt
# NOTE(review): sample_data is only printed; it is not included in sample_prompt.
print("Testing with a sample from our training data:")
sample_data = train_dataset['text'][0]
print(f"Sample data: {sample_data[:100]}...")  # First 100 characters only
sample_prompt = "Describe this exoplanet:"
response = generate_exoplanet_text(sample_prompt, model, tokenizer)
print(f"Generated Response: {response}\n")

Testing the model with various prompts:

Prompt: Tell me about Kepler-22b
Generated Response: It is located at RA: 19h21m54.07s and Dec: +38d36m59.0s. It has a gmag of 15.723, an insolation flux of 5.81 (error: -2.73, --1.66), and a stellar density of 0.96779 (0.08622, -0m11.83895).

Prompt: What are the characteristics of hot Jupiters?
Generated Response: 1.064 (error: +1.01, --0.42), --1 year.056

Prompt: Describe the most Earth-like exoplanet
Generated Response: 0.038 (error: +0.03, --1.01),
0.03901, and

Prompt: List some interesting facts about exoplanets
Generated Response: 1.04 (error: +1.03, --0.45),

Prompt: What is the largest known exoplanet?
Generated Response: It is located at RA: 19h26m59.87s and Dec: +38d46m26.3s. It has a gmag of 15.821, an insolation flux of 0.73 (error: -0.45, --0%), and a stellar density of 1.96484 (0e-05, -1.09895).

Testing with a sample from our training data:
Sample data: Exoplanet Kepler-227 b (KOI: K00752.01) is located at RA: 19h27m44.22s and 

In [None]:
# Import required libraries
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch
from datasets import Dataset

# 1. Set up tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token


def _mask_padding_labels(input_ids, attention_mask):
    """Copy ``input_ids`` as labels, replacing all but the first padded position with -100."""
    labels = []
    eos_kept = False
    for token_id, is_real in zip(input_ids, attention_mask):
        if is_real or not eos_kept:
            labels.append(token_id)
            # The first padded position is kept as a target; later ones are masked
            eos_kept = eos_kept or not is_real
        else:
            labels.append(-100)
    return labels


# 2. Modified tokenization function to include labels
def tokenize_function(examples):
    """Tokenize texts to a fixed length of 512 and attach language-modeling labels.

    Labels mirror ``input_ids`` except on padded positions, which are set to
    -100 so they are excluded from the loss. Because the pad token is the EOS
    token here, the first padded position is kept as a target so that the end
    of each text is still learned.

    Args:
        examples: Mapping with a "text" entry holding either a list of strings
            (as passed by ``Dataset.map(batched=True)``) or a single string.

    Returns:
        The tokenizer output with an added "labels" entry.
    """
    tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_special_tokens_mask=True
    )
    input_ids = tokenized_inputs["input_ids"]
    attention_mask = tokenized_inputs["attention_mask"]
    if isinstance(examples["text"], str):
        # Single example: the tokenizer returns flat lists
        tokenized_inputs["labels"] = _mask_padding_labels(input_ids, attention_mask)
    else:
        tokenized_inputs["labels"] = [
            _mask_padding_labels(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    return tokenized_inputs

# 3. Tokenize the dataset; the raw text column is dropped from the result
tokenized_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

# 4. Load the model and size its embeddings to the tokenizer's vocabulary
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# 5. Set up training arguments
training_args = TrainingArguments(
    output_dir="./exoplanet-gpt2-improved",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    save_steps=500,
    save_total_limit=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # Mixed precision needs a CUDA device; use full precision when none is
    # available so the cell also runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
    warmup_steps=500
)

# 6. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Train the model
trainer.train()

# Save the fine-tuned model together with its tokenizer
model_path = "./exoplanet-gpt2-final-improved"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


# 9. Function to generate text
def generate_exoplanet_text(prompt, model, tokenizer, max_length=200):
    """Sample a continuation of ``prompt`` and return the full decoded sequence.

    NOTE(review): a function with the same name is defined in an earlier cell;
    running this cell replaces it. Unlike that version, this one tokenizes the
    prompt as given (no ``format_prompt``) and decodes ``outputs[0]`` in full.

    Args:
        prompt: Text to continue.
        model: Causal language model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer matching ``model``; its EOS id is used for padding.
        max_length: Value forwarded to ``generate`` as ``max_length``.

    Returns:
        str: The decoded first output sequence with special tokens removed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # The model may still be in training mode after fine-tuning; disable
    # dropout before generating.
    model.eval()

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        # Pass the pad id explicitly, as the other generation helpers in this notebook do
        pad_token_id=tokenizer.eos_token_id
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)




Map:   0%|          | 0/9564 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
100,4.8255
200,0.3887
300,0.2516
400,0.2424
500,0.241
600,0.2352
700,0.2342
800,0.2337
900,0.2315


In [None]:
def test_model(model, tokenizer):
    test_questions = [
        "Tell me about Kepler-22b",
        "What are the characteristics of hot Jupiters?",
        "Describe the most Earth-like exoplanet",
        "How are exoplanets discovered?",
        "What makes an exoplanet habitable?"
    ]

    print("Testing the improved model:\n")
    for question in test_questions:
        print(f"Question: {question}")

        inputs = tokenizer(f"Question: {question}\nAnswer:", return_tensors="pt")
        outputs = model.generate(
            **inputs,
            max_length=200,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.95,
            top_k=50,
            no_repeat_ngram_size=2,
            num_beams=5
        )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = response.split("Answer:")[-1].strip()
        print(f"Answer: {response}\n")

# Test the model (uses the `model` and `tokenizer` globals created in the training cell)
test_model(model, tokenizer)

In [None]:
import random
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

def load_model_and_tokenizer(model_path):
    """Load a GPT-2 model and tokenizer from ``model_path``.

    The tokenizer's pad token is set to its EOS token. Any exception raised
    while loading is printed rather than propagated.

    Returns:
        tuple: ``(model, tokenizer)`` on success, ``(None, None)`` on failure.
    """
    try:
        lm = GPT2LMHeadModel.from_pretrained(model_path)
        tok = GPT2Tokenizer.from_pretrained(model_path)
        tok.pad_token = tok.eos_token
    except Exception as exc:
        print(f"Error loading model: {str(exc)}")
        return None, None
    return lm, tok

def generate_response(prompt, model, tokenizer, max_length=200):
    """Generate the model's answer to ``prompt``.

    The prompt is wrapped as "Question: ...\\nAnswer:" and only the text
    generated after it is returned.

    Args:
        prompt: Question text.
        model: Causal language model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer matching ``model``; its EOS id is used for padding.
        max_length: Value forwarded to ``generate`` as ``max_length``.

    Returns:
        str: The stripped answer text, or a string starting with
        "Error generating response: " if any exception was raised.
    """
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        model.eval()

        formatted_prompt = f"Question: {prompt}\nAnswer:"
        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
        # Remember how many tokens belong to the prompt so they can be cut off below
        prompt_length = inputs["input_ids"].shape[1]

        # NOTE(review): do_sample is not set, so temperature/top_p/top_k are
        # presumably ignored here — confirm against the generate() docs.
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.95,
            top_k=50,
            no_repeat_ngram_size=2,
            num_beams=5,
            pad_token_id=tokenizer.eos_token_id
        )

        # Decode only the tokens after the prompt. Splitting the full text on
        # "Answer:" would discard part of the answer whenever the continuation
        # itself contains that marker.
        answer_tokens = outputs[0][prompt_length:]
        answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
        return answer
    except Exception as e:
        # Best effort: report the failure as the response text so a test run continues
        return f"Error generating response: {str(e)}"

def test_specific_examples(model, tokenizer, df):
    """Prompt the model about three randomly drawn rows of ``df`` and print the replies.

    Each row is addressed by its ``kepler_name``, or by its ``kepoi_name`` when
    the Kepler name is missing.
    """
    print("\n=== Testing Specific Examples ===")

    # Three independent draws; the same row may be picked more than once
    for _ in range(3):
        position = random.randint(0, len(df)-1)
        picked = df.iloc[position]
        kepler_name = picked['kepler_name']
        if pd.notna(kepler_name):
            name = kepler_name
        else:
            name = picked['kepoi_name']
        prompt = f"Tell me about the exoplanet {name}"

        print(f"\nPrompt: {prompt}")
        print(f"Response: {generate_response(prompt, model, tokenizer)}")

def test_general_questions(model, tokenizer):
    """Print the model's reply to each of a fixed set of general exoplanet questions."""
    print("\n=== Testing General Questions ===")

    general_questions = (
        "What are hot Jupiters?",
        "How are exoplanets discovered?",
        "What makes an exoplanet potentially habitable?",
        "What is the Kepler Space Telescope?",
        "Why are exoplanets important to study?",
    )

    for question in general_questions:
        print(f"\nPrompt: {question}")
        print(f"Response: {generate_response(question, model, tokenizer)}")

def test_complex_queries(model, tokenizer):
    """Print the model's reply to each of a fixed set of comparative or multi-part questions."""
    print("\n=== Testing Complex Queries ===")

    complex_questions = (
        "Compare Earth-like exoplanets to hot Jupiters",
        "What are the challenges in detecting small rocky exoplanets?",
        "How do scientists determine the composition of exoplanets?",
        "What role does stellar type play in exoplanet detection?",
        "Describe the different methods used to find exoplanets",
    )

    for question in complex_questions:
        print(f"\nPrompt: {question}")
        print(f"Response: {generate_response(question, model, tokenizer)}")

def main_test_suite(model_path="./exoplanet-gpt2-final"):
    """Load a saved model and run all three test groups against it.

    Args:
        model_path: Directory containing the saved model and tokenizer. The
            default is the path this function previously hard-coded.
            NOTE(review): the training cell in this notebook saves to
            "./exoplanet-gpt2-final-improved" — confirm which checkpoint is
            meant and pass it explicitly if it differs.

    Returns:
        None. Results are printed; the function returns early if loading fails.
    """
    model, tokenizer = load_model_and_tokenizer(model_path)

    if model is None or tokenizer is None:
        print("Failed to load model or tokenizer. Exiting.")
        return

    print("=== Starting Comprehensive Test Suite ===")

    # Test specific examples from the dataset (reads the notebook-level `df`)
    test_specific_examples(model, tokenizer, df)

    # Test general questions
    test_general_questions(model, tokenizer)

    # Test complex queries
    test_complex_queries(model, tokenizer)

# Run the test suite
if __name__ == "__main__":
    main_test_suite()

Map:   0%|          | 0/9567 [00:00<?, ? examples/s]

Sample training examples:

Example 1:
Question: What is Kepler-227 b?
Answer: Kepler-227 b is an exoplanet discovered by the Kepler space telescope. It is located at coordinates RA 19h27m44.22s and Dec +48d08m29.9s in the sky. This exoplanet likely has a hot, with an insolation flux of 93.59 relative to Earth. The star it orbits has a density 3.21 times that of the Sun.

Example 2:
Question: What is Kepler-227 c?
Answer: Kepler-227 c is an exoplanet discovered by the Kepler space telescope. It is located at coordinates RA 19h27m44.22s and Dec +48d08m29.9s in the sky. This exoplanet likely has a moderate temperature, with an insolation flux of 9.11 relative to Earth. The star it orbits has a density 3.02 times that of the Sun.

Example 3:
Question: What are some interesting exoplanets?
Answer: K00753.01 is an exoplanet discovered by the Kepler space telescope. It is located at coordinates RA 19h48m01.16s and Dec +48d08m02.9s in the sky. This exoplanet likely has a hot, with an insolatio