# Titanic Survival Prediction using FEDOT and LLM

This notebook demonstrates the process of analyzing the Titanic dataset and predicting passenger survival using the FEDOT framework enhanced with Large Language Models (LLMs).


## Setup

In [None]:
import os
import sys
import re

import numpy as np
import pandas as pd

from pprint import pprint

# Put the parent directory on the import path (once) so the `fedot_llm`
# imports below can be resolved when the notebook runs from this folder.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from fedot_llm.data.zip import unzip_archive
from fedot_llm.fedot_util import run_example
from fedot_llm.language_models.actions import ModelAction
from fedot_llm.language_models.llms import OllamaLLM
from fedot_llm.data.data import Dataset
import fedot_llm.language_models.prompts

## Data Loading and Exploration

In this section, we load the Titanic dataset and perform initial exploration.

In [None]:
# Load the dataset from ../datasets/<dataset_name> and keep its two
# descriptions under their own names before printing them.
dataset_name = 'titanic'
dataset_path = os.path.join('..', 'datasets', dataset_name)
dataset = Dataset.load_from_path(dataset_path)

dataset_description = dataset.description
dataset_metadata_description = dataset.metadata_description

# One call, newline-separated, with a blank line between the two descriptions.
print(dataset_description, '', dataset_metadata_description, sep='\n')

In [None]:
import json

# Long-form descriptions, looked up by dataset name further down and passed
# to the LLM as the system prompt.
# The encoding is set explicitly: without it `open` falls back to the platform
# locale, which can garble or reject non-ASCII text in the JSON file.
with open('../datasets/big_descriptions.json', 'r', encoding='utf-8') as json_file:
    dataset_big_descriptions = json.load(json_file)
dataset_big_descriptions

In [None]:
# Pick the split whose name matches the dataset's training split name
# (raises IndexError if no split matches) and summarise its frame.
train = [split for split in dataset.splits if split.name == dataset.train_split_name][0]
train.data.info()

In [None]:
# Pick the split named 'test_merged' (raises IndexError if it is absent)
# and summarise its frame.
test = [split for split in dataset.splits if split.name == 'test_merged'][0]
test.data.info()

## Dataset Analysis using LLM

Here we use an LLM to analyze and describe various aspects of the dataset.

In [None]:
# LLM wrapper used for every prompt in this notebook; it is handed to
# ModelAction in the next cell. 'llama3' is the model identifier passed to Ollama.
model = OllamaLLM(model='llama3')

In [None]:
from fedot_llm.language_models import prompts

action = ModelAction(model=model)

# Step 1: dataset name and identification of the training split.
# Both prompts use the long-form dataset description as the system message.
system_prompt = dataset_big_descriptions[dataset_name]
task_prompts = dict(
    dataset_name=dict(
        system=system_prompt,
        task=prompts.dataset_name_prompt,
        context=None,
    ),
    train_split=dict(
        system=system_prompt,
        task=prompts.train_split_definition_prompt,
        context=dataset.description,
    ),
)

responses = action.run_model_multicall(task_prompts)

# Keep only the part of the split answer before the first dot
# (e.g. a file name without its extension).
operations = {"train_split": lambda answer: answer.partition(".")[0]}
responses = action.process_model_responses(responses, operations)
pprint(responses)

# Store the model's answers on the dataset object.
dataset.name = responses["dataset_name"]
dataset.train_split_name = responses["train_split"]

### Dataset Description and Goal

In [None]:
# Ask the model for a dataset description and for the dataset goal.
#
# The context is `dataset_description`, the description captured right after
# loading, rather than `dataset.description`. This cell overwrites
# `dataset.description` at the bottom, so reading it here would make a re-run
# feed the model's previous summary back in as its own context.
task_prompts = {
    "dataset_description": {
        "system": dataset_big_descriptions[dataset_name],
        "task": prompts.dataset_description_prompt,
        "context": dataset_description,
    },
    "dataset_goal": {
        "system": dataset_big_descriptions[dataset_name],
        "task": prompts.dataset_goal_prompt,
        "context": dataset_description,
    },
}

responses = action.run_model_multicall(
    task_prompts
)
pprint(responses)

# Store the model's answers on the dataset object for the following cells.
dataset.description = responses["dataset_description"]
dataset.goal = responses["dataset_goal"]

### Column Descriptions

In [None]:
# Generate a description for each column of the training split, attach the
# result to the split, and show it as the cell output.
column_descriptions = action.generate_all_column_description(
    dataset=dataset,
    split=train,
)
train.set_column_descriptions(column_descriptions)
column_descriptions

### Target Column and Task Type Identification

In [None]:
# Ask the model which column is the target and what kind of task this is.
# The system prompt is the description currently stored on the dataset.
task_prompts = {
    "target_column": {
        "system": dataset.description,
        "task": prompts.target_definition_prompt,
        "context": None,
    },
    "task_type": {
        "system": dataset.description,
        "task": prompts.task_definition_prompt,
        "context": None,
    }
}

responses = action.run_model_multicall(
    task_prompts
)

# Straight and curly quotes, backticks and acute accents are removed from answers.
pattern = r'[\'\"“”‘’`´]'


def _clean_answer(answer):
    """Remove quote-like characters and surrounding whitespace from a reply.

    The whitespace strip matters because these values are used verbatim as
    the target column name and problem type when FEDOT is run; a stray
    space or trailing newline would no longer match the real column name.
    """
    return re.sub(pattern, '', answer).strip()


operations = {
    "target_column": _clean_answer,
    # The task type is additionally lower-cased before cleaning.
    "task_type": lambda x: _clean_answer(x.lower()),
}
responses = action.process_model_responses(responses, operations)
pprint(responses)

### Categorical Columns Identification

In [None]:
# Ask for the categorical features of the training split and print them.
categorical_columns = action.get_categorical_features(
    dataset=dataset,
    split=train,
)
pprint(categorical_columns)

## FEDOT Framework Execution
In this section, we prepare the data and run the FEDOT framework to generate predictions.

In [None]:
# Run FEDOT on the train/test frames, using the task type and target column
# taken from the processed LLM responses.
prediction = run_example(
    train_df=train.data,
    test_df=test.data,
    problem=responses['task_type'],
    target=responses['target_column'],
)

## Results

Here we display and analyze the prediction results.

In [None]:
# Show the first five entries of the prediction as the cell output.
prediction[:5]