In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load dataset
titanic = pd.read_csv('train.csv')

# Fill missing values
titanic['Age'].fillna(titanic.groupby(['Pclass', 'Sex'])['Age'].transform('median'), inplace=True)
titanic['Embarked'].fillna(titanic['Embarked'].mode()[0], inplace=True)

# Feature engineering
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
titanic['IsAlone'] = (titanic['FamilySize'] == 1).astype(int)

# Extract titles from names
titanic['Title'] = titanic['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Encode categorical variables
titanic = pd.get_dummies(titanic, columns=['Sex', 'Embarked', 'Pclass', 'Title'], drop_first=True)

# Scale numerical features
scaler = StandardScaler()
titanic['Fare'] = scaler.fit_transform(titanic[['Fare']])
titanic['Age'] = scaler.fit_transform(titanic[['Age']])

# Drop irrelevant features
titanic.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# Separate features and target variable
X = titanic.drop('Survived', axis=1)
y = titanic['Survived']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Combine features and target into single dataframes
train_data = X_train.copy()
train_data['Survived'] = y_train

test_data = X_test.copy()
test_data['Survived'] = y_test

# Function to safely access columns
def safe_get(row, column, default=0):
    return row[column] if column in row else default

# Create descriptive sentences for training set
def create_train_sentence(row):
    sentence = f"A {'male' if safe_get(row, 'Sex_male') == 1 else 'female'} passenger, aged {row['Age']:.2f}, "
    sentence += f"of {'higher' if safe_get(row, 'Pclass_2') == 1 else 'middle' if safe_get(row, 'Pclass_3') == 1 else 'lower'} class, "
    sentence += f"with a fare of {row['Fare']:.2f}. "
    sentence += f"Embarked from {'Cherbourg' if safe_get(row, 'Embarked_C') == 1 else 'Queenstown' if safe_get(row, 'Embarked_Q') == 1 else 'Southampton'}. "
    sentence += f"Has a family size of {row['FamilySize']} and {'is alone' if row['IsAlone'] == 1 else 'is not alone'}. "
    titles = ['Mr', 'Miss', 'Mrs', 'Master', 'Other']
    for title in titles:
        if f'Title_{title}' in row and row[f'Title_{title}'] == 1:
            sentence += f"Holds the title of {title}. "
            break
    sentence += f"Survived: {'yes' if row['Survived'] == 1 else 'no'}."
    return sentence

# Create descriptive sentences for test set
def create_test_sentence(row):
    sentence = f"A {'male' if safe_get(row, 'Sex_male') == 1 else 'female'} passenger, aged {row['Age']:.2f}, "
    sentence += f"of {'higher' if safe_get(row, 'Pclass_2') == 1 else 'middle' if safe_get(row, 'Pclass_3') == 1 else 'lower'} class, "
    sentence += f"with a fare of {row['Fare']:.2f}. "
    sentence += f"Embarked from {'Cherbourg' if safe_get(row, 'Embarked_C') == 1 else 'Queenstown' if safe_get(row, 'Embarked_Q') == 1 else 'Southampton'}. "
    sentence += f"Has a family size of {row['FamilySize']} and {'is alone' if row['IsAlone'] == 1 else 'is not alone'}. "
    titles = ['Mr', 'Miss', 'Mrs', 'Master', 'Other']
    for title in titles:
        if f'Title_{title}' in row and row[f'Title_{title}'] == 1:
            sentence += f"Holds the title of {title}. "
            break
    sentence += "Will this passenger survive?"
    return sentence

# Apply to all rows
train_sentences = train_data.apply(create_train_sentence, axis=1)
test_sentences = test_data.apply(create_test_sentence, axis=1)

# Display a few examples
print("Training Examples:")
for i in range(5):
    print(train_sentences.iloc[i])

print("\nTest Examples:")
for i in range(5):
    print(test_sentences.iloc[i])


  titanic['Title'] = titanic['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


Training Examples:
A male passenger, aged -1.89, of lower class, with a fare of 1.00. Embarked from Southampton. Has a family size of 3 and is not alone. Holds the title of Master. Survived: yes.
A male passenger, aged -0.31, of middle class, with a fare of -0.49. Embarked from Southampton. Has a family size of 1 and is alone. Holds the title of Mr. Survived: no.
A female passenger, aged -2.11, of middle class, with a fare of -0.42. Embarked from Southampton. Has a family size of 3 and is not alone. Holds the title of Miss. Survived: yes.
A male passenger, aged 0.52, of higher class, with a fare of -0.09. Embarked from Southampton. Has a family size of 4 and is not alone. Holds the title of Mr. Survived: no.
A male passenger, aged 1.04, of higher class, with a fare of -0.12. Embarked from Southampton. Has a family size of 3 and is not alone. Holds the title of Mr. Survived: no.

Test Examples:
A male passenger, aged -0.31, of middle class, with a fare of -0.34. Embarked from Southampto

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['Age'].fillna(titanic.groupby(['Pclass', 'Sex'])['Age'].transform('median'), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['Embarked'].fillna(titanic['Embarked'].mode()[0], inplace=True)


In [2]:
# Create dictionaries for training and test sentences
train_dict = {i: sentence for i, sentence in enumerate(train_sentences)}
test_dict = {i: sentence for i, sentence in enumerate(test_sentences)}

# Display a few examples
print("Training Dictionary Examples:")
for i in range(5):
    print(f"{i}: {train_dict[i]}")

print("\nTest Dictionary Examples:")
for i in range(5):
    print(f"{i}: {test_dict[i]}")

Training Dictionary Examples:
0: A male passenger, aged -1.89, of lower class, with a fare of 1.00. Embarked from Southampton. Has a family size of 3 and is not alone. Holds the title of Master. Survived: yes.
1: A male passenger, aged -0.31, of middle class, with a fare of -0.49. Embarked from Southampton. Has a family size of 1 and is alone. Holds the title of Mr. Survived: no.
2: A female passenger, aged -2.11, of middle class, with a fare of -0.42. Embarked from Southampton. Has a family size of 3 and is not alone. Holds the title of Miss. Survived: yes.
3: A male passenger, aged 0.52, of higher class, with a fare of -0.09. Embarked from Southampton. Has a family size of 4 and is not alone. Holds the title of Mr. Survived: no.
4: A male passenger, aged 1.04, of higher class, with a fare of -0.12. Embarked from Southampton. Has a family size of 3 and is not alone. Holds the title of Mr. Survived: no.

Test Dictionary Examples:
0: A male passenger, aged -0.31, of middle class, with a

In [20]:
# Concatenate all test sentences into a single prompt
prompt = "\n\n"
for key, sentence in train_dict.items():
    prompt += f"{key}: {sentence}\n\n"

In [4]:
# Concatenate all test sentences into a single prompt
prompt2 = "\n\n"
for key, sentence in test_dict.items():
    prompt2 += f"{key}: {sentence}\n\n"

In [15]:
print(prompt)



0: A male passenger, aged 1.23, of lower class, with a fare of -0.07. Embarked from Southampton. Has a family size of 1 and is alone. Holds the title of Mr. Survived: no.

1: A male passenger, aged -0.46, of higher class, with a fare of -0.39. Embarked from Southampton. Has a family size of 1 and is alone. Holds the title of Mr. Survived: no.

2: A male passenger, aged 0.22, of middle class, with a fare of -0.49. Embarked from Southampton. Has a family size of 1 and is alone. Holds the title of Mr. Survived: no.

3: A male passenger, aged -0.23, of middle class, with a fare of -0.49. Embarked from Southampton. Has a family size of 2 and is not alone. Holds the title of Mr. Survived: no.

4: A female passenger, aged -1.74, of middle class, with a fare of -0.02. Embarked from Southampton. Has a family size of 7 and is not alone. Holds the title of Miss. Survived: no.

5: A male passenger, aged -0.38, of lower class, with a fare of 4.34. Embarked from Southampton. Has a family size of 2

In [27]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

analysis_string = """Generate the analysis based on instruction\ that
is delimited by triple backticks.\instruction: ```{instruction_analyis}```
"""

instruction_analysis = f"""You are functioning as an AI prediction analyst whose job is to learn patterns from the data provided of who survived an accident and who didnt.
1. You are given few examples of which passenger survived and who didnt here {prompt}
2. What are the key differences in terms of different charecteristics whether its ticket price, marital status etc to differentiate who survived and who perished
6. We are trying to frame the analysis in a way so that with this limited data you can make a reasonable guess if passenger survived or perished in the accident 
"""

analysis_template = ChatPromptTemplate.from_template(analysis_string)

chat = ChatOpenAI(temperature=0.0, model="gpt-4-turbo", openai_api_key='')
user_analysis = analysis_template.format_messages(instruction_analyis=instruction_analysis)
response = chat(user_analysis)
text=response.content
print(text)

  analysis_string = """Generate the analysis based on instruction\ that


### Analysis of Survival Patterns Based on Passenger Data

#### Overview
The provided data includes various attributes of passengers such as age, class, fare, embarkation point, family size, whether they were alone, title, and survival status. By analyzing these attributes, we can identify patterns and factors that may have influenced the likelihood of survival.

#### Key Observations and Patterns

1. **Gender and Title**:
   - Titles such as "Mrs." and "Miss" (typically female) show a higher survival rate compared to "Mr." (typically male). This suggests that women had a higher likelihood of survival.
   - Titles like "Master," which usually refer to young males, also show a relatively higher survival rate, indicating that children were more likely to survive.

2. **Class**:
   - Passengers from higher classes generally show a higher survival rate compared to those from lower classes. This could be due to better access to lifeboats and other safety measures for higher-class passengers

In [26]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

analysis_string = """Generate the analysis based on instruction\ that
is delimited by triple backticks.\instruction: ```{instruction_analyis}```
"""

instruction_analysis = f"""You are functioning as an AI prediction analyst whose job is to learn patterns from the data provided of who survived an accident and who didnt.
1. You are given detailed analysis of which passenger are likely to survive and which one not  here {text}
2. Using this analysis predict for each of these passengers if they would have survived or not {prompt2}
3. Generate the analysis in this form Index | Survived , Not Survived
"""

analysis_template = ChatPromptTemplate.from_template(analysis_string)

chat = ChatOpenAI(temperature=0.0, model="gpt-4-turbo", openai_api_key='')
user_analysis = analysis_template.format_messages(instruction_analyis=instruction_analysis)
response = chat(user_analysis)
text=response.content
print(text)

  analysis_string = """Generate the analysis based on instruction\ that


Based on the detailed analysis of survival patterns from the provided passenger data, here are the predictions for each passenger's survival status:

```
0: Master | Survived
1: Mr. | Not Survived
2: Mr. | Not Survived
3: Miss | Survived
4: Miss | Survived
5: Miss | Not Survived
6: Miss | Not Survived
7: Mr. | Not Survived
8: Miss | Not Survived
9: Miss | Survived
10: Mr. | Not Survived
11: Mr. | Not Survived
12: Miss | Survived
13: Mr. | Not Survived
14: Mr. | Not Survived
15: Miss | Survived
16: Mr. | Not Survived
17: Miss | Not Survived
18: Mr. | Not Survived
19: Mr. | Not Survived
20: Mr. | Not Survived
21: Mr. | Not Survived
22: Miss | Survived
23: Mr. | Not Survived
24: Mr. | Not Survived
25: Master | Survived
26: Mr. | Not Survived
27: Mr. | Not Survived
28: Master | Survived
29: Miss | Not Survived
30: Mr. | Not Survived
31: Miss | Not Survived
32: Mr. | Not Survived
33: Miss | Not Survived
34: Mr. | Not Survived
35: Mr. | Not Survived
36: Mr. | Not Survived
37: Miss | Not Surv

Thought Refinement Exercise

In [3]:
#Creating a smaller prompt to refine the analysis 
# Create dictionaries for training and test sentences
train_dict2 = {i: sentence for i, sentence in enumerate(train_sentences.head(200))}

prompt_small = "\n\n"
for key, sentence in train_dict2.items():
    prompt_small += f"{key}: {sentence}\n\n"


In [42]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

analysis_string = """Generate the analysis based on instruction\ that
is delimited by triple backticks.\instruction: ```{instruction_analyis}```
"""

instruction_analysis = f"""You are functioning as an AI prediction analyst whose job is to learn patterns from the data provided of who survived an accident and who didnt.
1. In the initial analysis you have taken some data and analyzed patterns in data which can help us predict wether a passenger will survive an accident or not 
2. The analysis is placed here in the {text}
3. However there is alot of scope for improvement to this analysis as we are not to predict 100% of survival cases
4. Can you please relook at the analysis and here {prompt_small} some more data to build a better analysis this time
"""

analysis_template = ChatPromptTemplate.from_template(analysis_string)

chat = ChatOpenAI(temperature=0.0, model="gpt-4-turbo", openai_api_key='')
user_analysis = analysis_template.format_messages(instruction_analyis=instruction_analysis)
response = chat(user_analysis)
text2=response.content
print(text2)

  analysis_string = """Generate the analysis based on instruction\ that


### Analysis of Survival Patterns Based on Passenger Data

#### Overview
The expanded dataset includes detailed attributes of passengers such as age, class, fare, embarkation point, family size, whether they were alone, title, and survival status. This comprehensive data allows for a deeper analysis to identify more nuanced patterns that may influence the likelihood of survival in an accident.

#### Key Observations and Patterns

1. **Gender and Title**:
   - Consistently, titles such as "Mrs.", "Miss", and "Master" correlate with higher survival rates, reinforcing the trend that women and children had a higher likelihood of survival.
   - "Mr." title, typically representing adult males, consistently shows lower survival rates across various classes and family sizes.

2. **Class**:
   - Higher class passengers continue to exhibit a higher survival rate. This trend is persistent even when controlling for other factors like fare and embarkation points, suggesting a strong class bias in s

In [7]:
#Reducing the analysis size 

text_short = """"#### Key Observations and Patterns

1. **Gender and Title**:
   - Consistently, titles such as "Mrs.", "Miss", and "Master" correlate with higher survival rates, reinforcing the trend that women and children had a higher likelihood of survival.
   - "Mr." title, typically representing adult males, consistently shows lower survival rates across various classes and family sizes.

2. **Class**:
   - Higher class passengers continue to exhibit a higher survival rate. This trend is persistent even when controlling for other factors like fare and embarkation points, suggesting a strong class bias in survival chances.

3. **Fare**:
   - There is a positive correlation between fare and survival, indicating that passengers who paid more (likely correlated with higher class) had better survival odds. This could be due to better placement on the ship or closer access to lifeboats.

4. **Age**:
   - Younger passengers, especially those with titles like "Master" or "Miss," show higher survival rates. Negative age values, likely representing infants or very young children, also have higher survival rates, suggesting a protective effect for the youngest passengers.

5. **Family Size**:
   - Medium-sized family groups (2-4 members) show better survival rates compared to those traveling alone or with very large families. This might be due to better ability to navigate the crisis with moderate family support without the hindrance of a large group.

6. **Embarkation Point**:
   - Variations in survival rates by embarkation points are observed, with passengers from Queenstown showing slightly higher survival rates in some cases. This could be influenced by the composition of passengers embarking from these locations.

7. **Traveling Alone vs. Not Alone**:
   - Passengers not traveling alone generally show higher survival rates, supporting the idea that companions could aid in survival efforts. However, very large groups do not necessarily have improved survival, possibly due to difficulties in coordinating during the crisis.

#### Conclusions and Predictive Insights

- **Gender, Age, and Title**: Strong indicators of survival include being female, young, or holding titles like "Miss," "Mrs.," or "Master."
- **Socio-Economic Status**: Higher-class passengers and those paying higher fares had better survival odds, likely due to better access to life-saving resources.
- **Family Dynamics**: Traveling with a family size of 2 to 4 increases survival chances, likely due to optimal support without the drawbacks of large group coordination."""

Predicting with new analysis 

In [8]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

analysis_string = """Generate the analysis based on instruction\ that
is delimited by triple backticks.\instruction: ```{instruction_analyis}```
"""

instruction_analysis = f"""You are functioning as an AI prediction analyst whose job is to learn patterns from the data provided of who survived an accident and who didnt.
1. You are given detailed analysis of which passenger are likely to survive and which one not  here {text_short}
2. Using this analysis predict for each of these passengers if they would have survived or not {prompt2}
3. Generate the analysis in this form Index | Survived , Not Survived
"""

analysis_template = ChatPromptTemplate.from_template(analysis_string)

chat = ChatOpenAI(temperature=0.0, model="gpt-4-turbo", openai_api_key='')
user_analysis = analysis_template.format_messages(instruction_analyis=instruction_analysis)
response = chat(user_analysis)
text3=response.content
print(text3)

  analysis_string = """Generate the analysis based on instruction\ that
  warn_deprecated(
  warn_deprecated(


Based on the detailed analysis provided and the patterns observed in the survival rates of passengers, here are the predictions for each passenger listed:

```
Index | Survival Prediction
---------------------------
0     | Survived
1     | Not Survived
2     | Not Survived
3     | Survived
4     | Survived
5     | Survived
6     | Survived
7     | Not Survived
8     | Survived
9     | Survived
10    | Not Survived
11    | Not Survived
12    | Survived
13    | Not Survived
14    | Not Survived
15    | Survived
16    | Not Survived
17    | Survived
18    | Not Survived
19    | Not Survived
20    | Not Survived
21    | Not Survived
22    | Survived
23    | Not Survived
24    | Not Survived
25    | Survived
26    | Not Survived
27    | Not Survived
28    | Survived
29    | Survived
30    | Not Survived
31    | Survived
32    | Not Survived
33    | Survived
34    | Not Survived
35    | Not Survived
36    | Not Survived
37    | Survived
38    | Survived
39    | Not Survived
40    | Not Surv