# **Titanic Prediction**

### **Import necessary libraries**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

### **Load the dataset**

In [2]:
# Read the training split into a DataFrame; the path is relative to the
# notebook's working directory.
titanic_data = pd.read_csv(filepath_or_buffer='train.csv')

### **Display the first few rows of the dataset**

In [3]:
# Preview the first five rows (5 is the default for head()) to sanity-check
# the columns that were loaded.
titanic_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## **Data preprocessing**

### **Define categorical and numerical features**

In [4]:
# Columns routed to the categorical branch of the preprocessing
# (imputed, then one-hot encoded).
categorical_features = [
    'Sex',
    'Embarked',
]

# Columns routed to the numerical branch of the preprocessing (imputed only).
numerical_features = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

### **Create preprocessing pipeline for numerical features**

In [5]:
# Numerical branch: a single step that fills missing values with the
# column median learned during fit.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

### **Create preprocessing pipeline for categorical features**

In [6]:
# Categorical branch, applied in order:
#   1. fill missing values with the most frequent category;
#   2. one-hot encode, dropping the first category of each feature so the
#      encoded columns are not redundant.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
])

### **Combine preprocessing pipelines using ColumnTransformer**

In [7]:
# Route each group of columns to its own transformer. Columns that appear in
# neither list are not passed on (ColumnTransformer's default remainder is
# 'drop').
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

### **Define the model pipeline**

In [8]:
# End-to-end estimator: preprocessing followed by a random forest.
# random_state is fixed so repeated runs train the same forest.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

### **Split the data into features and target variable**

In [9]:
# Target: the 'Survived' column.
y_train = titanic_data['Survived']
# Features: every other column of the training frame (the original frame is
# left untouched because drop() returns a new DataFrame).
X_train = titanic_data.drop(columns=['Survived'])

### **Model training**

In [10]:
# Fit the preprocessing steps and the classifier together on the training
# data; as the cell's last expression, the fitted pipeline is displayed.
model.fit(X=X_train, y=y_train)

## **Model evaluation**

### **Predict on the test set**

In [11]:
# Load the unlabeled test split; the path is relative to the notebook's
# working directory.
X_test = pd.read_csv(filepath_or_buffer='test.csv')

In [12]:
# Run the fitted pipeline (preprocessing + classifier) on the test features
# to obtain one predicted label per row.
y_pred = model.predict(X=X_test)

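### **Calculate accuracy**

**Note:** `gender_submission.csv` is Kaggle's sample submission (it predicts that every female passenger survives and every male passenger does not), not the true test labels. The scores below therefore measure agreement with that gender-only baseline rather than real predictive accuracy.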

In [13]:
# Load the reference labels used for scoring below.
# NOTE(review): in the Kaggle Titanic competition, gender_submission.csv is
# the *sample submission* (females predicted to survive, males not), not the
# true outcomes -- so metrics computed against it measure agreement with that
# baseline rather than real accuracy. Confirm the source of this file.
y_test = pd.read_csv(filepath_or_buffer='gender_submission.csv')

In [14]:
# Keep only the second column of the loaded frame (the label column) as a
# Series. The isinstance guard makes this cell safe to re-run: once y_test is
# already a Series, applying `.iloc[:, 1]` again would raise
# "Too many indexers", so the selection is skipped on later runs.
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.iloc[:, 1]

In [15]:
# Fraction of test rows whose predicted label equals the reference label in
# y_test.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8205741626794258


### **Generate classification report**

In [16]:
# Per-class precision / recall / F1 of the predictions against y_test,
# printed beneath a heading line.
print(
    "Classification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       266
           1       0.75      0.76      0.75       152

    accuracy                           0.82       418
   macro avg       0.81      0.81      0.81       418
weighted avg       0.82      0.82      0.82       418



In [17]:
# Wrap the predictions in a DataFrame so they can be exported with to_csv.
# Note that this rebinds y_pred from the prediction array to a DataFrame.
y_pred = pd.DataFrame(data=y_pred)

In [18]:
# Export the predictions together with the passenger identifier.
# Writing y_pred on its own produces a single column headed "0" with no way
# to tie a row back to a passenger, which is also not a valid submission
# layout. np.asarray(...).ravel() flattens y_pred whether it is still the raw
# prediction array or the single-column DataFrame built in the previous cell.
# NOTE(review): assumes test.csv carries a 'PassengerId' column, as train.csv
# does -- confirm.
submission = pd.DataFrame({
    'PassengerId': X_test['PassengerId'].to_numpy(),
    'Survived': np.asarray(y_pred).ravel(),
})
submission.to_csv('Predictions.csv', index=False)