In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [4]:
# Load the Titanic passenger table from the working directory, then print a
# quick overview: the first rows, the (rows, columns) shape and the column labels.
df = pd.read_csv("titanic.csv")
print(df.head(), df.shape, df.columns, sep="\n")

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
(8

In [5]:
# Data analysis - survival rates by category.
# 'Survived' appears to be coded 0/1 (see the preview printed above), so the
# mean over a group of passengers is that group's survival rate. The values
# are printed as fractions, not percentages.


def survival_rate(frame, mask):
    """Return the mean of 'Survived' over the rows of `frame` selected by `mask`.

    Parameters:
        frame: DataFrame that has a 'Survived' column.
        mask: boolean Series aligned with `frame` that picks the group's rows.

    Returns:
        The group's mean of 'Survived' (NaN when the mask selects no rows).
    """
    return frame.loc[mask, 'Survived'].mean()


# survival by sex
female_survival = survival_rate(df, df['Sex'] == 'female')
male_survival = survival_rate(df, df['Sex'] == 'male')
print(f"Sex - Female: {female_survival:.3f}, Male: {male_survival:.3f}")

# survival by class
class1_survival = survival_rate(df, df['Pclass'] == 1)
class2_survival = survival_rate(df, df['Pclass'] == 2)
class3_survival = survival_rate(df, df['Pclass'] == 3)
print(f"Pclass - 1: {class1_survival:.3f}, 2: {class2_survival:.3f}, 3: {class3_survival:.3f}")

# survival by age
# The two groups are disjoint: Age <= 18 versus Age > 18. A row whose Age is
# missing (NaN) satisfies neither comparison, so it is left out of both groups.
young_survival = survival_rate(df, df['Age'] <= 18)
old_survival = survival_rate(df, df['Age'] > 18)
# The second label says "over 18" because the filter is strictly greater than 18.
print(f"Age - 18 and younger: {young_survival:.3f}, over 18: {old_survival:.3f}")

Sex - Female: 0.742, Male: 0.189
Pclass - 1: 0.630, 2: 0.473, 3: 0.242
Age - 18 and younger: 0.504, 18 and older: 0.383


In [8]:

# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# prepare data: three predictors and the survival target
X = df[['Sex', 'Age', 'Pclass']].copy()
y = df['Survived']

# remove rows with missing values; select the target by the surviving index
# labels so features and target stay aligned row for row
X_clean = X.dropna().copy()
y_clean = y.loc[X_clean.index]
assert len(X_clean) == len(y_clean)

# encode categorical variable
sex_mapping = {'male': 0, 'female': 1}
sex_codes = X_clean['Sex'].map(sex_mapping)
# .map() turns any label that is not a key of sex_mapping into NaN; stop here
# with a clear message rather than train on silently corrupted rows.
assert sex_codes.notna().all(), "unexpected value in 'Sex' column"
# Replace the whole column (not `X_clean.loc[:, 'Sex'] = ...`): a `.loc` slice
# assignment writes into the existing object-dtype column, which would leave
# the codes stored as Python objects instead of a numeric column.
X_clean['Sex'] = sex_codes.astype(int)

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

# train decision tree
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# calculate accuracy
test_accuracy = accuracy_score(y_test, model.predict(X_test))
print(f"Test Accuracy: {test_accuracy:.3f}")

# feature importance
# Take the names from the training frame so they cannot drift out of step with
# the column order the model was actually fitted on.
feature_names = list(X_train.columns)
importances = model.feature_importances_

print("\nFeature Importance:")
for name, importance in zip(feature_names, importances):
    print(f"{name}: {importance:.3f}")

# Example with test data
# iloc[[0]] (list indexer) keeps a one-row DataFrame with the training column
# names and dtypes, which is the input shape predict() expects.
sample_row = X_test.iloc[[0]]
test_sample = sample_row.iloc[0]
prediction = model.predict(sample_row)[0]
# Derive the decoder from sex_mapping so the two can never disagree.
reverse_sex_mapping = {code: label for label, code in sex_mapping.items()}
# test_sample is a float Series now that every column is numeric, so cast the
# integer-valued fields back to int for the lookup and for display.
sample_sex = reverse_sex_mapping[int(test_sample['Sex'])]
sample_class = int(test_sample['Pclass'])
outcome = 'Survived' if prediction == 1 else 'Not Survived'
print(f"\nExample: Sex={sample_sex}, Age={test_sample['Age']:.0f}, Pclass={sample_class} -> {outcome}")


Test Accuracy: 0.776

Feature Importance:
Sex: 0.425
Age: 0.404
Pclass: 0.170

Example: Sex=male, Age=42, Pclass=2 -> Not Survived
