In [1]:
import pandas as pd 
import seaborn as sns 
import numpy as np 

In [2]:
train = pd .read_csv('titanic/train.csv')
test = pd .read_csv('titanic/test.csv')

In [3]:
train['train'] = 1
test['train'] = 0

In [4]:
df = pd.concat([train, test], axis=0)

In [5]:
len(df)

1309

In [6]:
df.isnull().sum() / len(df) * 100

PassengerId     0.000000
Survived       31.932773
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            20.091673
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.076394
Cabin          77.463713
Embarked        0.152788
train           0.000000
dtype: float64

In [7]:
df.drop(columns='Cabin', inplace=True)

In [8]:
df['Survived'] = {
    1: "Yes", 
    0: "No"
}

In [9]:
df.set_index('PassengerId', inplace=True)

In [10]:
numeric_cols = list(df.select_dtypes(np.number).columns)
cat_cols = list(df.select_dtypes('object').columns)


In [11]:
numeric_cols

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'train']

In [12]:
from sklearn.preprocessing import MinMaxScaler

In [13]:
scaler = MinMaxScaler()

In [14]:
scaler.fit(df[numeric_cols])

In [15]:
df[numeric_cols] = scaler.transform(df[numeric_cols])

In [16]:
df[numeric_cols].describe().loc[['min', 'max']]

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,train
min,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
from sklearn.tree import DecisionTreeClassifier

In [18]:
train_df = df[df['train'] == 1]
test_df = df[df['train'] == 0]

In [19]:
test_df = test_df.drop(columns='train')
train_df = train_df.drop(columns='train')

In [20]:
test.train

0      0
1      0
2      0
3      0
4      0
      ..
413    0
414    0
415    0
416    0
417    0
Name: train, Length: 418, dtype: int64

In [21]:
test_df.isnull().sum() / len(test_df)

Survived    0.995215
Pclass      0.000000
Name        0.000000
Sex         0.000000
Age         0.205742
SibSp       0.000000
Parch       0.000000
Ticket      0.000000
Fare        0.002392
Embarked    0.000000
dtype: float64

In [22]:
input_cols = numeric_cols + cat_cols

In [23]:
train_df['Ticket'] = train_df['Ticket'].astype('category')
test_df['Ticket'] = test_df['Ticket'].astype('category')

In [24]:
input_cols.remove('Survived')
input_cols.remove('Name')



In [25]:
input_cols.remove('train')

In [26]:
x_train = train_df[input_cols]
y_train = train_df.Survived

x_test = test_df[input_cols]
y_test = test_df.Survived

In [27]:
from sklearn.tree import DecisionTreeClassifier

In [28]:
model = DecisionTreeClassifier(random_state=42, max_depth=10)

In [29]:
x_train.nunique()

Pclass        3
Age          88
SibSp         7
Parch         7
Fare        248
Sex           2
Ticket      681
Embarked      3
dtype: int64

In [30]:
df['Ticket'].nunique()

929

In [31]:
freq_enc = df['Ticket'].value_counts()
df['encoded'] = df['Ticket'].map(freq_enc)

In [32]:
df[df['train'] == 1]['encoded']

PassengerId
1      1
2      2
3      1
4      2
5      1
      ..
887    1
888    1
889    4
890    1
891    1
Name: encoded, Length: 891, dtype: int64

In [33]:
train_df['encoded'] = df[df['train'] == 1]['encoded']
test_df['encoded'] = df[df['train'] == 0]['encoded']

In [34]:
input_cols.remove('Ticket')

In [35]:
input_cols.append('encoded')

In [36]:
train_df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,encoded
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,No,1.0,"Braund, Mr. Owen Harris",male,0.273456,0.125,0.000000,A/5 21171,0.014151,S,1
2,Yes,0.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,0.473882,0.125,0.000000,PC 17599,0.139136,C,2
3,,1.0,"Heikkinen, Miss. Laina",female,0.323563,0.000,0.000000,STON/O2. 3101282,0.015469,S,1
4,,0.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,0.436302,0.125,0.000000,113803,0.103644,S,2
5,,1.0,"Allen, Mr. William Henry",male,0.436302,0.000,0.000000,373450,0.015713,S,1
...,...,...,...,...,...,...,...,...,...,...,...
887,,0.5,"Montvila, Rev. Juozas",male,0.336089,0.000,0.000000,211536,0.025374,S,1
888,,0.0,"Graham, Miss. Margaret Edith",female,0.235876,0.000,0.000000,112053,0.058556,S,1
889,,1.0,"Johnston, Miss. Catherine Helen ""Carrie""",female,,0.125,0.222222,W./C. 6607,0.045771,S,4
890,,0.0,"Behr, Mr. Karl Howell",male,0.323563,0.000,0.000000,111369,0.058556,C,1


In [37]:
from sklearn.preprocessing import OneHotEncoder

In [38]:
sex_code = {
    "male": 1, 
    "female": 0
}

In [39]:
train_df = train_df.Sex.map(sex_code)
test_df = test_df.Sex.map(sex_code)

In [40]:
x_train = train_df[input_cols]
x_test = test_df[input_cols]

y_train = train_df['Survived']
y_test = test_df['Survived']

KeyError: "None of [Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Embarked',\n       'encoded'],\n      dtype='object', name='PassengerId')] are in the [index]"

In [None]:
model.fit(x_train, y_train)

In [None]:
# Assuming 'input_cols' contains the desired column names
input_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Embarked', 'encoded']

# Reset the index of the DataFrame

# Select columns using the corrected index
x_train = train_df[input_cols]
y_train = train_df['Survived']
