In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV
# Select only numerical features
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import joblib
import pickle



In [2]:
# Input locations, hoisted out of the read calls so they can be changed in one place.
# NOTE(review): these are absolute, machine-specific Windows paths; the notebook
# will not run on another machine until they point at local copies of the CSVs.
TRAIN_CSV_PATH = r"C:\Users\HP\Downloads\train_df.csv"
TEST_CSV_PATH = r"C:\Users\HP\Downloads\test_df (1).csv"

train = pd.read_csv(TRAIN_CSV_PATH)
test = pd.read_csv(TEST_CSV_PATH)

# Display the training frame as the cell output.
train

Unnamed: 0,Fare,Pclass,AgeGroup,Sex_female,Sex_male,Embarked_C,FamilyCategory_Small,Survived
0,28.5000,1,2.0,0,1,0,0,0
1,13.0000,2,4.0,0,1,0,0,0
2,7.9250,3,0.0,0,1,0,0,0
3,7.8542,3,4.0,0,1,0,1,0
4,31.2750,3,1.0,1,0,0,0,0
...,...,...,...,...,...,...,...,...
707,7.6500,3,4.0,1,0,0,0,1
708,31.0000,1,4.0,0,1,0,0,0
709,14.1083,3,0.0,0,1,0,1,0
710,120.0000,1,3.0,1,0,0,0,1


In [3]:
# Restore the value stored in 'q2_age.pkl' (labelled "Median Age" in the print below).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('q2_age.pkl', mode='rb') as pkl_file:
    Q2_age = pickle.load(pkl_file)

print("Loaded Q2 (Median Age):", Q2_age)

Loaded Q2 (Median Age): 28.0


In [4]:
# Replace ages strictly below -4.5 or strictly above 63.5 with Q2_age.
# Missing ages compare False on both sides and are therefore left untouched.
# NOTE(review): the two fences look like precomputed outlier bounds; confirm their origin.
age_is_outlier = (test['Age'] < -4.5) | (test['Age'] > 63.5)
test['Age'] = test['Age'].mask(age_is_outlier, Q2_age)

In [5]:
# Restore the value stored in 'age_mean.pkl' (labelled "mean Age" in the print below).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('age_mean.pkl', mode='rb') as pkl_file:
    mean_age = pickle.load(pkl_file)
print("Loaded (mean Age):", mean_age)

Loaded (mean Age): 28.97874125874126


In [6]:
# Fill the remaining missing ages with the loaded mean_age.
test["Age"] = test["Age"].fillna(value=mean_age)

In [7]:
# Restore the object stored in 'cabin_mode.pkl'; only its first three elements are shown.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('cabin_mode.pkl', mode='rb') as pkl_file:
    cabin_mode = pickle.load(pkl_file)
print("Loaded (cabin_mode):", cabin_mode[:3])

Loaded (cabin_mode): C23


In [8]:
# Fill missing cabins with the first three elements of the loaded cabin_mode.
test["Cabin"] = test["Cabin"].fillna(value=cabin_mode[:3])

In [9]:
# Per-column count of missing values after the Age and Cabin fills.
test.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [10]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
num_duplicates = test.duplicated(keep="first").sum()
print(f"Number of duplicate rows: {num_duplicates}")

Number of duplicate rows: 0


In [11]:
# Family size = 1 + SibSp + Parch.
test["FamilySize"] = 1 + test["SibSp"] + test["Parch"]
def categorize_family_size(size):
    """Map a family head-count to a coarse size label.

    Parameters
    ----------
    size : number
        Family size to classify.

    Returns
    -------
    str
        'Single' when size equals 1; otherwise 'Small' when size <= 3,
        'Medium' when size <= 6, and 'Large' for everything else
        (including values that fail every comparison, such as NaN).
    """
    if size == 1:
        return 'Single'
    # Inclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((3, 'Small'), (6, 'Medium')):
        if size <= upper_bound:
            return label
    return 'Large'

# Label every row's family size, then remove the two source count columns in place.
test['FamilyCategory'] = test['FamilySize'].map(categorize_family_size)
test.drop(columns=['Parch', 'SibSp'], inplace=True)

In [13]:
# Age band edges and their labels: [0, 12], (12, 18], (18, 30], (30, 45], (45, 63.5].
# Intervals are closed on the right; include_lowest makes the first one include 0.
bins = [0, 12, 18, 30, 45, 63.5]
labels = ['Child', 'Teen', 'Young Adult', 'Adult', 'Mid-age Adult']

test['AgeGroup'] = pd.cut(
    test['Age'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

In [14]:
# Capture the single word that follows a comma (plus optional whitespace) and is
# immediately followed by a period. Names with no such match yield a missing value.
test['Title'] = test['Name'].str.extract(r',\s*(\w+)\.', expand=False)

In [15]:
# The Name column is removed in place once Title has been extracted from it.
test.drop('Name', axis=1, inplace=True)

In [16]:
# Per-column count of missing values after adding the family, age-group and title features.
test.isna().sum()

PassengerId       0
Survived          0
Pclass            0
Sex               0
Age               0
Ticket            0
Fare              0
Cabin             0
Embarked          0
FamilySize        0
FamilyCategory    0
AgeGroup          0
Title             0
dtype: int64

In [17]:
# Make sure tickets are strings before applying string operations.
test['Ticket'] = test['Ticket'].astype(str)

# Build the prefix in one chain:
#   1. keep the leading run of non-digit characters (missing when the ticket starts with a digit),
#   2. trim surrounding whitespace,
#   3. remove literal dots.
test['Ticket_Prefix'] = (
    test['Ticket']
    .str.extract(r'^([^\d]+)', expand=False)
    .str.strip()
    .str.replace('.', '', regex=False)
)

In [18]:
# Restore the value stored in 'prefix_mode.pkl' and use it for tickets that had no prefix.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('prefix_mode.pkl', mode='rb') as pkl_file:
    prefix_mode = pickle.load(pkl_file)
test['Ticket_Prefix'] = test['Ticket_Prefix'].fillna(value=prefix_mode)

In [19]:
# Per-column count of missing values after filling Ticket_Prefix.
test.isna().sum()

PassengerId       0
Survived          0
Pclass            0
Sex               0
Age               0
Ticket            0
Fare              0
Cabin             0
Embarked          0
FamilySize        0
FamilyCategory    0
AgeGroup          0
Title             0
Ticket_Prefix     0
dtype: int64

In [20]:
# The prefix -> frequency table is restored from 'freq_map.pkl' instead of being
# recomputed from this frame with value_counts().
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('freq_map.pkl', mode='rb') as pkl_file:
    freq_map = pickle.load(pkl_file)

# Replace each prefix with its looked-up frequency.
# NOTE(review): prefixes absent from freq_map come out as missing values
# (assuming freq_map is a plain dict or Series); verify against the null counts.
test['Ticket_Prefix'] = test['Ticket_Prefix'].map(freq_map)


In [21]:
# Deck = the leading run of ASCII letters in the cabin string; the raw Cabin
# column is then removed in place.
test['Cabin_Deck'] = (
    test['Cabin']
    .str.extract(r'^([A-Za-z]+)', expand=False)
)
test.drop('Cabin', axis=1, inplace=True)

In [22]:
# Per-column count of missing values after the frequency mapping and deck extraction.
test.isna().sum()

PassengerId       0
Survived          0
Pclass            0
Sex               0
Age               0
Ticket            0
Fare              0
Embarked          0
FamilySize        0
FamilyCategory    0
AgeGroup          0
Title             0
Ticket_Prefix     2
Cabin_Deck        0
dtype: int64

In [23]:
# Remove, in place, every row that still has a missing value in any column.
# NOTE(review): this shrinks the evaluation set (the preceding null count shows
# 2 rows with a missing Ticket_Prefix); confirm that dropping them is intended.
test.dropna(axis=0, how='any', inplace=True)

In [24]:
# Confirm that no missing values remain after dropping incomplete rows.
test.isna().sum()

PassengerId       0
Survived          0
Pclass            0
Sex               0
Age               0
Ticket            0
Fare              0
Embarked          0
FamilySize        0
FamilyCategory    0
AgeGroup          0
Title             0
Ticket_Prefix     0
Cabin_Deck        0
dtype: int64

In [25]:
# One-hot encode 'Sex', 'Embarked' and 'FamilyCategory', keeping every level
# (drop_first=False) so each category gets its own indicator column.
test = pd.get_dummies(test, columns=['Sex', 'Embarked', 'FamilyCategory'], drop_first=False)

# Collect the indicator columns by their name prefixes.
one_hot_cols = [
    col for col in test.columns
    if col.startswith(('Sex_', 'Embarked_', 'FamilyCategory_'))
]

# Convert the indicators to integer 0/1. Depending on the pandas version,
# get_dummies may return booleans; an explicit cast states the intent directly
# instead of relying on replace({False: 0, True: 1}) to change the dtype.
for col in one_hot_cols:
    test[col] = test[col].astype(int)

In [26]:
# Remove the raw Ticket column in place.
test.drop('Ticket', axis=1, inplace=True)

In [27]:
# Preview the first five rows after one-hot encoding.
test.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,Fare,FamilySize,AgeGroup,Title,Ticket_Prefix,Cabin_Deck,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,FamilyCategory_Large,FamilyCategory_Medium,FamilyCategory_Single,FamilyCategory_Small
0,710,1,3,28.978741,15.2458,3,Young Adult,Master,570.0,C,0,1,1,0,0,0,0,0,1
1,440,0,2,31.0,10.5,1,Adult,Mr,36.0,C,0,1,0,0,1,0,0,1,0
2,841,0,3,20.0,7.925,1,Young Adult,Mr,1.0,C,0,1,0,0,1,0,0,1,0
3,721,1,2,6.0,33.0,2,Child,Miss,570.0,C,1,0,0,0,1,0,0,0,1
4,40,1,3,14.0,11.2417,2,Teen,Miss,570.0,C,1,0,1,0,0,0,0,0,1


In [28]:
# Preview the first five training rows for comparison with the test frame's columns.
train.head(n=5)

Unnamed: 0,Fare,Pclass,AgeGroup,Sex_female,Sex_male,Embarked_C,FamilyCategory_Small,Survived
0,28.5,1,2.0,0,1,0,0,0
1,13.0,2,4.0,0,1,0,0,0
2,7.925,3,0.0,0,1,0,0,0
3,7.8542,3,4.0,0,1,0,1,0
4,31.275,3,1.0,1,0,0,0,0


In [29]:
# Restore the encoder stored in 'ordinal_encoder.pkl'.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('ordinal_encoder.pkl', mode='rb') as pkl_file:
    ordinal_encoder = pickle.load(pkl_file)

# Columns passed through the encoder, in this order.
# NOTE(review): presumably the same columns and order the encoder was fitted on; verify.
label_cols = [
    'Ticket_Prefix',
    'Cabin_Deck',
    'Title',
    'AgeGroup',
]

# Overwrite those columns with the encoder's output (transform only, no refit).
test[label_cols] = ordinal_encoder.transform(test[label_cols])

In [30]:
# Preview the first five rows after the ordinal encoding.
test.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,Fare,FamilySize,AgeGroup,Title,Ticket_Prefix,Cabin_Deck,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,FamilyCategory_Large,FamilyCategory_Medium,FamilyCategory_Single,FamilyCategory_Small
0,710,1,3,28.978741,15.2458,3,4.0,5.0,10.0,2.0,0,1,1,0,0,0,0,0,1
1,440,0,2,31.0,10.5,1,0.0,9.0,9.0,2.0,0,1,0,0,1,0,0,1,0
2,841,0,3,20.0,7.925,1,4.0,9.0,0.0,2.0,0,1,0,0,1,0,0,1,0
3,721,1,2,6.0,33.0,2,1.0,6.0,10.0,2.0,1,0,0,0,1,0,0,0,1
4,40,1,3,14.0,11.2417,2,3.0,6.0,10.0,2.0,1,0,1,0,0,0,0,0,1


In [31]:
# Keep only the test columns that also exist in the training frame,
# preserving the order in which they appear in `test`.
common_cols = [name for name in test.columns if name in train.columns]
test = test[common_cols]

In [32]:
# Reorder the test columns to match the training frame's column order.
# Raises KeyError if any training column is absent from `test`.
test = test[list(train.columns)]

In [33]:
# Preview the first five rows after aligning columns with the training frame.
test.head(n=5)

Unnamed: 0,Fare,Pclass,AgeGroup,Sex_female,Sex_male,Embarked_C,FamilyCategory_Small,Survived
0,15.2458,3,4.0,0,1,1,1,1
1,10.5,2,0.0,0,1,0,0,0
2,7.925,3,4.0,0,1,0,0,0
3,33.0,2,1.0,1,0,0,1,1
4,11.2417,3,3.0,1,0,1,1,1


In [34]:
# Evaluate the model stored in 'DT_model.pkl' on the prepared test frame.
# NOTE(review): presumably a decision tree, judging by the file name; verify.
model = joblib.load('DT_model.pkl')

# Ground-truth labels and the feature matrix (every column except the label).
y = test["Survived"]
X = test.drop(columns="Survived")

predictions = model.predict(X)

# Fraction of rows whose predicted label matches the true label.
score = accuracy_score(y_true=y, y_pred=predictions)
print("Test Accuracy Score:", score)

Test Accuracy Score: 0.8361581920903954


In [35]:
# Evaluate the model stored in 'best_rf_model.pkl' on the prepared test frame.
# NOTE(review): presumably a tuned random forest, judging by the file name; verify.
model = joblib.load('best_rf_model.pkl')

# Ground-truth labels and the feature matrix (every column except the label).
y = test["Survived"]
X = test.drop(columns="Survived")

predictions = model.predict(X)

# Fraction of rows whose predicted label matches the true label.
score = accuracy_score(y_true=y, y_pred=predictions)
print("Test Accuracy Score:", score)

Test Accuracy Score: 0.8305084745762712
