In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the dataset
df_train = pd.read_csv(r'C:\Users\HP\Downloads\train.csv')
df_test = pd.read_csv(r'C:\Users\HP\Downloads\test.csv')
df_gender_submission = pd.read_csv(r'C:\Users\HP\Downloads\gender_submission.csv')

In [3]:
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Display the gender submission data
df_gender_submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [5]:
# Handle missing values for 'Age'
imputer_age = SimpleImputer(strategy='median')
df_train['Age'] = imputer_age.fit_transform(df_train[['Age']])
df_test['Age'] = imputer_age.transform(df_test[['Age']])

In [6]:
# Display the 'Age' column
df_train['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888    28.0
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [7]:
# Handle missing values for 'Embarked'
imputer_embarked = SimpleImputer(strategy='most_frequent')
df_train['Embarked'] = imputer_embarked.fit_transform(df_train[['Embarked']]).ravel()
df_test['Embarked'] = imputer_embarked.transform(df_test[['Embarked']]).ravel()

In [8]:
# Display the 'Embarked' column
df_train['Embarked']

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [9]:
# Check if 'Sex' column exists before encoding
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()
l.fit(df_train[['Sex']])
df_train['Sex_res'] = l.transform(df_train[['Sex']])
df_train = df_train.drop('Sex', axis=1)
df_train

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_res
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.0,0,0,211536,13.0000,,S,1
887,888,1,1,"Graham, Miss. Margaret Edith",19.0,0,0,112053,30.0000,B42,S,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",28.0,1,2,W./C. 6607,23.4500,,S,0
889,890,1,1,"Behr, Mr. Karl Howell",26.0,0,0,111369,30.0000,C148,C,1


In [10]:
# Drop unnecessary columns
df_train = df_train.drop(columns=['Cabin', 'Ticket', 'Name', 'Embarked'])
df_test = df_test.drop(columns=['Cabin', 'Ticket', 'Name', 'Embarked'])

In [11]:
# Define numerical features
numerical_features = ['Pclass', 'SibSp', 'Parch', 'Fare']

In [12]:
# Preprocess numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
    ])

# Prepare feature and target variables
X = df_train.drop(columns=['Survived'])
y = df_train['Survived']

In [13]:
# Check for null values
X.isnull().sum()

PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_res        0
dtype: int64

In [14]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [15]:
# Train a Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf_model, 
                       param_grid={'n_estimators': [100, 200], 'max_depth': [None, 10, 20]}, 
                       cv=5, scoring='precision')


In [16]:
# Fit the Random Forest model
rf_grid.fit(X_train, y_train)


In [17]:
# Get the best Random Forest model
best_rf_model = rf_grid.best_estimator_

In [18]:
# Predict with the Random Forest model
rf_preds = best_rf_model.predict(X_val)
rf_preds

array([0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1], dtype=int64)

In [19]:
# Evaluate the Random Forest model
rf_precision = precision_score(y_val, rf_preds)
rf_precision

rf_recall = recall_score(y_val, rf_preds)
rf_recall

0.7162162162162162

In [20]:
# Train an SVM model
svm_model = SVC(random_state=42)
svm_grid = GridSearchCV(svm_model, {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}, cv=5, scoring='precision')


In [21]:
# Fit the SVM model
svm_grid.fit(X_train, y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# Get the best SVM model
best_svm_model = svm_grid.best_estimator_
best_svm_model

In [28]:
# Predict with the SVM model
svm_preds = best_svm_model.predict(X_val)
svm_preds 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int64)

In [29]:
# Evaluate the SVM model
svm_precision = precision_score(y_val, svm_preds)
svm_precision

0.6666666666666666

In [30]:
svm_recall = recall_score(y_val, svm_preds)
svm_recall


0.05405405405405406

In [31]:
# Display precision and recall for both models
svm_precision, rf_precision

(0.6666666666666666, 0.8153846153846154)