### Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


### Loading Dataset

In [2]:
# Read the Titanic dataset

df= pd.read_csv('titanic.csv')


### Data Exploration

In [3]:
df.shape

(418, 12)

In [4]:
df.head

<bound method NDFrame.head of      PassengerId  Survived  Pclass  \
0            892         0       3   
1            893         1       3   
2            894         0       2   
3            895         0       3   
4            896         1       3   
..           ...       ...     ...   
413         1305         0       3   
414         1306         1       1   
415         1307         0       3   
416         1308         0       3   
417         1309         0       3   

                                             Name     Sex   Age  SibSp  Parch  \
0                                Kelly, Mr. James    male  34.5      0      0   
1                Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                       Myles, Mr. Thomas Francis    male  62.0      0      0   
3                                Wirz, Mr. Albert    male  27.0      0      0   
4    Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   
..                         

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [6]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,0.363636,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.481622,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,0.0,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,0.0,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,1.0,3.0,39.0,1.0,0.0,31.5
max,1309.0,1.0,3.0,76.0,8.0,9.0,512.3292


In [7]:
df['Embarked'].unique()  # Check unique values in the 'Embarked' column

array(['Q', 'S', 'C'], dtype=object)

In [8]:
df.isna().sum()       # Check for missing values in the dataset
 

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [9]:
# Fill missing values in the 'Age' column with the mean age rounded to the nearest integer

df['Age'] = df['Age'].fillna(df['Age'].mean().round())        

df.head(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,0,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,1,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,0,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,1,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,0,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [10]:
# Fill missing values in the 'Cabin' column with 'Unknown'

df['Cabin'] = df['Cabin'].fillna('Unknown')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Unknown,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,Unknown,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Unknown,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,Unknown,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,Unknown,S


In [11]:
# Encode categorical columns using LabelEncoder

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [13]:
df['Sex'] = le.fit_transform(df['Sex'])
df['Cabin'] = le.fit_transform(df['Cabin'])
df['Ticket'] = le.fit_transform(df['Ticket'])
df['Embarked'] = le.fit_transform(df['Embarked'])
df['Name'] = le.fit_transform(df['Name'])

In [14]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,206,1,34.5,0,0,152,7.8292,76,1
1,893,1,3,403,0,47.0,1,0,221,7.0,76,2
2,894,0,2,269,1,62.0,0,0,73,9.6875,76,1
3,895,0,3,408,1,27.0,0,0,147,8.6625,76,2
4,896,1,3,178,0,22.0,1,1,138,12.2875,76,2


In [15]:
df = df.drop(['Ticket'],axis = 1)     # Drop the 'Ticket' column
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,0,3,206,1,34.5,0,0,7.8292,76,1
1,893,1,3,403,0,47.0,1,0,7.0000,76,2
2,894,0,2,269,1,62.0,0,0,9.6875,76,1
3,895,0,3,408,1,27.0,0,0,8.6625,76,2
4,896,1,3,178,0,22.0,1,1,12.2875,76,2
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,353,1,30.0,0,0,8.0500,76,2
414,1306,1,1,283,0,39.0,0,0,108.9000,22,0
415,1307,0,3,332,1,38.5,0,0,7.2500,76,2
416,1308,0,3,384,1,30.0,0,0,8.0500,76,2


In [16]:
df.isna().sum()      # Check for missing values again after encoding

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Cabin          0
Embarked       0
dtype: int64

### Spliting Data

In [17]:
# Split the data into features (X) and the target variable (y)

from sklearn.model_selection import train_test_split
X = df.loc[:,'Pclass':'Embarked']
y = df.loc[:, 'Survived']

In [18]:
# Split the data into training and testing sets

Xtrain,Xtest,ytrain,ytest = train_test_split(X,y, test_size = 0.2, random_state = 42)
print('X train shape ', Xtrain.shape)
print('X test shape ', Xtest.shape)
print('y train shape ', ytrain.shape)
print('y test shape ', ytest.shape)

X train shape  (334, 9)
X test shape  (84, 9)
y train shape  (334,)
y test shape  (84,)


In [19]:
# Check the shapes of the training and testing sets

cols = []
cols.append(Xtrain.columns)
cols

[Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
        'Embarked'],
       dtype='object')]

In [20]:
# Scale the features using Min-Max scaling

from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()

### Train The Model

In [21]:
# Initialize the Support Vector Machine (SVM) classifier

X_train = ms.fit_transform(Xtrain)
X_test = ms.fit_transform(Xtest)

In [22]:
# Train the SVM classifier on the training data

X_train = pd.DataFrame(X_train, columns = cols)
X_test = pd.DataFrame(X_test, columns = cols)

In [23]:
X_train.head()
X_train['Fare'] = X_train['Fare'].fillna(X_train['Fare'].mean())

In [24]:
from sklearn.svm import SVC
svm = SVC( C = 0.001, kernel = 'linear', random_state = 42)
svm.fit(X_train, ytrain)

In [25]:
print(f'Model score on training set : {(svm.score(X_train,ytrain)*100).round(2)}%')
print(f'Model score on test set : {(svm.score(X_test,ytest)*100).round(2)}%')


Model score on training set : 64.67%
Model score on test set : 59.52%


### Prediction

In [26]:
# Make predictions on the testing data

y_pred = svm.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

### Model Evaluation

In [27]:
# Evaluate the model's performance

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy = accuracy_score(ytest, y_pred)
conf_matrix = confusion_matrix(ytest, y_pred)
class_report = classification_report(ytest, y_pred)

# Print accuracy, confusion matrix, and classification report

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n\n\n", conf_matrix)
print("\nClassification Report:\n\n\n", class_report)

Accuracy: 0.5952380952380952

Confusion Matrix:


 [[50  0]
 [34  0]]

Classification Report:


               precision    recall  f1-score   support

           0       0.60      1.00      0.75        50
           1       0.00      0.00      0.00        34

    accuracy                           0.60        84
   macro avg       0.30      0.50      0.37        84
weighted avg       0.35      0.60      0.44        84



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
