In [None]:
Assignment – Model Selection

In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Read the Titanic CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path (the file name also
# contains a space before ".csv") -- adjust it when running on another machine.
local_path = "/home/nandu/Downloads/titanic_dataset .csv"
titanic_data = pd.read_csv(filepath_or_buffer=local_path)

# Announce the step and preview the top rows of the raw data in one call;
# sep="\n" puts the heading and the table on separate lines.
print("Step 1: Loading the dataset", titanic_data.head(), sep="\n")

# Pre-processing steps
print("\nStep 2: Pre-processing")

# Drop columns that are not used as features (identifier, free text, ticket
# code, and the cabin column).
titanic_data = titanic_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Fill missing values by assigning the result back to the column.
# The previous form, `titanic_data['Age'].fillna(..., inplace=True)`, is a
# chained in-place operation on a column selection: pandas 2.x emits a
# FutureWarning for it, and with Copy-on-Write enabled it leaves the DataFrame
# unchanged, so NaNs would silently reach the models.
# 'Age' is filled with the column mean.
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].mean())

# 'Embarked' is filled with its most common value (first mode).
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0])

# Convert categorical variables to numerical using one-hot encoding;
# drop_first=True drops one level per variable to avoid a redundant column.
titanic_data = pd.get_dummies(titanic_data, columns=['Sex', 'Embarked', 'Pclass'], drop_first=True)

# Extract features and target variable
X = titanic_data.drop('Survived', axis=1)
y = titanic_data['Survived']

# Separate numeric and non-numeric columns.
# Only float64/int64 columns are selected as numeric, so dummy columns with any
# other dtype (e.g. bool or uint8) are left out of the scaling step below.
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns
non_numeric_columns = X.select_dtypes(include=['object']).columns

# Standardize numeric features (fit on all rows of X)
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])

# Display the first few rows after preprocessing
print(X.head())

# Create kNN and SVM models (library-default hyper-parameters)
knn_model = KNeighborsClassifier()
svm_model = SVC()


def _with_fold_scaling(estimator):
    """Wrap `estimator` so numeric features are standardized inside each CV fold.

    X was standardized earlier with a scaler fit on *all* rows, which lets the
    held-out fold's mean/variance leak into the training features. Re-fitting
    StandardScaler on the training part of every fold (via a pipeline) removes
    that leak: standardization is an affine map, so re-standardizing the
    already-scaled columns gives the same values as scaling the raw columns
    with training-fold statistics only.

    Only `numeric_columns` are scaled; every other column is passed through
    unchanged, matching the columns that were scaled before.
    """
    scale_numeric = ColumnTransformer(
        [("scale", StandardScaler(), list(numeric_columns))],
        remainder="passthrough",
    )
    return make_pipeline(scale_numeric, estimator)


def _cv_accuracy(label, estimator, cv):
    """Cross-validate `estimator` on (X, y), print its mean accuracy, return fold scores.

    Parameters
    ----------
    label : str
        Prefix used in the printed line, e.g. "kNN" or "SVM Stratified".
    estimator : scikit-learn classifier
        Unfitted model; it is cloned and fitted per fold by cross_val_score.
    cv : cross-validation splitter
        Splitter that defines the folds (KFold / StratifiedKFold here).

    Returns
    -------
    numpy.ndarray
        One accuracy value per fold.
    """
    scores = cross_val_score(_with_fold_scaling(estimator), X, y, cv=cv, scoring='accuracy')
    print(f"{label} Average Accuracy:", scores.mean())
    return scores


# K-Fold Cross Validation
print("\nStep 3: K-Fold Cross Validation")
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

knn_scores = _cv_accuracy("kNN", knn_model, k_fold)
svm_scores = _cv_accuracy("SVM", svm_model, k_fold)

# Stratified K-Fold Cross Validation (folds keep the class ratio of y)
print("\nStep 4: Stratified K-Fold Cross Validation")
stratified_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

knn_stratified_scores = _cv_accuracy("kNN Stratified", knn_model, stratified_k_fold)
svm_stratified_scores = _cv_accuracy("SVM Stratified", svm_model, stratified_k_fold)



Step 1: Loading the dataset
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450 