In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,classification_report

In [None]:
# Read the training and test CSV files into DataFrames.
train_path, test_path = '/content/train.csv', '/content/test.csv'
df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [None]:
# Preview the first five rows of the training frame.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Preview the first five rows of the test frame (it has no 'Survived' column).
test_df.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
# (rows, columns) of the training frame, then of the test frame.
print(df.shape, test_df.shape, sep='\n')

(891, 12)
(418, 11)


In [None]:
# Missing-value count per column of the training frame.
df.isna().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Missing-value count per column of the test frame.
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [None]:
# Remove the columns that are not used as model features.
unused_columns = ['Name', 'Cabin', 'Ticket']
df = df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

In [None]:
# Shapes after the column drop: training frame first, test frame second.
print(f'{df.shape}\n{test_df.shape}')

(891, 9)
(418, 8)


In [None]:
# Remaining missing values per column in the test frame.
test_df.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            1
Embarked        0
dtype: int64

In [None]:
# Fill missing 'Embarked' entries with the most frequent category (the mode).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection df['Embarked']: that chained in-place
# form emits a FutureWarning on recent pandas and does not modify df at all
# once Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [None]:
# Re-check missing values in the training frame after the 'Embarked' fill.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         0
dtype: int64

In [None]:
# Impute missing ages in both frames with the median age of the training data.
# The median is computed once, before any filling, so both frames receive the
# same training-derived value.
age_median = df['Age'].median()
# Assign back instead of fillna(inplace=True) on a column selection; the
# chained in-place form warns on recent pandas and is a no-op under
# Copy-on-Write.
df['Age'] = df['Age'].fillna(age_median)
test_df['Age'] = test_df['Age'].fillna(age_median)

In [None]:
# Confirm the training frame has no missing values left.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [None]:
# Missing values still present in the test frame after the 'Age' fill.
test_df.isna().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       0
dtype: int64

In [None]:
# Show the test rows whose 'Fare' is missing.
test_df[test_df['Fare'].isna()]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
152,1044,3,male,60.5,0,0,,S


In [None]:
# Impute missing 'Fare' values in the test frame with the column mean.
# The fill is restricted to the 'Fare' column: a frame-wide fillna would write
# the fare mean into any other column that happened to contain NaN.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

In [None]:
# Confirm the test frame has no missing values left.
test_df.isna().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [None]:
# Encode 'Sex' as an integer flag: 1 for 'male', 0 for every other value.
df['Sex'] = df['Sex'].eq('male').astype(np.int64)
test_df['Sex'] = test_df['Sex'].eq('male').astype(np.int64)

In [None]:
# Check the test frame after 'Sex' was converted to 0/1.
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,Q
1,893,3,0,47.0,1,0,7.0,S
2,894,2,1,62.0,0,0,9.6875,Q
3,895,3,1,27.0,0,0,8.6625,S
4,896,3,0,22.0,1,1,12.2875,S


In [None]:
# Number of distinct values in 'Embarked' (the column is one-hot encoded next).
df['Embarked'].nunique()

3

In [None]:
# One-hot encode the remaining categorical columns as int64 indicators,
# dropping the first level of each. The two frames are encoded independently,
# so both must contain the same set of categories for their columns to match.
dummy_options = dict(dtype=np.int64, drop_first=True)
df = pd.get_dummies(df, **dummy_options)
test_df = pd.get_dummies(test_df, **dummy_options)

In [None]:
# Check the test frame after one-hot encoding.
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,892,3,1,34.5,0,0,7.8292,1,0
1,893,3,0,47.0,1,0,7.0,0,1
2,894,2,1,62.0,0,0,9.6875,1,0
3,895,3,1,27.0,0,0,8.6625,0,1
4,896,3,0,22.0,1,1,12.2875,0,1


In [None]:
# Pairwise correlation matrix over the columns of the training frame
# (pandas' default correlation method).
df.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
PassengerId,1.0,-0.005007,-0.035144,0.042939,0.034212,-0.057527,-0.001652,0.012658,-0.033606,0.022204
Survived,-0.005007,1.0,-0.338481,-0.543351,-0.06491,-0.035322,0.081629,0.257307,0.00365,-0.149683
Pclass,-0.035144,-0.338481,1.0,0.1319,-0.339898,0.083081,0.018443,-0.5495,0.221009,0.074053
Sex,0.042939,-0.543351,0.1319,1.0,0.081163,-0.114631,-0.245489,-0.182333,-0.074115,0.119224
Age,0.034212,-0.06491,-0.339898,0.081163,1.0,-0.233296,-0.172482,0.096688,-0.031415,-0.006729
SibSp,-0.057527,-0.035322,0.083081,-0.114631,-0.233296,1.0,0.414838,0.159651,-0.026354,0.068734
Parch,-0.001652,0.081629,0.018443,-0.245489,-0.172482,0.414838,1.0,0.216225,-0.081228,0.060814
Fare,0.012658,0.257307,-0.5495,-0.182333,0.096688,0.159651,0.216225,1.0,-0.117216,-0.162184
Embarked_Q,-0.033606,0.00365,0.221009,-0.074115,-0.031415,-0.026354,-0.081228,-0.117216,1.0,-0.499421
Embarked_S,0.022204,-0.149683,0.074053,0.119224,-0.006729,0.068734,0.060814,-0.162184,-0.499421,1.0


In [None]:
# Target vector and feature matrix; 'PassengerId' is left out of the features.
y = df['Survived']
x = df.drop(columns=['PassengerId', 'Survived'])

In [None]:
# Hold out 20% of the rows for evaluation, stratified on the target so both
# splits keep the same class balance; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Preview the training-split features.
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
692,3,1,28.0,0,0,56.4958,0,1
481,2,1,28.0,0,0,0.0,0,1
527,1,1,28.0,0,0,221.7792,0,1
855,3,0,18.0,0,1,9.35,0,1
801,2,0,31.0,1,1,26.25,0,1


In [None]:
# Preview the training-split labels (same index as x_train).
y_train.head()


692    1
481    0
527    0
855    1
801    1
Name: Survived, dtype: int64

In [None]:
# Preview the hold-out features.
x_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
565,3,1,24.0,2,0,24.15,0,1
160,3,1,44.0,0,1,16.1,0,1
553,3,1,22.0,0,0,7.225,0,0
860,3,1,41.0,2,0,14.1083,0,1
241,3,0,28.0,1,0,15.5,1,0


In [None]:
# Preview the hold-out labels (same index as x_test).
y_test.head()

565    0
160    0
553    1
860    0
241    1
Name: Survived, dtype: int64

In [None]:
# Check the number of missing values in each column of the training frame.
# Uses `df`, the frame this notebook actually defines, so the cell runs on a
# fresh kernel.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Create the StandardScaler used to scale the feature matrices below.
st = StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only, then apply those
# same statistics to the hold-out split. Calling fit_transform on x_test would
# refit the scaler, scaling the hold-out data on a different basis than the
# data the model is trained on and replacing the parameters that later
# st.transform(...) calls rely on.
scaledx = st.fit_transform(x_train)
scaledx_test = st.transform(x_test)

In [None]:
# Display the scaled training features (a NumPy array).
scaledx

array([[ 0.82956755,  0.74242727, -0.11207776, ...,  0.5138115 ,
        -0.28933346,  0.61197825],
       [-0.37094484,  0.74242727, -0.11207776, ..., -0.66256323,
        -0.28933346,  0.61197825],
       [-1.57145722,  0.74242727, -0.11207776, ...,  3.95539858,
        -0.28933346,  0.61197825],
       ...,
       [ 0.82956755, -1.34693328,  1.42338169, ...,  0.0532047 ,
        -0.28933346,  0.61197825],
       [-1.57145722,  0.74242727,  1.34660872, ...,  0.13909685,
        -0.28933346,  0.61197825],
       [-1.57145722,  0.74242727, -0.11207776, ..., -0.10973011,
        -0.28933346,  0.61197825]])

In [None]:
# Baseline KNN classifier; 5 neighbours is scikit-learn's default, spelled out here.
model = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Train the baseline classifier on the scaled training split.
model.fit(scaledx,y_train)

In [None]:
# Predict labels for the scaled hold-out split with the baseline classifier.
ypredk = model.predict(scaledx_test)

In [None]:
# Fraction of hold-out rows the baseline classifier labels correctly.
accuracy_score(y_true=y_test, y_pred=ypredk)

0.8156424581005587

In [None]:
# Per-class precision, recall and F1 for the baseline classifier.
baseline_report = classification_report(y_true=y_test, y_pred=ypredk)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       110
           1       0.78      0.72      0.75        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179



In [None]:
# Sweep odd neighbourhood sizes from 3 to 59 and print the hold-out accuracy
# of a KNN classifier trained with each one.
for i in range(3, 60, 2):
    model = KNeighborsClassifier(n_neighbors=i).fit(scaledx, y_train)
    prediction = model.predict(scaledx_test)
    score = accuracy_score(y_test, prediction)
    print(f'Accuracy for n_neighbors={i} : {score}')

Accuracy for n_neighbors=3 : 0.8044692737430168
Accuracy for n_neighbors=5 : 0.8156424581005587
Accuracy for n_neighbors=7 : 0.7988826815642458
Accuracy for n_neighbors=9 : 0.7877094972067039
Accuracy for n_neighbors=11 : 0.8044692737430168
Accuracy for n_neighbors=13 : 0.8212290502793296
Accuracy for n_neighbors=15 : 0.8100558659217877
Accuracy for n_neighbors=17 : 0.8212290502793296
Accuracy for n_neighbors=19 : 0.8268156424581006
Accuracy for n_neighbors=21 : 0.8044692737430168
Accuracy for n_neighbors=23 : 0.8100558659217877
Accuracy for n_neighbors=25 : 0.8100558659217877
Accuracy for n_neighbors=27 : 0.8100558659217877
Accuracy for n_neighbors=29 : 0.8156424581005587
Accuracy for n_neighbors=31 : 0.8044692737430168
Accuracy for n_neighbors=33 : 0.7932960893854749
Accuracy for n_neighbors=35 : 0.7988826815642458
Accuracy for n_neighbors=37 : 0.7932960893854749
Accuracy for n_neighbors=39 : 0.7988826815642458
Accuracy for n_neighbors=41 : 0.7932960893854749
Accuracy for n_neighbors

In [None]:
# KNN classifier with 19 neighbours — presumably chosen from the n_neighbors
# sweep; re-check the choice whenever the sweep is re-run.
model2 = KNeighborsClassifier(n_neighbors=19)

In [None]:
# Train the 19-neighbour classifier on the scaled training split.
model2.fit(scaledx,y_train)

In [None]:
# Hold-out predictions from the 19-neighbour classifier.
prediction = model2.predict(scaledx_test)

In [None]:
# Per-class precision, recall and F1 for the 19-neighbour classifier.
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.82      0.93      0.87       110
           1       0.85      0.67      0.75        69

    accuracy                           0.83       179
   macro avg       0.83      0.80      0.81       179
weighted avg       0.83      0.83      0.82       179



In [None]:
# Look at the preprocessed test frame before building its feature matrix.
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,892,3,1,34.5,0,0,7.8292,1,0
1,893,3,0,47.0,1,0,7.0,0,1
2,894,2,1,62.0,0,0,9.6875,1,0
3,895,3,1,27.0,0,0,8.6625,0,1
4,896,3,0,22.0,1,1,12.2875,0,1


In [None]:
# Feature matrix for the test frame: every column except the identifier.
test_x = test_df.drop(columns=['PassengerId'])

In [None]:
# Preview the test feature matrix.
test_x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,3,1,34.5,0,0,7.8292,1,0
1,3,0,47.0,1,0,7.0,0,1
2,2,1,62.0,0,0,9.6875,1,0
3,3,1,27.0,0,0,8.6625,0,1
4,3,0,22.0,1,1,12.2875,0,1


In [None]:
# Scale the test features with the scaler `st` as fitted earlier (no refit here).
scaled_test = st.transform(test_x)

In [None]:
# Predict 'Survived' labels for the test rows with the 19-neighbour classifier.
predict_test = model2.predict(scaled_test)

In [None]:
# Display the predicted labels (array of 0/1 values).
predict_test

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [None]:
# Pair each test PassengerId with its predicted label and write the result to
# 'submission.csv' in the working directory, without the index column.
output = pd.DataFrame(
    {
        'PassengerId': test_df['PassengerId'],
        'Survived': predict_test,
    }
)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [None]:
# Read the submission file back to check its contents. The path is the same
# relative 'submission.csv' that to_csv wrote, so the check works from any
# working directory rather than only from /content.
pd.read_csv('submission.csv').head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
