In [1]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# loading the training and test dataset
training_dataset = pd.read_csv("train.csv")
test_dataset = pd.read_csv("test.csv")


In [3]:
# viewing the datasets
training_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_dataset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# looking for missing values
training_dataset.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
training_dataset.dropna(axis=0,subset=["Embarked"],inplace=True)

In [7]:
# looking for missing values in test dataset
test_dataset.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [8]:
# getting info about the type of columns in the dataset
training_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          712 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Cabin        202 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 90.3+ KB


In [9]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [10]:
# splitting into training and test data
x_train = training_dataset.iloc[:,2:].values
y_train = training_dataset.iloc[:,1].values
x_test = test_dataset.iloc[:,1:].values

In [11]:
x_train

array([[3, 'Braund, Mr. Owen Harris', 'male', ..., 7.25, nan, 'S'],
       [1, 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
        'female', ..., 71.2833, 'C85', 'C'],
       [3, 'Heikkinen, Miss. Laina', 'female', ..., 7.925, nan, 'S'],
       ...,
       [3, 'Johnston, Miss. Catherine Helen "Carrie"', 'female', ...,
        23.45, nan, 'S'],
       [1, 'Behr, Mr. Karl Howell', 'male', ..., 30.0, 'C148', 'C'],
       [3, 'Dooley, Mr. Patrick', 'male', ..., 7.75, nan, 'Q']],
      dtype=object)

In [12]:
y_train

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,

In [13]:
x_test

array([[3, 'Kelly, Mr. James', 'male', ..., 7.8292, nan, 'Q'],
       [3, 'Wilkes, Mrs. James (Ellen Needs)', 'female', ..., 7.0, nan,
        'S'],
       [2, 'Myles, Mr. Thomas Francis', 'male', ..., 9.6875, nan, 'Q'],
       ...,
       [3, 'Saether, Mr. Simon Sivertsen', 'male', ..., 7.25, nan, 'S'],
       [3, 'Ware, Mr. Frederick', 'male', ..., 8.05, nan, 'S'],
       [3, 'Peter, Master. Michael J', 'male', ..., 22.3583, nan, 'C']],
      dtype=object)

In [14]:
# taking care of missing data in training set and test set using scikitlearn
from sklearn.impute import SimpleImputer
imputer_train1 = SimpleImputer(missing_values=np.nan,strategy="constant",fill_value="missing")
imputer_train2 = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
imputer_train1.fit(x_train[:,8].reshape(-1,1))
x_train[:,8] = imputer_train1.transform(x_train[:,8].reshape(-1,1)).flatten()
x_train[:,3] = imputer_train2.fit_transform(x_train[:,3].reshape(-1,1)).flatten()

In [15]:
x_train

array([[3, 'Braund, Mr. Owen Harris', 'male', ..., 7.25, 'missing', 'S'],
       [1, 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
        'female', ..., 71.2833, 'C85', 'C'],
       [3, 'Heikkinen, Miss. Laina', 'female', ..., 7.925, 'missing',
        'S'],
       ...,
       [3, 'Johnston, Miss. Catherine Helen "Carrie"', 'female', ...,
        23.45, 'missing', 'S'],
       [1, 'Behr, Mr. Karl Howell', 'male', ..., 30.0, 'C148', 'C'],
       [3, 'Dooley, Mr. Patrick', 'male', ..., 7.75, 'missing', 'Q']],
      dtype=object)

In [16]:
imputer_test1 = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="missing")
imputer_test2 = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
x_test[:,8] = imputer_test1.fit_transform(x_test[:,8].reshape(-1,1)).flatten()
x_test[:,3]= imputer_test2.fit_transform(x_test[:,3].reshape(-1,1)).flatten()
x_test[:,7]= imputer_test2.fit_transform(x_test[:,7].reshape(-1,1)).flatten()

In [17]:
x_test

array([[3, 'Kelly, Mr. James', 'male', ..., 7.8292, 'missing', 'Q'],
       [3, 'Wilkes, Mrs. James (Ellen Needs)', 'female', ..., 7.0,
        'missing', 'S'],
       [2, 'Myles, Mr. Thomas Francis', 'male', ..., 9.6875, 'missing',
        'Q'],
       ...,
       [3, 'Saether, Mr. Simon Sivertsen', 'male', ..., 7.25, 'missing',
        'S'],
       [3, 'Ware, Mr. Frederick', 'male', ..., 8.05, 'missing', 'S'],
       [3, 'Peter, Master. Michael J', 'male', ..., 22.3583, 'missing',
        'C']], dtype=object)

In [18]:
# checking if the new train and test set has any missing values
pd.DataFrame(x_train).isna().sum(),pd.DataFrame(x_test).isna().sum()

(0    0
 1    0
 2    0
 3    0
 4    0
 5    0
 6    0
 7    0
 8    0
 9    0
 dtype: int64,
 0    0
 1    0
 2    0
 3    0
 4    0
 5    0
 6    0
 7    0
 8    0
 9    0
 dtype: int64)

In [19]:
# taking care of categorical data 
from sklearn.preprocessing import OrdinalEncoder
encoder_train = OrdinalEncoder()
x_train[:,[1,2,6,8,9]] = encoder_train.fit_transform(x_train[:,[1,2,6,8,9]])
pd.DataFrame(x_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,3,108.0,1.0,22.0,1,0,522.0,7.25,146.0,2.0
1,1,190.0,0.0,38.0,1,0,595.0,71.2833,80.0,0.0
2,3,353.0,0.0,26.0,0,0,668.0,7.925,146.0,2.0
3,1,272.0,0.0,35.0,1,0,48.0,53.1,54.0,2.0
4,3,15.0,1.0,35.0,0,0,471.0,8.05,146.0,2.0
...,...,...,...,...,...,...,...,...,...,...
884,2,547.0,1.0,27.0,0,0,100.0,13.0,146.0,2.0
885,1,303.0,0.0,19.0,0,0,14.0,30.0,29.0,2.0
886,3,412.0,0.0,24.0,1,2,674.0,23.45,146.0,2.0
887,1,81.0,1.0,26.0,0,0,8.0,30.0,59.0,0.0


In [20]:
test_encoder = OrdinalEncoder()
x_test[:,[1,2,6,8,9]] = test_encoder.fit_transform(x_test[:,[1,2,6,8,9]])
pd.DataFrame(x_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,3,206.0,1.0,34.5,0,0,152.0,7.8292,76.0,1.0
1,3,403.0,0.0,47.0,1,0,221.0,7.0,76.0,2.0
2,2,269.0,1.0,62.0,0,0,73.0,9.6875,76.0,1.0
3,3,408.0,1.0,27.0,0,0,147.0,8.6625,76.0,2.0
4,3,178.0,0.0,22.0,1,1,138.0,12.2875,76.0,2.0
...,...,...,...,...,...,...,...,...,...,...
413,3,353.0,1.0,21.0,0,0,267.0,8.05,76.0,2.0
414,1,283.0,0.0,39.0,0,0,324.0,108.9,22.0,0.0
415,3,332.0,1.0,38.5,0,0,346.0,7.25,76.0,2.0
416,3,384.0,1.0,21.0,0,0,220.0,8.05,76.0,2.0


In [21]:
# now bringing the features in a bit comparable forms by feature scalling
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train[:,[1,3,6,7,8]] = sc_x.fit_transform(x_train[:,[1,3,6,7,8]])
x_test[:,[1,3,6,7,8]] = sc_x.transform(x_test[:,[1,3,6,7,8]])

In [22]:
pd.DataFrame(x_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,3,-1.309268,1.0,-0.495519,1,0,0.917018,-0.50024,0.449244,2.0
1,1,-0.989744,0.0,0.720711,1,0,1.281353,0.788947,-1.405976,0.0
2,3,-0.354593,0.0,-0.191461,0,0,1.645689,-0.48665,0.449244,2.0
3,1,-0.67022,0.0,0.492668,1,0,-1.448669,0.422861,-2.13682,2.0
4,3,-1.671654,1.0,0.492668,0,0,0.662482,-0.484133,0.449244,2.0
...,...,...,...,...,...,...,...,...,...,...
884,2,0.401353,1.0,-0.115447,0,0,-1.189142,-0.384475,0.449244,2.0
885,1,-0.549425,0.0,-0.723562,0,0,-1.61836,-0.042213,-2.839555,2.0
886,3,-0.124692,0.0,-0.34349,1,2,1.675635,-0.174084,0.449244,2.0
887,1,-1.414477,1.0,-0.191461,0,0,-1.648305,-0.042213,-1.996273,0.0


In [23]:
# building linnear svm as first model
from sklearn.svm import SVC
model1 = SVC(kernel="linear",random_state=0)
model1.fit(x_train,y_train)

In [24]:
y_pred1 = model1.predict(x_test)

In [25]:
y_pred1

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [26]:
df = pd.DataFrame({'PassengerId': test_dataset["PassengerId"], 'Survived': y_pred1},index=None)
df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [27]:
df.to_csv("linear_svm_predictions",index = False)

In [28]:
#preparing model2 based on KNN classifier
from sklearn.neighbors import KNeighborsClassifier
model2 = KNeighborsClassifier(n_neighbors=5)
model2.fit(x_train,y_train)

In [29]:
y_pred2 = model2.predict(x_test)

In [32]:
df2 = pd.DataFrame({'PassengerId': test_dataset['PassengerId'],'Survived': y_pred2})
df2.to_csv("KNN_classification_model",index=False)

In [33]:
#preparing model3 based on random forest classification
from sklearn.ensemble import RandomForestClassifier
model3 = RandomForestClassifier(n_estimators=200,random_state=0)
model3.fit(x_train,y_train)

In [34]:
y_pred3 = model3.predict(x_test)

In [35]:
df3= pd.DataFrame({'PassengerId': test_dataset['PassengerId'],'Survived': y_pred3})
df3.to_csv("Random forest classification",index=False)

In [36]:
#preparing model4 based on kernel svm
from sklearn.svm import SVC
model4 = SVC(kernel="rbf",random_state=0)
model4.fit(x_train,y_train)

In [37]:
y_pred4= model4.predict(x_test)

In [38]:
df4= pd.DataFrame({'PassengerId': test_dataset['PassengerId'],'Survived': y_pred4})
df4.to_csv("kernel_svm",index=False)