In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
tested_output = pd.read_csv('gender_submission.csv')

## <u>Data description:

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
tested_output.columns

Index(['PassengerId', 'Survived'], dtype='object')

In [8]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [9]:
train.shape

(891, 12)

In [10]:
test.shape

(418, 11)

In [11]:
tested_output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


## <u>Filling Null Values:

In [12]:
#finding Null columns and the number of null entries in each in the training dataset
columns_with_null = train.columns[train.isnull().any()].tolist()
null_counts = train[columns_with_null].isnull().sum()
print("Null value counts per column in training dataset:")
print(null_counts)

Null value counts per column in training dataset:
Age         177
Cabin       687
Embarked      2
dtype: int64


In [13]:
#finding Null columns and the number of null entries in each in the testing dataset
columns_with_null = test.columns[test.isnull().any()].tolist()
null_counts = test[columns_with_null].isnull().sum()
print("Null value counts per column in testing dataset:")
print(null_counts)

Null value counts per column in testing dataset:
Age       86
Fare       1
Cabin    327
dtype: int64


#### As we can see, train dataset has null values in Age, Cabin and Embarked, whereas test has the same in Age, Fare and Cabin.


#### So, to fill the null values of the dataset, for age and fare, we will average the values, for cabin and embarked we will use the mode for getting the average.

In [14]:
train['Age'].fillna(train['Age'].mean(), inplace=True)
test['Age'].fillna(test['Age'].mean(), inplace=True)
test['Fare'].fillna(test['Fare'].mean(), inplace=True)

In [15]:
print(train['Cabin'].value_counts())
test['Cabin'].value_counts()

C23 C25 C27    4
G6             4
B96 B98        4
D              3
E101           3
              ..
E12            1
A16            1
D10 D12        1
E49            1
C46            1
Name: Cabin, Length: 147, dtype: int64


B57 B59 B63 B66    3
C31                2
F4                 2
B45                2
C80                2
                  ..
D15                1
E52                1
D21                1
D22                1
C46                1
Name: Cabin, Length: 76, dtype: int64

In [16]:
def fillNullValInCabin(df, value1, value2):
    toFill = np.array([])
    value_counts = df['Cabin'].value_counts()
    
    for value, count in value_counts.items():
        if count == value1 or count == value2:
            toFill = np.append(toFill, value)
    
    random_index = np.random.randint(0, len(toFill))
    df['Cabin'].fillna(toFill[random_index], inplace=True)

In [17]:
fillNullValInCabin(train, 4, 3) 
fillNullValInCabin(test, 2, 3)

In [18]:
train['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [19]:
train['Embarked'].fillna('S', inplace=True)

## <u>Encoding data:

In [20]:
# since name isn't a factor which can help in learning of the model, as name wouldn't tell which person would survive
# and which wouldn't, hence name is removed from the dataset

In [21]:
train = train.drop('Name', axis =1)
test = test.drop('Name', axis =1)

#### OneHotEncoding columns Sex and Embarked

In [22]:
test['Embarked'].unique()

array(['Q', 'S', 'C'], dtype=object)

In [23]:
train_encoded = pd.concat([train, pd.get_dummies(train['Embarked'], prefix= 'Embarked'), pd.get_dummies(train['Sex'], prefix= 'Sex')], axis=1)
train_encoded.drop(['Embarked','Sex'], axis =1, inplace =True)
test_encoded = pd.concat([test, pd.get_dummies(test['Embarked'], prefix= 'Embarked'), pd.get_dummies(test['Sex'], prefix= 'Sex')], axis=1)
test_encoded.drop(['Embarked','Sex'], axis =1, inplace =True)

In [24]:
train_encoded.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,1,0,3,22.0,1,0,A/5 21171,7.25,F2,0,0,1,0,1
1,2,1,1,38.0,1,0,PC 17599,71.2833,C85,1,0,0,1,0


In [25]:
test_encoded.head(2)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,892,3,34.5,0,0,330911,7.8292,E34,0,1,0,0,1
1,893,3,47.0,1,0,363272,7.0,E34,0,0,1,1,0


In [26]:
encoder = LabelEncoder()
train_encoded['cabin_encoded'] =encoder.fit_transform(train_encoded['Cabin']) 
train_encoded.drop('Cabin', axis=1, inplace=True)
test_encoded['cabin_encoded'] =encoder.fit_transform(test_encoded['Cabin'])
test_encoded.drop('Cabin', axis=1, inplace=True)

In [27]:
# since passenger id is unique, we don't use it to learn in a model because of no no similarity with each other

In [28]:
train_encoded.drop('PassengerId',axis=1,inplace = True)
test_passengerID = test_encoded['PassengerId']
test_encoded.drop('PassengerId',axis=1,inplace = True)

In [29]:
len(train_encoded['Ticket'].unique())/train_encoded.shape[0]

0.7643097643097643

In [30]:
len(test_encoded['Ticket'].unique())/test_encoded.shape[0]

0.868421052631579

In [31]:
#since most of the tickets are unique, we drop them too, following same rule as passenger id

In [32]:
train_encoded.drop('Ticket',axis=1,inplace = True)
test_encoded.drop('Ticket',axis=1,inplace = True)

In [33]:
train_encoded.shape

(891, 12)

In [34]:
test_encoded.shape

(418, 11)

In [35]:
train_encoded.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,cabin_encoded
0,0,3,22.0,1,0,7.25,0,0,1,0,1,141
1,1,1,38.0,1,0,71.2833,1,0,0,1,0,81
2,1,3,26.0,0,0,7.925,0,0,1,1,0,141
3,1,1,35.0,1,0,53.1,0,0,1,1,0,55
4,0,3,35.0,0,0,8.05,0,0,1,0,1,141


In [36]:
test_encoded.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,cabin_encoded
0,3,34.5,0,0,7.8292,0,1,0,0,1,61
1,3,47.0,1,0,7.0,0,0,1,1,0,61
2,2,62.0,0,0,9.6875,0,1,0,0,1,61
3,3,27.0,0,0,8.6625,0,0,1,0,1,61
4,3,22.0,1,1,12.2875,0,0,1,1,0,61


In [37]:
X = train_encoded.drop('Survived', axis=1)
y = train_encoded['Survived']

In [38]:
y.sum()/y.shape # 38 : 62 -> survive: not survived ratio

array([0.38383838])

## <u>Scaling:

In [39]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,cabin_encoded
0,3,22.0,1,0,7.25,0,0,1,0,1,141
1,1,38.0,1,0,71.2833,1,0,0,1,0,81
2,3,26.0,0,0,7.925,0,0,1,1,0,141
3,1,35.0,1,0,53.1,0,0,1,1,0,55
4,3,35.0,0,0,8.05,0,0,1,0,1,141


In [40]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [42]:
scaler = StandardScaler()

In [43]:
scaler.fit(X_train)

StandardScaler()

In [44]:
X_train_scaled = scaler.transform(X_train)

In [45]:
X_test_scaled = scaler.transform(X_test)

## <u>Logistic Regression Model:

In [46]:
logreg = LogisticRegression()
logreg.fit(X_train_scaled,y_train)

LogisticRegression()

In [47]:
y_pred = logreg.predict(X_test_scaled)
accuracy_score(y_pred, y_test)

0.8555555555555555

#### Testing the real test file given

In [48]:
test_encoded_scaled = scaler.transform(test_encoded)

In [49]:
y_logistic_predicted = logreg.predict(test_encoded_scaled)

In [50]:
tested_output_value = np.array(tested_output['Survived'])

In [51]:
accuracy_score(y_logistic_predicted, tested_output_value)

0.9497607655502392

## <u>SVM Model:

In [52]:
x_train, x_test, yo_train, yo_test = train_test_split(X, y, test_size=0.1, random_state=24)

In [53]:
scaler.fit(x_train)

StandardScaler()

In [54]:
x_train_scaled = scaler.transform(x_train)

In [55]:
x_test_scaled = scaler.transform(x_test)

In [56]:
model2 = SVC()

In [57]:
model2.fit(x_train_scaled, yo_train)

SVC()

In [58]:
yo_pred = model2.predict(x_test_scaled)

In [59]:
accuracy_score(yo_pred,yo_test)

0.8666666666666667

#### Testing the real test file given

In [60]:
Test_encoded_scaled = scaler.transform(test_encoded)

In [61]:
y_SVM_predicted = model2.predict(Test_encoded_scaled)

In [62]:
accuracy_score(y_SVM_predicted, tested_output_value)

0.9569377990430622