In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training and test sets from CSV files in the working directory.
data_train, data_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [3]:
# Preview the first rows to see which columns the training set provides.
data_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# (number of rows, number of columns) of the training frame.
data_train.shape

(891, 12)

In [5]:
# Mean of the 0/1 Survived flag per sex, i.e. the survival rate of each group.
data_train.pivot_table(values="Survived", index="Sex", aggfunc="mean")

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [6]:
# Survival rate per ticket class (mean of the 0/1 Survived flag).
data_train.pivot_table(values="Survived", index="Pclass", aggfunc="mean")

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


In [7]:
# Survival rate per port of embarkation (mean of the 0/1 Survived flag).
data_train.pivot_table(values="Survived", index="Embarked", aggfunc="mean")

Unnamed: 0_level_0,Survived
Embarked,Unnamed: 1_level_1
C,0.553571
Q,0.38961
S,0.336957


In [8]:
# Check whether the embarkation-port trend is really a sex-mix effect:
# pair one-hot sex indicators with each passenger's port of embarkation.
temp = pd.get_dummies(data_train["Sex"], prefix="Sex").assign(
    Start=data_train["Embarked"]
)
temp.columns

Index(['Sex_female', 'Sex_male', 'Start'], dtype='object')

In [9]:
# Mean of each sex indicator per port = share of women / men boarding there.
temp.pivot_table(index="Start", aggfunc="mean")

Unnamed: 0_level_0,Sex_female,Sex_male
Start,Unnamed: 1_level_1,Unnamed: 2_level_1
C,0.434524,0.565476
Q,0.467532,0.532468
S,0.315217,0.684783


In [10]:
# Evaluate the quality of the Age column: describe() counts only non-null
# values, so a count below the number of rows means some ages are missing.
data_train["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [11]:
# Dealing with NAs and binning the ages
def age_splits(df, splits, col_names):
    """Label each row with an age bracket, giving unknown ages their own bracket.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Age`` column. It is not modified.
    splits : sequence of numbers
        Increasing bin edges handed to ``pd.cut``. One bin has to contain -1,
        the placeholder used for missing ages.
    col_names : sequence of str
        One label per bin, i.e. ``len(splits) - 1`` entries.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which missing ``Age`` values are replaced by -1 and
        a categorical ``Age_Type`` column holds the bracket label.
    """
    # Work on a copy so the caller's frame keeps its original (NaN) ages;
    # writing into ``df`` directly would overwrite the raw data as a side effect.
    binned = df.copy()
    binned["Age"] = binned["Age"].fillna(-1)
    binned["Age_Type"] = pd.cut(x=binned["Age"], bins=splits, labels=col_names)
    return binned
# pd.cut bins are right-closed by default: (-2, 0] catches the -1 placeholder
# that age_splits writes for missing ages, the other bins cover real ages.
col_names = ["Missing", "Toddler", "Kid", "Young", "Mature", "Senior"]
splits = [-2, 0, 5, 15, 30, 60, 100]
data_train, data_test = (
    age_splits(frame, splits, col_names) for frame in (data_train, data_test)
)
data_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_Type'],
      dtype='object')

In [12]:
# Survival rate of male passengers within each of the age brackets created above
(
    data_train
    .loc[data_train["Sex"] == "male"]
    .pivot_table(values="Survived", index="Age_Type", aggfunc="mean")
)

Unnamed: 0_level_0,Survived
Age_Type,Unnamed: 1_level_1
Missing,0.129032
Toddler,0.652174
Kid,0.352941
Young,0.146919
Mature,0.213115
Senior,0.105263


In [13]:
# Converting data points into numerical inputs
def parameter_redefining(df, col_name):
    """Append one-hot indicator columns for ``col_name`` to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``. It is not modified.
    col_name : str
        Column to encode; the new columns are named ``<col_name>_<value>``.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the indicator columns. The source column is kept.
    """
    dummies = pd.get_dummies(data=df[col_name], prefix=col_name)
    # Drop indicators left over from an earlier call so that re-running the
    # cell does not produce duplicate column labels (which would make
    # df["Sex_female"] return two columns and double features in the model).
    stale = [name for name in dummies.columns if name in df.columns]
    return pd.concat([df.drop(columns=stale), dummies], axis=1)
# Encode the same categorical columns, in the same order, on both frames so
# that train and test end up with matching indicator columns.
for categorical in ("Sex", "Pclass", "Age_Type", "Embarked"):
    data_train = parameter_redefining(data_train, categorical)
    data_test = parameter_redefining(data_test, categorical)
data_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_Type',
       'Sex_female', 'Sex_male', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Age_Type_Missing', 'Age_Type_Toddler', 'Age_Type_Kid',
       'Age_Type_Young', 'Age_Type_Mature', 'Age_Type_Senior', 'Embarked_C',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [15]:
# Logistic-regression classifier created with its default settings.
lr = LogisticRegression()

In [16]:
# Feature set that includes the embarkation-port indicators
numeric_column = [
    'Sex_female',
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Age_Type_Missing', 'Age_Type_Toddler', 'Age_Type_Kid',
    'Age_Type_Young', 'Age_Type_Mature', 'Age_Type_Senior',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]

In [17]:
# Survival target and the feature matrix that includes the Embarked_* columns.
y = data_train['Survived']
X = data_train.loc[:, numeric_column]

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1004,
)

In [19]:
# Fit on the training split, predict the hold-out rows and report accuracy.
predictions = lr.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy of the model is {accuracy*100} %")

Accuracy of the model is 78.2122905027933 %


In [20]:
# Feature set without the embarkation-port indicators
numeric_column2 = [
    'Sex_female',
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Age_Type_Missing', 'Age_Type_Toddler', 'Age_Type_Kid',
    'Age_Type_Young', 'Age_Type_Mature', 'Age_Type_Senior',
]

In [21]:
# Same target; the feature matrix now leaves out the Embarked_* columns.
Y = data_train['Survived']
X = data_train.loc[:, numeric_column2]

In [22]:
# Same 80/20 split and seed as the earlier split of the full feature set.
train_X, test_X, train_Y, test_Y = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1004,
)

In [23]:
# Refit on the reduced feature set and score on the hold-out rows.
predictions = lr.fit(train_X, train_Y).predict(test_X)
accuracy = accuracy_score(test_Y, predictions)
print(f"Accuracy of the model is {accuracy*100} %")
# Dropping the embarkation columns does not lower the hold-out accuracy,
# so their contribution is negligible. Hence ignore them.

Accuracy of the model is 79.88826815642457 %


In [24]:
# 7-fold cross-validation on the full training data (reduced feature set).
val_score = cross_val_score(lr, X, Y, cv=7)
cross_accuracy = val_score.mean() * 100
print(f"Cross validated accuracy is {cross_accuracy} %")
# The cross-validated accuracy is close to the single-split accuracy. Hence proceed.

Cross validated accuracy is 78.89712725641436 %


In [25]:
# Train a fresh model on every labelled row, then predict the unlabeled test set
# using the same feature columns.
lr = LogisticRegression().fit(X, Y)
test_features = data_test.loc[:, numeric_column2]
test_predictions = lr.predict(test_features)

In [26]:
# Creating the output file: one row per test passenger with its predicted label.
test_ids = data_test["PassengerId"]
submission = {'PassengerId': test_ids, 'Survived': test_predictions}
submission_df = pd.DataFrame(submission)
# DataFrame.to_csv returns None when it is given a path, so the previous
# `submission_file = submission_df.to_csv(...)` always stored None. Keep the
# path that was written instead, which is what the name suggests.
submission_file = 'Titanic_Submission.csv'
submission_df.to_csv(submission_file, index=False)