In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [2]:
# Column names assigned to the CSV tables when they are loaded.
#   'col_name'     : every column of the training table.
#   'test_col'     : columns of the test table, i.e. the training columns without the label.
#   'expected_col' : columns of the expected-prediction table ('id' plus the 'survival' label).
col_name = [
    'id', 'survival', 'p_class', 'name', 'sex', 'age',
    'sib_sp', 'par_ch', 'ticket_no', 'fare', 'cabin_no', 'embarked',
]
test_col = [column for column in col_name if column != 'survival']
expected_col = ['id', 'survival']

In [3]:
# 'train.csv' contains the data used to train the hypothesis.
# 'test.csv' contains the data on which the hypothesis is tested.
# 'gender_submission.csv' gives the expected prediction ('survival') for the rows of 'test.csv'.
# The first row of each file (presumably its header) is skipped; the column
# names come from 'col_name', 'test_col' and 'expected_col' instead.
train_df = pd.read_csv(
    'train.csv',
    sep=',',
    quotechar='"',
    names=col_name,
    skiprows=[0],
)
test_df = pd.read_csv(
    'test.csv',
    sep=',',
    quotechar='"',
    names=test_col,
    skiprows=[0],
)
expected_df = pd.read_csv(
    'gender_submission.csv',
    sep=',',
    names=expected_col,
    skiprows=[0],
)

In [4]:
# Split the training table into the two pieces the classifier needs:
#   'train_outcome' : the label the hypothesis must predict ('survival').
#   'train_data'    : the features the prediction is based on ('p_class', 'age', 'sex').
# The same three features are extracted from the test table into 'test_data'.
#
# Training rows are dropped only when one of the columns that is actually used
# is missing. A bare dropna() would also discard every passenger whose
# 'cabin_no' or 'embarked' is empty, although those columns never reach the model.
# Test rows are never dropped; their missing ages are still NaN after this cell.
#
# Logistic regression needs numerical inputs, so 'sex' is encoded as
# 1 for 'male' and 0 otherwise. Comparing against the literal 'male' keeps the
# encoding well defined (integer 0/1) even if a table happens to contain no male rows.
train_df = train_df.dropna(subset=['survival', 'p_class', 'age', 'sex'])

train_data = pd.DataFrame({
    'p_class': train_df['p_class'],
    'age': train_df['age'],
    'sex': (train_df['sex'] == 'male').astype(int),
})
train_outcome = train_df['survival']

test_data = pd.DataFrame({
    'p_class': test_df['p_class'],
    'age': test_df['age'],
    'sex': (test_df['sex'] == 'male').astype(int),
})

In [5]:
# Some rows of the test set have no age.
# For those rows the age is assumed to be the median training age of the
# passengers with the same 'sex' and 'p_class', truncated to an integer.

# Fallback used only when a (sex, class) group has no training ages at all.
overall_median_age = train_data['age'].median()

for data_sex in [0, 1]:
    for data_class in [1, 2, 3]:
        # Row masks are built from the full frames so that each mask shares
        # the index of the frame it is applied to.
        train_group = (train_data['sex'] == data_sex) & (train_data['p_class'] == data_class)
        test_group = (test_data['sex'] == data_sex) & (test_data['p_class'] == data_class)

        # Median training age of this (sex, class) group; NaN when the group is empty.
        group_median_age = train_data.loc[train_group, 'age'].median()
        if pd.isna(group_median_age):
            group_median_age = overall_median_age

        # Built-in int() replaces the np.int alias, which no longer exists in NumPy >= 1.24.
        missing_age = int(group_median_age)

        # Only the missing ages of this group are written; all other cells stay untouched.
        test_data.loc[test_group & test_data['age'].isna(), 'age'] = missing_age

In [6]:
# Features are rescaled so that no single feature dominates the others
# merely because of its numeric range.
# MinMaxScaler maps x to (x - xmin) / (xmax - xmin); xmin and xmax are learned
# from the training set only and then reused unchanged for the test set.
scaler = MinMaxScaler().fit(train_data)

train_data, test_data = scaler.transform(train_data), scaler.transform(test_data)