In [1]:
#import libraries to support processing and analyzing the data
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
#load the contents of train.csv into a pandas dataframe that holds the training data
train_data_frame = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
#cleanse the dataframe by filling NaN values in the numeric columns with the median of each respective column
#numeric_only=True restricts the medians to numeric columns; without it newer pandas versions raise a
#TypeError as soon as the dataframe contains a text column, while non-numeric columns are left untouched either way
numeric_medians = train_data_frame.median(numeric_only=True)
train_data_frame = train_data_frame.fillna(numeric_medians)

#update categorical columns to have categorical codes
#note: a value that is still missing in one of these columns is encoded as -1 by cat.codes
for categorical_column in ('Sex', 'Embarked'):
    train_data_frame[categorical_column] = train_data_frame[categorical_column].astype("category").cat.codes

In [4]:
#pick the predictor columns (X) and the Survived outcome (y) from the training dataframe for the logistic regression
feature_columns = ['Sex', 'Fare', 'SibSp', 'Age']
X_train = train_data_frame[feature_columns]
y_train = train_data_frame['Survived']

#standardize the predictors: the scaler learns its statistics from X_train and applies them in the same step
standard_scaler = preprocessing.StandardScaler()
X_train = standard_scaler.fit_transform(X_train)

In [5]:
#hold out 15% of the training rows so different combinations of variables can be evaluated for the logistic regression
#random_state is fixed so the same rows land in each part on every run
split_parts = train_test_split(X_train, y_train, test_size=0.15, random_state=6)
X_train_split, X_test_split, y_train_split, y_test_split = split_parts

In [6]:
#build a logistic regression model with the lbfgs solver and fit it on the training part of the split
#the fit call stays as the last expression so the notebook displays the fitted estimator
logistic_regression = LogisticRegression(solver='lbfgs')
logistic_regression.fit(X=X_train_split, y=y_train_split)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [7]:
#run the fitted model on the held-out part of the split to predict the Survived outcome for those rows
logistic_predictions_training = logistic_regression.predict(X=X_test_split)

In [8]:
#compute the model's score on the held-out part of the split by comparing its predictions for X_test_split with y_test_split
correct_prediction_score = logistic_regression.score(X=X_test_split, y=y_test_split)
#display the score as the cell output
correct_prediction_score

0.8134328358208955

In [9]:
#load the contents of test.csv into a pandas dataframe that holds the test data
test_data_frame = pd.read_csv(filepath_or_buffer='test.csv')

In [10]:
#repeat the cleansing and transformation process for the testing data
#fill NaN values in the numeric columns with each column's median; numeric_only=True restricts the medians to
#numeric columns, because newer pandas versions raise a TypeError when asked for the median of a text column
test_numeric_medians = test_data_frame.median(numeric_only=True)
test_data_frame = test_data_frame.fillna(test_numeric_medians)

#replace the categorical columns with their categorical codes (a value still missing is encoded as -1 by cat.codes)
for categorical_column in ('Sex', 'Embarked'):
    test_data_frame[categorical_column] = test_data_frame[categorical_column].astype("category").cat.codes

In [11]:
#create X dataframe from the testing dataframe with the desired variables for regression
feature_columns = ['Sex', 'Fare', 'SibSp', 'Age']
X_test = test_data_frame[feature_columns]

#standardize the test features with the statistics of the TRAINING features
#the model is fit on training data scaled by the training mean/std, so the test data has to go through
#that same transformation; fitting a separate scaler on the test set would shift and stretch the inputs
#differently from what the model was trained on
training_feature_scaler = preprocessing.StandardScaler().fit(train_data_frame[feature_columns])
X_test = training_feature_scaler.transform(X_test)

In [12]:
#refit the logistic regression on the complete training data (fit returns the fitted model itself),
#then predict the values for the testing data from that fit
logistic_predictions = logistic_regression.fit(X_train, y_train).predict(X_test)

In [13]:
#build the submission table: PassengerId from the test dataframe next to the predicted Survived column
logistic_predictions_data_frame = pd.DataFrame({'Survived': logistic_predictions})
submission_columns = [test_data_frame['PassengerId'], logistic_predictions_data_frame]
prediction_data_frame = pd.concat(submission_columns, axis='columns')

In [14]:
#write the predictions to a csv file for submission, leaving out the dataframe index
prediction_data_frame.to_csv(path_or_buf="prediction_output18.csv", index=False)