In [13]:
import numpy as np
import pandas
# Read the training and test passenger tables from the working directory.
titanic, titanic_test = (
    pandas.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

In [14]:
# Display the first three rows of the training frame.
titanic.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [15]:
# Integer codes for the categorical columns, so the model receives numbers
# instead of strings.
SEX_CODES = {"male": 0, "female": 1}
EMBARKED_CODES = {"S": 0, "C": 1, "Q": 2}


def _encode_category(column, codes, fill_value=None):
    """Return `column` with its string labels replaced by the integers in `codes`.

    Parameters
    ----------
    column : pandas.Series
        Column holding the string labels.
    codes : dict
        Mapping from label to integer code. Labels absent from the mapping
        come out as NaN.
    fill_value : str, optional
        Label substituted for missing entries before encoding.

    Returns
    -------
    pandas.Series
        The encoded column. A column that is already numeric is returned
        unchanged, so re-running this cell does not wipe out earlier encoding.
    """
    if pandas.api.types.is_numeric_dtype(column):
        return column
    if fill_value is not None:
        column = column.fillna(fill_value)
    # map() builds a new column from the codes rather than writing integers
    # into the existing string column.
    return column.map(codes)


def clean_passengers(passengers, age_fill):
    """Return a cleaned copy of a passenger table.

    Missing ages are replaced with `age_fill`, Sex is encoded as
    male=0 / female=1, and Embarked has missing values set to "S" and is then
    encoded as S=0 / C=1 / Q=2. The input frame itself is not modified.
    """
    return passengers.assign(
        Age=passengers["Age"].fillna(age_fill),
        Sex=_encode_category(passengers["Sex"], SEX_CODES),
        Embarked=_encode_category(
            passengers["Embarked"], EMBARKED_CODES, fill_value="S"
        ),
    )


# The median age is taken from the training data and used to fill both tables.
age_median = titanic["Age"].median()
titanic = clean_passengers(titanic, age_median)
titanic_test = clean_passengers(titanic_test, age_median)

titanic_test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,2
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,0


In [16]:
# Out-of-fold predictions for the training data using linear regression.
from sklearn.linear_model import LinearRegression
# KFold is imported from sklearn.model_selection; the older
# sklearn.cross_validation module no longer exists in scikit-learn.
from sklearn.model_selection import KFold
# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Initialize our algorithm class
alg = LinearRegression()
# Three consecutive, unshuffled folds over the rows of the training frame.
# Without shuffling the split is already the same on every run, so no
# random_state is passed. The folds must stay in row order: the per-fold
# predictions are concatenated below and are later compared against
# titanic["Survived"] position by position.
kf = KFold(n_splits=3)
predictions = []
for train, test in kf.split(titanic[predictors]):
    # `train` and `test` are positional row indices for this fold.
    # Fit on the training rows only...
    train_predictors = titanic[predictors].iloc[train, :]
    train_target = titanic["Survived"].iloc[train]
    alg.fit(train_predictors, train_target)
    # ...and predict the rows that were held out.
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)
# Join the per-fold arrays into one array with one entry per training row.
predictions = np.concatenate(predictions, axis=0)

In [17]:
# Turn the regression outputs into class labels in place: values above 0.5
# become 1, values at or below 0.5 become 0.
above_cutoff = predictions > .5
at_or_below_cutoff = predictions <= .5
predictions[above_cutoff] = 1
predictions[at_or_below_cutoff] = 0

In [18]:
# Build the submission for the passengers in test.csv. The cross-validated
# `predictions` array describes the training rows, so it cannot be paired with
# the test passengers; instead the model is refit on every training row and
# used to score the test frame.
alg.fit(titanic[predictors], titanic["Survived"])
test_features = titanic_test[predictors].copy()
# Only Age and Embarked are imputed for the test frame earlier in the notebook,
# so fill any gap in Fare with the training median (a no-op if none is missing).
test_features["Fare"] = test_features["Fare"].fillna(titanic["Fare"].median())
test_scores = alg.predict(test_features)
submission = pandas.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        # Same 0.5 cut-off as for the training predictions, stored as integers.
        "Survived": (test_scores > .5).astype(int)
    })

In [20]:
# Accuracy of the thresholded predictions against the known training labels.
# Boolean mask: True wherever the prediction equals the actual outcome.
predictions_true = titanic["Survived"] == predictions
# Only the correctly predicted entries survive the mask.
true_predictions = predictions[predictions_true]
tot_predictions = len(predictions)  # number of predictions made
num_true = len(true_predictions)  # number of those that were correct
accuracy = num_true / tot_predictions  # fraction correct

accuracy

0.7833894500561167

In [22]:
# Write the submission table to SUBMISSION.csv, leaving out the index column.
submission.to_csv("SUBMISSION.csv", index=False)