Here is the solution for https://www.kaggle.com/c/titanic

In [151]:
import csv
import datetime as dt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

"""
'PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 
'Age', 'SibSp', 'Parch', 'Ticket','Fare', 'Cabin', 'Embarked'
"""

# Template for the prediction file name. predict() fills the six slots with
# the year, month, day, hour, minute and second of the current local time.
output_file_name_format = 'titanic-{:04}-{:02}-{:02}-{:02}-{:02}-{:02}.csv'

# Key of the label value read from every training row.
target = 'Survived'
# Keys fed to the model, in column order. 'SexN' is not read from the file;
# it is added to every row by sex_numerical().
features = ['Pclass', 'SexN', 'Age']

# load data from csv file
def read_data(file):
    """Lazily yield each row of a CSV file as a dict keyed by the header line.

    NUL characters are stripped from every line before it is handed to the
    csv parser, and undecodable bytes are kept as backslash escapes instead
    of raising.

    Args:
        file: Path of the CSV file to read. The first line is the header.

    Yields:
        One mapping per data row, from column name to the raw string value.
    """
    # newline='' is what the csv module requires: without it, universal
    # newline translation rewrites "\r\n" inside quoted fields.
    with open(file, 'r', errors='backslashreplace', newline='') as f:
        reader = csv.DictReader(line.replace('\0', '') for line in f)
        # The file stays open until the generator is exhausted or closed.
        yield from reader


def load_data(file):
    """Read all rows of ``file`` into a list and add the 'SexN' key to each.

    Args:
        file: Path of the CSV file, passed straight to read_data().

    Returns:
        A list of row mappings, each extended in place by sex_numerical().
    """
    rows = list(read_data(file))
    sex_numerical(rows)
    return rows


def print_data(data):
    """Print every element of ``data`` on its own line (debugging aid)."""
    for record in data:
        print(record)

        
def sex_numerical(data):
    """Add a numeric 'SexN' entry to every row, in place.

    'SexN' is 1 when the row's 'Sex' value is exactly 'male' and 0 for any
    other value. Nothing is returned; the rows themselves are modified.
    """
    for row in data:
        row['SexN'] = 1 if row['Sex'] == 'male' else 0

def numerical(val):
    """Convert ``val`` to a float, falling back to 0.0 when it cannot be parsed.

    Args:
        val: Value to convert, e.g. a raw CSV string or a number.

    Returns:
        ``float(val)`` when the conversion succeeds, otherwise ``0.0``.
        Unparseable text (such as an empty field) and non-numeric objects
        (such as ``None``) both map to the fallback.
    """
    try:
        return float(val)
    except (TypeError, ValueError):
        # TypeError covers None, which csv.DictReader uses for fields missing
        # from a short row; ValueError covers empty or non-numeric text.
        return 0.0

    
def split(Y, test_frac=0.33, random_state=None):
    """Return stratified train/test index arrays for the labels ``Y``.

    Args:
        Y: Sequence of labels; only its length and class distribution are
            used. The split is stratified on these labels.
        test_frac: Value passed to train_test_split as ``test_size``.
        random_state: Seed for the shuffle. When ``None`` (the default) a
            fresh seed in [10, 100) is drawn on every call, so the split is
            not reproducible; pass an int to pin it.

    Returns:
        A ``(train_index, test_index)`` pair of index arrays into ``Y``.
    """
    if random_state is None:
        # Default behaviour: a different seed, and therefore split, per call.
        random_state = np.random.randint(10, 100)
    train_index, test_index = train_test_split(np.arange(len(Y)),
                                               test_size=test_frac,
                                               random_state=random_state,
                                               stratify=Y)
    return train_index, test_index


def train(file):
    """Fit a logistic-regression classifier on the rows of ``file``.

    The rows are split into train and test subsets with split(); the model
    is fitted on the train subset and its score on the test subset is
    printed.

    Args:
        file: Path of the labelled CSV file, loaded through load_data().

    Returns:
        The fitted LogisticRegression instance.
    """
    rows = load_data(file)

    # One row per record, one column per entry of `features`; values that
    # cannot be parsed are replaced by numerical()'s fallback.
    X = np.array([tuple(numerical(row[name]) for name in features)
                  for row in rows])
    # Labels are used exactly as they were read from the file.
    Y = np.array([row[target] for row in rows])

    train_index, test_index = split(Y)

    clf = LogisticRegression()
    clf.fit(X[train_index], Y[train_index])

    score = clf.score(X[test_index], Y[test_index])
    print('score: {}'.format(score))
    return clf


def predict(file, clf):
    """Predict labels for the rows of ``file`` and write them to a new CSV.

    The output goes to a file in the current working directory whose name is
    built from ``output_file_name_format`` and the current local time. It
    has a ``PassengerId,Survived`` header followed by one line per input row.

    Args:
        file: Path of the CSV file to predict on, loaded through load_data().
        clf: Fitted classifier exposing ``predict(X)``.
    """
    data = load_data(file)
    X = [tuple(numerical(d[f]) for f in features) for d in data]
    predictions = clf.predict(X)

    now = dt.datetime.now()
    out_name = output_file_name_format.format(now.year, now.month, now.day,
                                              now.hour, now.minute, now.second)
    # The context manager closes the file even if writing fails part-way.
    with open(out_name, 'w') as out:
        out.write('PassengerId,Survived\n')
        # format() stringifies the prediction, so both string and numeric
        # class labels are written correctly.
        out.writelines('{},{}\n'.format(d['PassengerId'], p)
                       for d, p in zip(data, predictions))

In [152]:
# Main Function
# Train on the labelled file, then write predictions for the unlabelled one.
train_file = 'train.csv'
test_file = 'test.csv'
# NOTE(review): output_file is not referenced below; predict() builds its own
# timestamped file name from output_file_name_format.
output_file = 'output.csv'
# train() prints the held-out score and returns the fitted classifier.
clf = train(train_file)
predict(test_file, clf)

score: 0.7830508474576271
