In [67]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [68]:
# Read the training and test sets from the working directory.
data = pd.read_csv(filepath_or_buffer="train.csv")
test = pd.read_csv(filepath_or_buffer="test.csv")
# Keep the test passenger ids in their own variable; they are needed
# again when the submission frame is built.
test_ids = test["PassengerId"]

In [69]:
def clean(data):
    """Prepare a passenger frame for modelling, modifying it in place.

    Drops the PassengerId, Name, Ticket and Cabin columns, fills missing
    Age and Fare values with the median of that column in this same frame,
    and fills missing Embarked values with "U" (unknown).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. Must contain the Age, Fare and Embarked columns.
        It is mutated in place; nothing is returned.
    """
    # errors="ignore" keeps the call safe when the cell is re-run on a frame
    # whose columns were already dropped.
    data.drop(
        columns=["PassengerId", "Name", "Ticket", "Cabin"],
        inplace=True,
        errors="ignore",
    )
    # Assign the filled column back instead of calling
    # data[col].fillna(..., inplace=True): that form acts on an intermediate
    # Series, which pandas deprecates and which leaves the frame unfilled
    # when Copy-on-Write is enabled.
    for col in ("Age", "Fare"):
        data[col] = data[col].fillna(data[col].median())
    # replace missing values in the Embarked column by U = unknown
    data["Embarked"] = data["Embarked"].fillna("U")
        
# Clean the training frame first, then the test frame, both in place.
for frame in (data, test):
    clean(frame)

In [70]:
# Replace categorical values by numerical values.
# The encoder is fitted on the training frame only and that same mapping is
# then applied to the test frame. Refitting on the test frame would build the
# mapping from whichever categories happen to be present there, so a category
# could receive a different code than the one the model is trained on.
le = LabelEncoder()
cols = ["Sex", "Embarked"]
for col in cols:
    data[col] = le.fit_transform(data[col])
    # transform() raises ValueError on a category that was not seen in fit,
    # which is preferable to silently assigning it a mismatched code.
    test[col] = le.transform(test[col])

data.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [71]:
# Features are every column except the target; the target is "Survived".
X = data.drop(columns="Survived")
Y = data["Survived"]
# Hold out a quarter of the rows as a validation set, with a fixed seed.
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

In [72]:
# Build the logistic-regression classifier, then fit it on the training split.
clf = LogisticRegression(max_iter=1000, random_state=0)
clf = clf.fit(X_train, Y_train)

In [73]:
# Predict on the validation split and report the fraction predicted correctly.
predictions = clf.predict(X=X_val)
accuracy_score(y_true=Y_val, y_pred=predictions)

0.8071748878923767

In [74]:
# Predict on the test frame with the fitted classifier.
submission_preds = clf.predict(X=test)

In [75]:
# Pair each saved test passenger id with its predicted label.
df = pd.DataFrame(
    dict(PassengerId=test_ids.values, Survived=submission_preds)
)

In [76]:
# Write the submission file without the index column.
df.to_csv(path_or_buf="submission.csv", index=False)