In [13]:
import pandas as pd


In [14]:

#Read Files
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
testIds = test["PassengerId"]



In [15]:
#Print out firsrt 5 rows of Data (To visualize what we have)
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
#To Keep Relevant Features, we will make a function to clean our data and drop columns that are not so relevant

def cleanData(data):

    #Data that we will drop, Ticket #, PassengerID, Name likely does not have an effect on survival
    data = data.drop(["Ticket", "Cabin", "Name", "PassengerId"], axis = 1, errors='ignore')

    
    #Columns that contain missing values
    cols = ["SibSp", "Parch", "Fare", "Age"]

    #Going through columns and filling any missing values with the median of that column
    for column in cols:

        #Instead of using inplace = True, have if statement
        if column in data.columns:
            data[column] = data[column].fillna(data[column].median())

    #Embarked is where the boat left from, We are making a new Unknown "U" for the missing values in the data
    #Instead of using inplace = True, have if statement
    if 'Embarked' in data.columns:
        data['Embarked'] = data['Embarked'].fillna("U")

    return data

#Clean the data via calling out function)
data = cleanData(data)
test = cleanData(test)



In [17]:
data.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [30]:


#Encode data simply with pandas, we don't want categorical features

# Define Our categorical features
categoricalFeatures = ['Sex', 'Embarked']


#Encode categorical features
pd.get_dummies(data=data, columns=categoricalFeatures, prefix=categoricalFeatures)



data.head(5)




Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [33]:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


#y is what we wanna predict which is survival
y = data["Survived"]

#We will drop Survival because this is what we wanna predict 
X = data.drop("Survived", axis=1)

#Split data to our train and test variables for both x and y. 20% for testing, 80% training
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size =0.2, random_state=42)


#Should no longer see Survived Column, we are using everything else to predict survival
X.head(5)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [34]:

#Logistic Regression as we have 2 possible outcomes, 1 for survived, 0 for not survived
classifier = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

In [35]:

#Make our predictions
predictions = classifier.predict(X_val)


from sklearn.metrics import accuracy_score

#We got an accuracy score of about 81%
accuracy_score(y_val, predictions)

0.8100558659217877

In [27]:
#Preidctions we will submit (For Kaggle)
submissionPredictions = classifier.predict(test)

In [28]:
df = pd.DataFrame({

    "PassengerId": testIds.values,
    "Survived": submissionPredictions,
    
})

In [29]:
df.to_csv("Submission.csv", index=False)