In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Loading And Analysis of Datasets

In [2]:
training_df = pd.read_csv("training_data.csv")
testing_df = pd.read_csv("testing_data.csv")

In [3]:
training_df.shape, testing_df.shape

((668, 11), (223, 10))

In [4]:
training_df

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0000,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.0500,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0000,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.0750,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
663,2,"Ilett, Miss. Bertha",female,17.0,0,0,SO/C 14885,10.5000,,S,1
664,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.7500,,Q,0
665,3,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S,1
666,3,"Strandberg, Miss. Ida Sofia",female,22.0,0,0,7553,9.8375,,S,0


In [5]:
training_df.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
count,668.0,536.0,668.0,668.0,668.0,668.0
mean,2.296407,29.70056,0.528443,0.407186,32.064552,0.402695
std,0.831638,14.240257,1.080327,0.854695,45.320835,0.490808
min,1.0,0.67,0.0,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.925,0.0
50%,3.0,29.0,0.0,0.0,14.75,0.0
75%,3.0,38.25,1.0,0.0,31.275,1.0
max,3.0,80.0,8.0,6.0,512.3292,1.0


In [6]:
training_df.isnull().sum(axis = 0)

Pclass        0
Name          0
Sex           0
Age         132
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       514
Embarked      1
Survived      0
dtype: int64

In [7]:
print(len(set(training_df["Embarked"])))
set(training_df["Embarked"])

4


{'C', 'Q', 'S', nan}

# Cleaning of Data

In [8]:
def sex_to_num(s):
    if s == "male":
        return 0
    else:
        return 1
    
def change_Embarked_to_num(s):
    if s == 'S':
        return 1
    elif s == 'C':
        return 2
    else:
        return 3

def clean_data(df):
    ## Remove irrelevant columns
    del df["Name"]
    del df["Cabin"]  ## Coz lots's of data is NAN
    del df["Ticket"]
    del df["Fare"]
    
    ## Fill NAN Values
    df.Embarked.fillna("S", inplace = True)
    MeanAge = df.Age.mean(axis = 0)
    df.Age.fillna(MeanAge, inplace = True)    
    
    ## Convert String to Integer
    df["Sex"] = df["Sex"].apply(sex_to_num)
    df["Embarked"] = df["Embarked"].apply(change_Embarked_to_num)
    return df

In [9]:
training_df = clean_data(training_df)
testing_df = clean_data(testing_df)

In [10]:
training_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,Survived
0,2,1,29.00000,1,0,1,1
1,3,0,29.70056,0,0,1,0
2,2,0,39.00000,0,0,1,0
3,3,1,29.00000,0,4,1,0
4,3,0,25.00000,0,0,1,0
...,...,...,...,...,...,...,...
663,2,1,17.00000,0,0,1,1
664,3,0,29.70056,0,0,3,0
665,3,0,32.00000,0,0,1,1
666,3,1,22.00000,0,0,1,0


In [11]:
training_df.isnull().sum(axis = 0)

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
Survived    0
dtype: int64

# GetBack Data into Numpy array

In [12]:
training_data = pd.DataFrame(training_df).to_numpy()
testing_data = pd.DataFrame(testing_df).to_numpy()

In [13]:
training_data.shape, testing_data.shape

((668, 7), (223, 6))

In [14]:
type(training_data)

numpy.ndarray

In [15]:
x_train = training_data[ : , 0 : -1]
y_train = training_data[ : , -1]
x_test = testing_data[ : , : ]

In [16]:
x_train.shape, y_train.shape, x_test.shape

((668, 6), (668,), (223, 6))

# Apply Logistic regression Model

In [17]:
clf = LogisticRegression(solver = 'saga')

In [18]:
clf.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
clf.score(x_train, y_train)

0.8008982035928144

In [20]:
y_test_pred = clf.predict(x_test)

In [21]:
np.savetxt("Predictions.csv", y_test_pred)

In [22]:
## Got an accuracy of 0.843% on testing data