In [144]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
import math

In [145]:
# Load the training split from the working directory into a pandas DataFrame.
train_csv_path = 'train.csv'
df = pd.read_csv(train_csv_path)

In [146]:
# Build the feature frame: remove the label (`Survived`), the row identifier and the
# other columns that are not used as model inputs. `df` itself is left unchanged.
non_feature_columns = ['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin']
X_train = df.drop(non_feature_columns, axis=1)
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [147]:
# Target vector: the `Survived` column of the training frame, one label per row.
Y_train = df.loc[:, 'Survived']
Y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

One-Hot Encoding

In [148]:
# Categorical columns to expand into one indicator column per category.
columns_to_encode = ['Sex', 'Embarked']

# get_dummies replaces each listed column with `<column>_<category>` indicator
# columns; every other column passes through unchanged.
X_train = pd.get_dummies(data=X_train, columns=columns_to_encode)
X_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,26.0,0,0,7.9250,1,0,0,0,1
3,1,35.0,1,0,53.1000,1,0,0,0,1
4,3,35.0,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,1,0,0,1
887,1,19.0,0,0,30.0000,1,0,0,0,1
888,3,,1,2,23.4500,1,0,0,0,1
889,1,26.0,0,0,30.0000,0,1,1,0,0


Handling NaN values in the Age column with SimpleImputer

In [149]:
# Mean of the Age column. Series.mean skips NaN entries, so this is the value a
# mean-strategy imputer substitutes for the missing ages.
X_train['Age'].mean()

29.69911764705882

In [150]:
from sklearn.impute import SimpleImputer

# Mean imputation: each NaN is replaced by the mean of its column.
imputer = SimpleImputer(strategy='mean')

# Learn the per-column means from the training features and fill the gaps.
# fit_transform returns a NumPy array, so X_train loses its column labels here.
X_train = imputer.fit_transform(X_train)

# NOTE: the test set is deliberately NOT transformed in this cell. `X_test` does not
# exist until the test-data preprocessing cell further down, so calling
# imputer.transform(X_test) here raises NameError on a fresh kernel. The fitted
# `imputer` is reused there instead.

X_train



array([[ 3.        , 22.        ,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        , 38.        ,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 3.        , 26.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 3.        , 29.69911765,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        , 26.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 3.        , 32.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ]])

In [151]:
from sklearn.linear_model import LogisticRegression

# Aliases for the imputed feature matrix and the label vector.
X, y = X_train, Y_train

# Build the logistic-regression classifier (solver capped at 10000 iterations) and
# fit it; fit() returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(max_iter=10000).fit(X, y)

# Predicted labels for the same rows the model was trained on.
predictions = model.predict(X)

# score() reports mean accuracy. It is evaluated on the training data here, so the
# printed number is an in-sample figure, not an estimate for unseen passengers.
accuracy = model.score(X, y)
print('Accuracy:', accuracy)

Accuracy: 0.8013468013468014


Preprocessing Test Data

In [152]:
# Load the test split. The label column is not dropped here (the test file is
# expected not to contain it); only the identifier and the other non-feature
# columns are removed.
df = pd.read_csv('test.csv')

X_test = df.drop(columns = ['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Apply the same one-hot encoding that was used for the training features.
columns_to_encode = ['Sex', 'Embarked']
X_test = pd.get_dummies(X_test, columns = columns_to_encode)

# get_dummies only creates indicator columns for categories present in *this* file,
# so the test frame can end up with fewer or differently ordered columns than the
# training frame. Align it to the columns the imputer was fitted on: indicators that
# are absent here are added as all-zero columns, unexpected ones are dropped.
# feature_names_in_ is only set when the imputer was fitted on a DataFrame, hence
# the getattr guard; without it the frame is passed through as before.
train_columns = getattr(imputer, 'feature_names_in_', None)
if train_columns is not None:
    X_test = X_test.reindex(columns = train_columns, fill_value = 0)

# Fill missing values with the column means learned from the training data.
X_test = imputer.transform(X_test)

X_test


array([[ 3.        , 34.5       ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 3.        , 47.        ,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 2.        , 62.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 3.        , 38.5       ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 3.        , 29.69911765,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 3.        , 29.69911765,  1.        , ...,  1.        ,
         0.        ,  0.        ]])

In [153]:
# Hard class predictions for every row of the preprocessed test set.
predictions = model.predict(X_test)

# predict_proba returns one column per class; keep column index 1, i.e. the
# probability assigned to the second entry of model.classes_.
class_probabilities = model.predict_proba(X_test)
probabilities = class_probabilities[:, 1]


predictions

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [154]:
# Start the submission frame from the passenger identifiers in the test file, so its
# rows are in the same order as the rows the predictions were computed from. This
# removes the dependency on an already-existing submission.csv stored at a
# machine-specific absolute path.
df = pd.read_csv('test.csv', usecols = ['PassengerId'])

In [156]:
# Attach the test-set predictions as the `Survived` column. `predictions` is a NumPy
# array, so the assignment is positional and needs one entry per row of `df`.
df["Survived"] = predictions

# Write the submission to the working directory using a relative path (like
# train.csv / test.csv) instead of a user-specific absolute Windows path, so the
# notebook runs on other machines. index=False keeps the row index out of the file.
df.to_csv("submission.csv", index = False)



In [None]:
# NOTE(review): this cell repeats the assignment and write of the preceding cell and
# has never been executed; it is a candidate for deletion.
# Attach the predictions positionally as the `Survived` column.
df["Survived"] = predictions

# Relative path instead of a user-specific absolute Windows path, so the notebook is
# portable; index=False keeps the row index out of the file.
df.to_csv("submission.csv", index = False)

