In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Import all required packages**

In [None]:
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score

# Use seaborn's white background with grid lines for every plot below.
sns.set_style(style='whitegrid')

# **Load Data**

In [None]:
# Read the Titanic training set and display it as the cell output.
train_csv_path = '/kaggle/input/titanic/train.csv'
data = pd.read_csv(train_csv_path)
data

In [None]:
data.shape  # (number of rows, number of columns) of the training frame as loaded

# **Data Analysis**

Analyze the data using plots to show the relationships between different variables.

In [None]:
# Count of passengers who did not survive (0) versus survived (1).
sns.countplot(data=data, x='Survived')

In [None]:
# Survival counts (0 = no, 1 = yes) split by passenger sex.
sns.countplot(data=data, x='Survived', hue='Sex')


In [None]:
# Survival counts (0 = no, 1 = yes) split by passenger class.
sns.countplot(data=data, x='Survived', hue='Pclass')

In [None]:
# Age distribution of the passengers, grouped into 10 bins.
data.Age.hist(bins=10)

In [None]:
# Number of passengers for each value of 'SibSp'.
sns.countplot(data=data, x='SibSp')

In [None]:
data.info()  # print column names, non-null counts and dtypes of the training frame

# **Data Wrangling**
Clean the data by handling NaN values (imputing or dropping them) and removing unnecessary columns.

In [None]:
# Boolean mask of the frame: True where a value is missing, False otherwise.
data.isna()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Heatmap of the missing-value mask, to see which columns contain the nulls.
sns.heatmap(data.isna())

In [None]:
# Replace missing ages with the mean age, then remove the 'Cabin' column.
mean_age = data['Age'].mean()
data['Age'] = data['Age'].fillna(mean_age)
data = data.drop(columns=['Cabin'])

In [None]:
data.shape  # (rows, columns) after the 'Cabin' column was dropped in the previous cell

In [None]:
# Discard every row that still contains at least one missing value.
data = data.dropna(axis=0, how='any')

In [None]:
# Re-count missing values per column after the imputation and row drop.
data.isna().sum()

In [None]:
# Heatmap of the missing-value mask after cleaning.
sns.heatmap(data.isna())

In [None]:
# Remove the columns that are not used as model features.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [None]:
# Render floats with two decimals in every DataFrame displayed from here on.
pd.options.display.float_format = '{:,.2f}'.format
# 'Sex' and 'Embarked' are still text columns at this point. pandas >= 2.0 raises
# on non-numeric columns in corr(), so compute the pairwise correlation on the
# numeric columns only (the same columns older pandas silently kept).
data.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the pairwise correlations. The frame still holds the text
# columns 'Sex' and 'Embarked', which corr() rejects on pandas >= 2.0, so only the
# numeric columns are passed in.
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, fmt='.2f')

In [None]:
# One-hot encode 'Sex', 'Embarked' and 'Pclass'; the first level of each is
# dropped so the remaining indicator columns are not redundant.
sex, embarked, pclass = (
    pd.get_dummies(data[column], drop_first=True)
    for column in ('Sex', 'Embarked', 'Pclass')
)

In [None]:
# Append the indicator columns, then remove the categorical columns they replace.
data = pd.concat([data, sex, pclass, embarked], axis=1)
data = data.drop(columns=['Pclass', 'Sex', 'Embarked'])
# The 'Pclass' indicators are labelled with the ints 2 and 3 while every other
# column label is a string. scikit-learn >= 1.2 refuses frames whose column
# labels mix types, so cast all labels to str.
data.columns = data.columns.astype(str)

# **Model Training**

In [None]:
# Separate the target ('Survived') from the feature columns.
y = data['Survived']
x = data.drop(columns=['Survived'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Standardise the features. The scaler's statistics are learned from the training
# split only; the held-out split is transformed with those same statistics, so the
# evaluation neither leaks test information nor puts the two splits on different scales.
sc = StandardScaler()

x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Build and train the logistic-regression classifier (fit returns the estimator itself),
# then predict labels for the held-out split.
model = LogisticRegression(max_iter=150, n_jobs=1).fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# Evaluate on the held-out split: confusion matrix plus accuracy as a percentage.

holdout_confusion = confusion_matrix(y_test, y_pred)
holdout_accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print(holdout_confusion)
print("Accuracy:", holdout_accuracy_pct, '%')

# **Test Data**

In [None]:
# Read the Titanic test set and display it as the cell output.
test_csv_path = '/kaggle/input/titanic/test.csv'
test_data = pd.read_csv(test_csv_path)
test_data

In [None]:
test_data.shape  # (number of rows, number of columns) of the test frame as loaded

In [None]:
# Number of missing values in each column of the test frame.
test_data.isna().sum()

In [None]:
# Handle missing values in the test frame: 'Age' and 'Fare' gaps are filled with
# the column mean, and 'Cabin' is removed.

for numeric_col in ('Age', 'Fare'):
    test_data[numeric_col] = test_data[numeric_col].fillna(test_data[numeric_col].mean())
test_data.drop(columns=['Cabin'], inplace=True)

In [None]:
# Re-count missing values per column of the test frame after cleaning.
test_data.isna().sum()

In [None]:
# One-hot encode 'Sex', 'Embarked' and 'Pclass' of the test frame, dropping the
# first level of each.

tsex, tembarked, tpclass = (
    pd.get_dummies(test_data[column], drop_first=True)
    for column in ('Sex', 'Embarked', 'Pclass')
)

In [None]:
# Append the indicator columns, then remove the raw categorical and unused columns.
t_data = pd.concat([test_data, tsex, tpclass, tembarked], axis=1)
t_data = t_data.drop(['Pclass', 'PassengerId', 'Name', 'Sex', 'Ticket', 'Embarked'], axis=1)
# The 'Pclass' indicators are labelled with ints while the other labels are
# strings; scikit-learn >= 1.2 rejects mixed-type column labels, so cast to str.
t_data.columns = t_data.columns.astype(str)
# Reuse the scaler that is already fitted instead of re-fitting it on this data:
# the model must see features on the scale it was trained on.
# NOTE(review): assumes the columns are in the same order as the training features - confirm.
t_data = sc.transform(t_data)

In [None]:
result = model.predict(t_data)  # predicted 'Survived' label for each row of the scaled test features
result  # display the predictions as the cell output

In [None]:
result.shape  # shape of the prediction array

In [None]:
# Write the submission file: one row per test passenger with its predicted label.
# The 'PassengerId' column is used inline rather than bound to a global named
# `id`, which would shadow Python's builtin id() for the rest of the session.

df = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': result,
})
df.to_csv('TitanicSubmission.csv', index=False)  # index=False: no extra row-number column