In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Locations of the two Titanic CSV files inside the Kaggle input directory.
TRAIN_CSV = "/kaggle/input/titanic/train.csv"
TEST_CSV = "/kaggle/input/titanic/test.csv"

# `train` is the labelled training set, `test` is the set to predict for.
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

## Exploratory Data Analysis

In [None]:
# Summary statistics of the training frame as a first look at the data.
train.describe()

In [None]:
# First rows of the training set, transposed so each column reads as a row.
train.head().transpose()

In [None]:
# First rows of the test set, transposed so each column reads as a row.
test.head().transpose()

### NULL Check

In [None]:
# Number of missing values in each column of the raw training set.
train.isna().sum()

In [None]:
# (rows, columns) of the training frame. The row count is the denominator for the
# per-column null counts, whereas `.size` would report rows * columns.
train.shape

In [None]:
# Mean age of the male passengers in the training set (missing ages are skipped by mean()).
male_age = train.loc[train['Sex'] == 'male', 'Age'].mean()

In [None]:
# Mean age of the female passengers in the training set (missing ages are skipped by mean()).
female_age = train.loc[train['Sex'] == 'female', 'Age'].mean()

In [None]:
# Fill missing ages of male training passengers with the male training mean.
train_is_male = train['Sex'] == 'male'
train.loc[train_is_male, 'Age'] = train.loc[train_is_male, 'Age'].fillna(male_age)

In [None]:
# Fill missing ages of female training passengers with the female training mean.
train_is_female = train['Sex'] == 'female'
train.loc[train_is_female, 'Age'] = train.loc[train_is_female, 'Age'].fillna(female_age)

In [None]:
# Fill missing ages of female test passengers, reusing the mean computed on the training set.
test_is_female = test['Sex'] == 'female'
test.loc[test_is_female, 'Age'] = test.loc[test_is_female, 'Age'].fillna(female_age)

In [None]:
# Fill missing ages of male test passengers, reusing the mean computed on the training set.
test_is_male = test['Sex'] == 'male'
test.loc[test_is_male, 'Age'] = test.loc[test_is_male, 'Age'].fillna(male_age)

### Drop Cabin Column

In [None]:
# Remove the Cabin column from the training set (in place).
train.drop(columns=['Cabin'], inplace=True)

In [None]:
# Remove the Cabin column from the test set (in place).
test.drop(columns=['Cabin'], inplace=True)

### Create family group

In [None]:
# Family size on board: siblings/spouses plus parents/children plus the passenger.
train["family_group"] = train["SibSp"].add(train["Parch"]) + 1

In [None]:
# Family size on board: siblings/spouses plus parents/children plus the passenger.
test["family_group"] = test["SibSp"].add(test["Parch"]) + 1

### Drop PassengerId, Ticket, Name, SibSp, Parch

In [None]:
# Columns not used as model inputs; SibSp and Parch are already folded into family_group.
dropped_columns = ['PassengerId', 'Ticket', 'Name', 'SibSp', 'Parch']
train.drop(columns=dropped_columns, inplace=True)
test.drop(columns=dropped_columns, inplace=True)

In [None]:
# Remaining missing values per training column after the drops above.
train.isna().sum()

In [None]:
# Remaining missing values per test column after the drops above.
test.isna().sum()

In [None]:
# Distinct values that appear in the training Embarked column.
pd.unique(train['Embarked'])

In [None]:
# Distribution of embarkation ports in the training set.
# Name the column through `x=`/`data=` (as the Sex plots do) instead of passing a
# Series positionally, which recent seaborn releases no longer interpret as `x`.
sns.countplot(x='Embarked', data=train)

In [None]:
# Distribution of embarkation ports in the test set.
# Name the column through `x=`/`data=` (as the Sex plots do) instead of passing a
# Series positionally, which recent seaborn releases no longer interpret as `x`.
sns.countplot(x='Embarked', data=test)

In [None]:
# Replace missing embarkation ports in the training set with 'S'.
train.loc[train['Embarked'].isna(), 'Embarked'] = 'S'

In [None]:
# Re-check missing values in the training set after filling Embarked.
train.isna().sum()

In [None]:
# Fare against age for the test passengers.
sns.lineplot(data=test, x='Age', y='Fare')

In [None]:
# Test rows whose Fare is missing.
test.loc[test['Fare'].isna()]

In [None]:
# Mean fare of test passengers aged 50 (inclusive) to 60 (exclusive).
fare_50s = test.loc[(test['Age'] >= 50) & (test['Age'] < 60), 'Fare'].mean()

In [None]:
# Fill missing test fares with the mean fare of the 50-59 age band.
test.loc[test['Fare'].isna(), 'Fare'] = fare_50s

In [None]:
# Re-check missing values in the test set after filling Fare.
test.isna().sum()

### One-Hot Encoding

In [None]:
# One-hot encode the categorical Sex and Embarked columns of each split.
dummy_train = pd.get_dummies(train[['Sex', 'Embarked']])
dummy_test  = pd.get_dummies(test[['Sex', 'Embarked']])

# get_dummies only emits columns for categories present in the frame it is given, so
# force the test indicators onto the training layout: a category absent from test
# becomes an all-zero column, and one never seen in training is dropped. This keeps
# the later column drops and the feature selection valid for both frames.
dummy_test = dummy_test.reindex(columns=dummy_train.columns, fill_value=0)

# Swap the original categorical columns for their indicator columns.
train = pd.concat([train.drop(columns=["Sex", "Embarked"]), dummy_train], axis=1)
test  = pd.concat([test.drop(columns=["Sex", "Embarked"]), dummy_test], axis=1)

In [None]:
# Sex_female is the complement of Sex_male, so drop it from the training set.
train.drop(columns='Sex_female', inplace=True)

In [None]:
# Sex_female is the complement of Sex_male, so drop it from the test set.
test.drop(columns='Sex_female', inplace=True)

In [None]:
# Keep the male indicator under the plain name `Sex` (set for male, unset for female).
train = train.rename({'Sex_male': 'Sex'}, axis='columns')

In [None]:
# Display the training frame to check the encoded columns and the renamed `Sex` indicator.
train

In [None]:
# Keep the male indicator under the plain name `Sex` (set for male, unset for female).
test = test.rename({'Sex_male': 'Sex'}, axis='columns')

In [None]:
# Display the test frame to check the encoded columns and the renamed `Sex` indicator.
test

In [None]:
# Check for missing values in the test set after encoding.
test.isna().sum()

### Gender investigation

In [None]:
# Passenger counts for every (Sex, Survived) combination; `Sex` is the renamed
# Sex_male indicator. The count column is labelled as passengers because the
# Survived == 0 rows count people who did not survive.
train.groupby(['Sex', 'Survived']).size().reset_index(name='No. of Passengers')

In [None]:
## Training data: number of passengers per value of the `Sex` indicator
sns.countplot(data=train, x='Sex')

In [None]:
## Test data: number of passengers per value of the `Sex` indicator
sns.countplot(data=test, x='Sex')

In [None]:
# Columns fed to the model and the column it should predict.
features = ['Pclass', 'Age', 'Fare', 'family_group', 'Sex',
            'Embarked_C', 'Embarked_Q', 'Embarked_S']
label = ['Survived']

# Feature matrix and single-column target frame taken from the training data.
X_train = train.loc[:, features]
y_train = train.loc[:, label]

In [None]:
# Same feature columns, taken from the test data.
X_test = test.loc[:, features]

In [None]:
# Hold out 30% of the labelled rows for validation (fixed seed for reproducibility).
# Split straight from `train` rather than from X_train/y_train: this cell rebinds
# X_train/y_train, so splitting those would shrink the training set again on every
# re-run. On a fresh run the result is the same as before.
X_train, X_valid, y_train, y_valid = train_test_split(
    train[features], train[label], test_size=0.3, random_state=3
)

## Modeling

In [None]:
# Logistic regression with the liblinear solver, capped at 300 iterations.
logisticRegression = LogisticRegression(solver='liblinear', max_iter=300)
# y_train is a one-column frame; flatten it to the 1-D target array passed to fit().
logisticRegression.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# Predicted labels for the held-out validation rows.
predictions = logisticRegression.predict(X_valid)

In [None]:
# Validation accuracy as a percentage. Arguments follow scikit-learn's
# (y_true, y_pred) order, matching the other metric calls in this notebook.
accuracy = accuracy_score(y_valid, predictions)
print(accuracy * 100)

In [None]:
# Per-class metrics on the validation split, followed by the confusion matrix,
# which is displayed as the cell output.
print(classification_report(y_valid, predictions))
confusion_matrix(y_valid, predictions)

## Submit

In [None]:
# Predict survival for the test passengers with the fitted model.
# X_test already carries the numeric `Sex` indicator, so no further encoding is needed here.
testPrediction = logisticRegression.predict(X_test)

In [None]:
# Display the feature matrix that was passed to the model for the test predictions.
X_test

In [None]:
## PassengerId was dropped from `test` during preprocessing, so read it back from the raw CSV
passenger_id = pd.read_csv("/kaggle/input/titanic/test.csv", usecols=['PassengerId'])['PassengerId']

In [None]:
# Two-column submission frame: the passenger ids alongside the predicted labels.
submission = passenger_id.to_frame(name="PassengerId").assign(Survived=testPrediction)

In [None]:
# Write the submission file to the Kaggle working directory, without the index column.
submission.to_csv(path_or_buf='/kaggle/working/submission.csv', index=False)