In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the Titanic training and test sets from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)

In [None]:
#Header
train_data.head(5)

In [None]:
test_data.head()

In [None]:
#Training Data - number of rows and columns
train_data.shape

In [None]:
#Test Data - number of rows and columns
test_data.shape

In [None]:
#Information about training data
print('Training Data')
print('-------------')
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
train_data.info()

In [None]:
#Information about test data
print('Test Data')
print('----------')
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
test_data.info()

# Data Visualization

In [None]:
sns.countplot(x='Survived',data=train_data,palette='RdBu_r')

In [None]:
sns.countplot(x='Survived',hue='Sex',data=train_data)

In [None]:
sns.countplot(x='Survived',hue='Pclass',data=train_data,palette='rainbow')

In [None]:
# Histogram of the known passenger ages (rows with a missing Age are dropped first).
# sns.distplot is deprecated; histplot is its replacement for a plain
# histogram and draws no KDE curve unless asked to.
sns.histplot(x=train_data['Age'].dropna(), color='red', bins=30)

# Data Cleaning

In [None]:
#Checking for the missing data
sns.heatmap(train_data.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
#Checking for the missing data -Test set
sns.heatmap(test_data.isnull(),yticklabels=False,cbar=False,cmap='viridis')

In [None]:
#Comparing the age distribution of the passengers across Pclass
#(a box plot shows the median and quartiles, not the mean)

plt.figure(figsize=(10, 7))
sns.boxplot(x='Pclass',y='Age',data=train_data)

In [None]:
# Mean age per passenger class in the training set, truncated to a whole number.
for pclass in (1, 2, 3):
    class_ages = train_data.loc[train_data['Pclass'] == pclass, 'Age']
    print(f'Average Age of passengers in class {pclass}')
    print(int(class_ages.mean()))

In [None]:
#Function to fill a missing age value based on the Pclass

def fill_missing_age(arr):
    """Return the passenger's age, substituting a per-class default when it is missing.

    Parameters
    ----------
    arr : pandas.Series or sequence
        Two values in the order (Age, Pclass). This is the row shape produced
        by ``df[['Age', 'Pclass']].apply(fill_missing_age, axis=1)``.

    Returns
    -------
    The original age when it is present; otherwise 38 for Pclass 1,
    29 for Pclass 2 and 25 for any other class.
    """
    # Read both fields by position explicitly. Plain ``arr[0]`` on a Series
    # whose index holds the labels 'Age'/'Pclass' relies on the deprecated
    # integer-key positional fallback of Series.__getitem__.
    if isinstance(arr, pd.Series):
        age, pclass = arr.iloc[0], arr.iloc[1]
    else:
        age, pclass = arr[0], arr[1]

    if not pd.isnull(age):
        return age

    # Hard-coded whole-number defaults per class.
    # NOTE(review): presumably the truncated per-class mean ages of the
    # training set -- confirm they still match the data.
    if pclass == 1:
        return 38
    if pclass == 2:
        return 29
    return 25
 

In [None]:
#Filling the missing Age values in the training set, row by row, with the per-class default from fill_missing_age
train_data['Age'] = train_data[['Age','Pclass']].apply(fill_missing_age, axis=1)

In [None]:
#Test Set
# Mean age per passenger class in the test set, truncated to a whole number.
for pclass in (1, 2, 3):
    class_ages = test_data.loc[test_data['Pclass'] == pclass, 'Age']
    print(f'Average Age of passengers in class {pclass}')
    print(int(class_ages.mean()))

In [None]:
#Filling the missing Age values in the test set with the same fill_missing_age function used for the training set
test_data['Age'] = test_data[['Age','Pclass']].apply(fill_missing_age, axis=1)

In [None]:
#Visualizing the data, after filling the missing age values with mean
sns.heatmap(train_data.isnull(),yticklabels=False,cbar=False,cmap = 'viridis')

In [None]:
#Visualizing the data, after filling the missing age values with mean - Test Data
sns.heatmap(test_data.isnull(),yticklabels=False,cbar=False,cmap = 'viridis')

In [None]:
#Filling the missing Fare values in the test set with the mean Fare of the test set
test_data["Fare"] = test_data["Fare"].fillna(test_data["Fare"].mean())

In [None]:
#Visualizing the data, after filling the missing Fare values with the mean - Test Data
sns.heatmap(test_data.isnull(),yticklabels=False,cbar=False,cmap = 'viridis')

In [None]:
# Remove the Cabin column from the training set.
train_data = train_data.drop(columns=['Cabin'])

In [None]:
#Re-checking the training data for missing values after dropping Cabin
#NOTE(review): Embarked is never imputed in this notebook - confirm the heatmap shows no remaining nulls
sns.heatmap(train_data.isnull(),yticklabels=False,cbar=False,cmap = 'viridis')

# Encoding Categorical Features

In [None]:
# Dummy-encode Sex and Embarked for the training set; drop_first=True omits
# the first level of each feature.
sex, embark = (
    pd.get_dummies(train_data[feature], drop_first=True)
    for feature in ('Sex', 'Embarked')
)

In [None]:
# Training set: remove Name and Ticket, plus the raw Sex and Embarked columns.
train_data = train_data.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket'])

In [None]:
#Appending the dummy-encoded Sex and Embarked columns to the training data (column-wise)
train_data = pd.concat([train_data,sex,embark], axis = 1 )

In [None]:
train_data.info()

In [None]:
train_data.head()

In [None]:
# Dummy-encode Sex and Embarked for the test set; drop_first=True omits
# the first level of each feature.
sex, embark = (
    pd.get_dummies(test_data[feature], drop_first=True)
    for feature in ('Sex', 'Embarked')
)

In [None]:
# Test set: remove Name and Ticket, plus the raw Sex and Embarked columns.
test_data = test_data.drop(columns=['Sex', 'Embarked', 'Name', 'Ticket'])

In [None]:
#Appending the dummy-encoded Sex and Embarked columns to the test data (column-wise)
test_data = pd.concat([test_data,sex,embark], axis = 1 )


In [None]:
# Remove the Cabin column from the test set, as was done for the training set.
test_data = test_data.drop(columns=['Cabin'])

In [None]:
test_data.head()

In [None]:
print(train_data.shape)
print(test_data.shape)

# Logistic Regression Model

In [None]:
# Split the training frame into the target vector and the feature matrix.
# NOTE(review): PassengerId is never dropped in this notebook, so it appears
# to be used as a model feature -- confirm that this is intended.
y_train = train_data['Survived']
X_train = train_data.drop(columns=['Survived'])

In [None]:
print(X_train.head(5))
print(y_train.head(5))

In [None]:
test_data.info()

In [None]:
#X_test is another name for the same test_data frame (no copy is made)
X_test = test_data

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
# The features are fed in unscaled, so the default max_iter=100 can stop the
# solver before it converges (ConvergenceWarning); allow more iterations.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train,y_train)

In [None]:
predictions = lr.predict(X_test)

In [None]:
predictions.shape

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Accuracy on the same data the model was fitted on -- an optimistic figure,
# not a held-out validation score.
accuracy = lr.score(X_train, y_train)
print('Model Score')
# Built-in round() works for both plain Python floats and NumPy scalars,
# whereas the .round() method exists only on NumPy scalars.
print(round(accuracy*100, 2))

# Model Submission - Logistic Regression

In [None]:
#Creating CSV file to submit
# The ID column keeps the exact name of the source column, 'PassengerId'
# (lower-case 'd'), so the CSV header matches the test data's identifier column.
dictionary = {
    'PassengerId' : test_data['PassengerId'],
    'Survived' : predictions
}
submission_lr = pd.DataFrame(dictionary)

In [None]:
submission_lr.to_csv('LR_predictions_Titanic.csv',index=False)

In [None]:
submission_lr.shape

In [None]:
submission_lr.head()

In [None]:
# Count of predicted survivors vs. non-survivors. The column is passed via the
# x= keyword (as in the earlier countplot cells) rather than positionally.
sns.countplot(x='Survived', data=submission_lr)

# Decision Tree 

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fix the random seed so that re-running the notebook yields the same tree
# and therefore the same predictions.
dtree = DecisionTreeClassifier(random_state=42)

In [None]:
dtree.fit(X_train,y_train)

In [None]:
predictions_dtree = dtree.predict(X_test)

In [None]:
# Accuracy (in percent) of the decision tree on the data it was fitted on.
tree_train_accuracy = dtree.score(X_train, y_train) * 100
print('Model Score - Decision Tree')
print(tree_train_accuracy)

# Model Submission - Decision Tree

In [None]:
#Creating CSV file to submit
# The ID column keeps the exact name of the source column, 'PassengerId'
# (lower-case 'd'), so the CSV header matches the test data's identifier column.
dictionary = {
    'PassengerId' : test_data['PassengerId'],
    'Survived' : predictions_dtree
}
submission = pd.DataFrame(dictionary)

In [None]:
submission.head()

In [None]:
submission.to_csv('Decision_tree_predictions_Titanic.csv',index=False)

In [None]:
submission.shape

In [None]:
# Count of predicted survivors vs. non-survivors. The column is passed via the
# x= keyword (as in the earlier countplot cells) rather than positionally.
sns.countplot(x='Survived', data=submission)