In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Modules

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting and
# pandas calls further down — confirm that is intended.
warnings.filterwarnings('ignore')
# Notebook magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## Loading the dataset

In [4]:
# Read the train and test CSV files into two DataFrames, then preview train.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv')
)
train.head()

In [5]:
## Summary statistics of the train DataFrame's columns
train.describe()

In [6]:
## Column dtypes and non-null counts of the train DataFrame
train.info()

## Exploratory Data Analysis

In [7]:
## categorical attributes
# Number of rows per Survived value. The column is named through `x=` because
# seaborn >= 0.12 treats the first positional argument as `data`, not `x`.
sns.countplot(data=train, x='Survived')

In [8]:
# Number of rows per Pclass value; the column is passed by keyword because
# seaborn >= 0.12 treats the first positional argument as `data`, not `x`.
sns.countplot(data=train, x='Pclass')

In [9]:
# Number of rows per Sex value; the column is passed by keyword because
# seaborn >= 0.12 treats the first positional argument as `data`, not `x`.
sns.countplot(data=train, x='Sex')

In [10]:
# Number of rows per SibSp value; the column is passed by keyword because
# seaborn >= 0.12 treats the first positional argument as `data`, not `x`.
sns.countplot(data=train, x='SibSp')

In [11]:
# Number of rows per Parch value; the column is passed by keyword because
# seaborn >= 0.12 treats the first positional argument as `data`, not `x`.
sns.countplot(data=train, x='Parch')

In [12]:
# Number of rows per Embarked value; the column is passed by keyword because
# seaborn >= 0.12 treats the first positional argument as `data`, not `x`.
sns.countplot(data=train, x='Embarked')

In [13]:
## numerical attributes
# Distribution of Age in train. `sns.distplot` is deprecated in seaborn;
# `histplot` with a KDE overlay on a density scale is the closest replacement.
sns.histplot(data=train, x='Age', kde=True, stat='density')

In [14]:
# Distribution of Fare in train. `sns.distplot` is deprecated in seaborn;
# `histplot` with a KDE overlay on a density scale is the closest replacement.
sns.histplot(data=train, x='Fare', kde=True, stat='density')

In [17]:
# Average Fare per Pclass (pivot_table uses the mean when no aggfunc is given),
# drawn as a bar chart with horizontal tick labels.
class_fare = train.pivot_table(index='Pclass', values='Fare')
ax = class_fare.plot(kind='bar', rot=0)
ax.set_xlabel('Pclass')
ax.set_ylabel('Avg. Fare')
plt.show()

In [18]:
# Total Fare per Pclass, drawn as a bar chart.
# The aggregation is named by string: passing the `np.sum` callable to pandas
# aggregations is deprecated in pandas 2.x, and 'sum' keeps the same result.
class_fare = train.pivot_table(index='Pclass', values='Fare', aggfunc='sum')
class_fare.plot(kind='bar')
plt.xlabel('Pclass')
plt.ylabel('Total Fare')
plt.xticks(rotation=0)  # keep the class labels horizontal
plt.show()

In [52]:
# Bar plot of Fare by Pclass, with the bars of each class split by Survived.
sns.barplot(data=train, x='Pclass', y='Fare', hue='Survived')

In [53]:
# Same data with the roles swapped: Fare by Survived, bars split by Pclass.
sns.barplot(data=train, x='Survived', y='Fare', hue='Pclass')

## Data Preprocessing

In [39]:
# Remember how many rows came from train so the combined frame can be split
# back into train and test rows by position later on.
train_len = len(train)
# Stack train on top of test with a fresh 0..n-1 index.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.head()

In [40]:
# Last rows of the combined frame (test rows are appended after the train rows).
df.tail()

In [41]:
## Number of missing values in each column of the combined frame
df.isna().sum()

In [42]:
# Remove the Cabin column from the combined frame.
df = df.drop(columns='Cabin')

In [43]:
# Mean of the Age column over the combined (train + test) frame.
df['Age'].mean()

In [44]:
# Replace missing Age and Fare values with the mean of the respective column.
# NOTE(review): the means are taken over the combined train + test frame, so
# test rows influence the values imputed into train — confirm this is acceptable.
for numeric_col in ('Age', 'Fare'):
    column_mean = df[numeric_col].mean()
    df[numeric_col] = df[numeric_col].fillna(column_mean)

In [45]:
# Most frequent Embarked value in the combined frame (first entry of the mode).
df['Embarked'].mode()[0]

In [46]:
# Replace missing Embarked values with the column's most frequent value.
most_common_port = df['Embarked'].mode().iloc[0]
df['Embarked'] = df['Embarked'].fillna(most_common_port)

## Log transformation for uniform data distribution

In [47]:
# Distribution of Fare in the combined frame. `sns.distplot` is deprecated in
# seaborn; `histplot` with a KDE overlay on a density scale is the closest replacement.
sns.histplot(data=df, x='Fare', kde=True, stat='density')

In [48]:
# Replace Fare with log(1 + Fare). np.log1p computes this directly and stays
# accurate for fares close to zero.
# NOTE: this cell overwrites Fare in place, so running it twice applies the
# transform twice.
df['Fare'] = np.log1p(df['Fare'])

In [49]:
# Distribution of the current Fare column of the combined frame.
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay on a
# density scale is the closest replacement.
sns.histplot(data=df, x='Fare', kde=True, stat='density')

## Correlation Matrix

In [50]:
# Pairwise correlations between the numeric columns, shown as an annotated heatmap.
# df still holds text columns here (Name, Sex, Ticket, Embarked); pandas >= 2.0
# raises on them unless numeric_only=True is requested explicitly.
corr = df.corr(numeric_only=True)
plt.figure(figsize=(15, 9))
sns.heatmap(corr, annot=True, cmap='coolwarm')

In [54]:
# Preview the combined frame in its current state.
df.head()

In [55]:
## Remove the Name and Ticket columns, then preview the result
unused_columns = ['Name', 'Ticket']
df = df.drop(columns=unused_columns)
df.head()

## Label Encoding

In [57]:
from sklearn.preprocessing import LabelEncoder
# Columns whose values are replaced by integer codes.
cols = ['Sex', 'Embarked']
le = LabelEncoder()

# The single encoder is re-fitted for every column, so after the loop it only
# remembers the classes of the last column in `cols`.
for column_name in cols:
    encoded_values = le.fit_transform(df[column_name])
    df[column_name] = encoded_values
df.head()

## Train-Test Split

In [58]:
# Split the combined frame back by position: the first train_len rows are the
# original train rows, everything after them is test.
train, test = df.iloc[:train_len], df.iloc[train_len:]

In [59]:
# Preview the preprocessed train rows.
train.head()

In [60]:
# Preview the preprocessed test rows.
test.head()

In [61]:
# Target: the Survived column. Features: every remaining column except the
# PassengerId identifier.
y = train['Survived']
X = train.drop(columns=['PassengerId', 'Survived'])

In [62]:
# Preview the feature matrix used for training.
X.head()

## Model Training

In [79]:
from sklearn.model_selection import train_test_split, cross_val_score
# helper: fit a model and print its hold-out accuracy and 5-fold CV score
def classify(model):
    """Fit ``model`` and print two accuracy estimates for it.

    The module-level ``X`` / ``y`` are split 75/25 (fixed ``random_state=42``);
    the model is fitted on the larger part and its score on the held-out part
    is printed as ``Accuracy``. Then a 5-fold cross-validation over all of
    ``X`` / ``y`` is run and the mean fold score is printed as ``CV Score``.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``score``
        Fitted in place on the 75% split.

    Returns
    -------
    None
    """
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        X, y, test_size=0.25, random_state=42
    )
    model.fit(features_fit, target_fit)
    holdout_accuracy = model.score(features_eval, target_eval)
    print('Accuracy:', holdout_accuracy)

    fold_scores = cross_val_score(model, X, y, cv=5)
    print('CV Score:', np.mean(fold_scores))

In [80]:
from sklearn.linear_model import LogisticRegression
# Evaluate a LogisticRegression with default settings via classify().
model = LogisticRegression()
classify(model)

In [81]:
from sklearn.tree import DecisionTreeClassifier
# Evaluate a DecisionTreeClassifier via classify().
# random_state is fixed (same seed as the split inside classify) so the printed
# scores are reproducible from run to run.
model = DecisionTreeClassifier(random_state=42)
classify(model)

In [82]:
from sklearn.ensemble import RandomForestClassifier
# Evaluate a RandomForestClassifier via classify().
# random_state is fixed (same seed as the split inside classify) so the printed
# scores are reproducible from run to run.
model = RandomForestClassifier(random_state=42)
classify(model)

In [83]:
from sklearn.ensemble import ExtraTreesClassifier
# Evaluate an ExtraTreesClassifier via classify().
# random_state is fixed (same seed as the split inside classify) so the printed
# scores are reproducible from run to run.
model = ExtraTreesClassifier(random_state=42)
classify(model)

In [84]:
from xgboost import XGBClassifier
# Evaluate an XGBClassifier with default settings via classify().
model = XGBClassifier()
classify(model)

In [85]:
from lightgbm import LGBMClassifier
# Evaluate an LGBMClassifier with default settings via classify().
model = LGBMClassifier()
classify(model)

In [87]:
from catboost import CatBoostClassifier
# Evaluate a CatBoostClassifier via classify(); verbose=0 is passed to the constructor.
model = CatBoostClassifier(verbose=0)
classify(model)

## Complete Model Training with Full Data

In [88]:
# Fit a fresh LGBMClassifier on all of X / y (no hold-out); this is the model
# used for the test-set predictions below.
# Relies on LGBMClassifier having been imported in an earlier cell.
model = LGBMClassifier()
model.fit(X, y)

In [89]:
# Preview the test rows before building the prediction input.
test.head()

In [93]:
# Build the prediction input from test by removing the same two columns that
# were removed from train when X was built.
non_feature_columns = ['PassengerId', 'Survived']
X_test = test.drop(columns=non_feature_columns)

In [94]:
# Preview the prediction input.
X_test.head()

In [95]:
# Predict a Survived value for every row of X_test with the fitted model and display the result.
pred = model.predict(X_test)
pred

## Test Submission

In [96]:
# Load gender_submission.csv; its Survived column is overwritten with the
# model's predictions below before the frame is written out as the submission.
sub = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
sub.head()

In [100]:
# Column dtypes and non-null counts of the submission frame before it is filled.
sub.info()

In [101]:
# Overwrite Survived with the predictions, stored as integers.
# The values are assigned by position, so row i of sub receives pred[i].
sub['Survived'] = np.asarray(pred).astype('int')

In [102]:
# Re-check dtypes and non-null counts after Survived was overwritten.
sub.info()

In [103]:
# Preview the filled submission frame.
sub.head()

In [104]:
# Write the submission to submission.csv in the current directory, without the index column.
sub.to_csv('submission.csv', index=False)