In [None]:
# Import Dependencies
%matplotlib inline

# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
# The seaborn style sheets were renamed with a 'seaborn-v0_8-' prefix in
# matplotlib 3.6 and the old names were removed in 3.8. Use whichever name this
# installation provides, falling back to the default style if neither exists.
plt.style.use(next(
    (style_name
     for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if style_name in plt.style.available),
    'default',
))

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize

# Machine learning
import catboost
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier, Pool, cv

# Silence every warning so the notebook output stays uncluttered.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the plotting and ML libraries; consider removing it when upgrading packages.
import warnings
warnings.filterwarnings('ignore')


# Importing Data from input folder

In [None]:
# Read the three Kaggle Titanic CSV files in one pass:
#   train - labelled passengers, test - unlabelled passengers,
#   gsub  - sample submission showing the expected output format.
train, test, gsub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/titanic/train.csv",
        "../input/titanic/test.csv",
        "../input/titanic/gender_submission.csv",
    )
)

In [None]:
# Show the first five rows of the training data
train.head(n=5)

In [None]:
# Show the last five rows of the training data
train.tail(n=5)

In [None]:
# Histogram of passenger ages, labelled so the figure can be read on its own.
# The trailing semicolon suppresses the text repr of the returned artist.
train['Age'].plot.hist(title='Age distribution (training set)').set_xlabel('Age (years)');

In [None]:
# Peek at the first five rows of the test data
test.head(n=5)

In [None]:
# Peek at the first five rows of the example submission dataframe
gsub.head(n=5)

# Data Descriptions

**Survival**: 0 = No, 1 = Yes

**pclass (Ticket class)**: 1 = 1st, 2 = 2nd, 3 = 3rd

**sex**: Sex

**Age**: Age in years

**sibsp**: number of siblings/spouses aboard the Titanic

**parch**: number of parents/children aboard the Titanic

**ticket**: Ticket number

**fare**: Passenger fare

**cabin**: Cabin number

**embarked**: Port of Embarkation, **C** = Cherbourg, **Q** = Queenstown, **S** = Southampton

In [None]:
# Summary statistics for the training data (pandas' default describe() output)
train.describe()

# Are there values missing?
Where are the holes in our data?
These are rows which are missing a value or have NaN (Not a Number) where other rows have a real value.

In [None]:
# Visualise where values are missing across the training data
missing_matrix_size = (30, 20)
missingno.matrix(train, figsize=missing_matrix_size)

Clearly there are missing values, especially in the Cabin column.
It is vital to visualize missing values early so that I know where the major holes are in my dataset.
Such information will assist me with my EDA and help me figure out what kind of data cleaning and preprocessing is needed.

In [None]:
# For exact numbers, count the missing values in each column
train.isna().sum()

# To analyse the data, I am going to create two new dataframes

**What is a dataframe?** A dataframe is a table or two-dimensional array-like structure in which each column contains values of one variable and each row contains one set of values from each column.

**Characteristics of a dataframe**
1. The column names should be non-empty
2. The row names should be unique.
3. The data stored in a dataframe can be of numeric, factor or character type.
4. Each column should contain same number of data items.

**Creating Dataframes**

I shall create two dataframes:
1. For exploring discretised continuous variables (continuous values grouped into a finite number of bins, so they can be counted, like the change in your pocket)
2. For exploring continuous variables (values that would take forever to count, like time)

In [None]:
# Two empty frames to be filled feature by feature:
#   df_dis - discretised continuous variables
#   df_con - continuous variables
df_dis, df_con = pd.DataFrame(), pd.DataFrame()

# What datatypes are in the dataframe?

As a general rule of thumb, features with a datatype of object could be considered categorical features. And those which are float or int (numbers) could be considered numerical features.

However, we can find features which are numerical but should be treated as categorical.

The aim for the next step is to figure out how best to process the data so our machine learning model can learn from it.

Ideally all features will be encoded into a numerical value of some kind.

In [None]:
# Let's see the different datatypes that we have in this dataset
train.dtypes

# Exploring each feature individually
We will go through each column one by one and see which ones to use in our first models. Some may need more processing than others to get ready.

In [None]:
# Re-display the first five rows as a reference while going through the columns
train.head(n=5)

# Target feature: Survived

Description: Whether the passenger survived or not

Key: 0 = did not survive, 1 = survived

This is the variable we want our machine learning model to predict based off all the others.

In [None]:
# How many people survived? Plot the class balance and print the exact counts.
fid = plt.figure(figsize=(20, 1))
sns.countplot(data=train, y='Survived');
print(train['Survived'].value_counts())

In [None]:
# Add the target column above to both subset dataframes
df_dis['Survived'] = df_con['Survived'] = train['Survived']

In [None]:
# Check the first rows of the discretised subset dataframe
df_dis.head()

In [None]:
# Check the first rows of the continuous subset dataframe
df_con.head()

# Feature: Pclass

Description: The ticket class of the passenger.

key: 1 = 1st class, 2 = 2nd class, 3 = 3rd class

**Plotting the distribution for the feature Pclass**

Let's look at the distribution of each feature first so that we can understand what kind of spread there is across the dataset.

For instance, if there are values which are completely outside of the distribution, we may not want to include them in our model.

In [None]:
# Distribution plot for Pclass.
# Pclass only takes the values 1, 2 and 3 and is really a category, so show the
# number of passengers per class with a count plot rather than the deprecated
# sns.distplot, which would fit a continuous density curve to three class codes.
sns.countplot(x='Pclass', data=train);

With this feature the values are numerical (1, 2 and 3) in nature, but they are categories, since a passenger in Class 3 doesn't equal a passenger in Class 2 + a passenger in Class 1.

In [None]:
# How many missing values does Pclass have?
train['Pclass'].isna().sum()

Since there are no missing values in Pclass, it's a good candidate to add to both of the dataframes (df_dis & df_con) that we created earlier.

In [None]:
# Copy Pclass unchanged into both subset dataframes (df_dis and df_con)
df_dis['Pclass'] = df_con['Pclass'] = train['Pclass']

# Feature: Name
Description : The name of the Passenger

In [None]:
# How many different names are there? Count the occurrences of each name.
train['Name'].value_counts()

Since there are so many different names and to keep this EDA fast, we won't move forward using the name variable, so we skip it.

# Feature: Sex

Description: The sex of passenger (male or female)


In [None]:
# Distribution of Sex: bar chart plus the exact counts per value
plt.figure(figsize=(20, 5))
sns.countplot(data=train, y="Sex");
print(train['Sex'].value_counts())

In [None]:
# Are there any null values in the Sex column?
train['Sex'].isna().sum()

In [None]:
# Look at the raw values stored in the Sex column
train['Sex'].head(n=5)

Since the Sex column already contains binary values (male or female), I shall add it directly to our subset dataframes.

In [None]:
# Add Sex to the subset dataframes.
# df_dis gets a numeric encoding: 1 where the value is 'female', 0 otherwise.
df_dis['Sex'] = np.where(train['Sex'] == 'female', 1, 0)
# df_con keeps the original string values.
df_con['Sex'] = train['Sex']

In [None]:
# How does the Sex variable look compared to survival?
# Both columns in df_dis are binary codes (Sex: 0 = male, 1 = female;
# Survived: 0 = did not survive, 1 = survived), so grouped bar counts show the
# relationship directly. This replaces the deprecated sns.distplot overlays.
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(x='Sex', hue='Survived', data=df_dis, ax=ax)
ax.set(title='Survival by sex',
       xlabel='Sex (0 = male, 1 = female)',
       ylabel='Number of passengers');
# Print the survival rate for each sex code instead of dumping the whole
# Survived column, which says nothing about Sex.
print(df_dis.groupby('Sex')['Survived'].mean())

# Not many people survived, but most of those who survived were female