In [12]:
import random

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.patches import Circle
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

In [13]:
# Load the training CSV, then derive a working frame without the PassengerId
# column. The unmodified `train` frame is kept for later inspection.
train = pd.read_csv("train.csv")
train_df = train.drop(columns="PassengerId")

In [14]:
# Rich-display the working training frame; in the recorded output pandas shows
# only the first and last rows and elides the middle of the table.
display(train_df)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Per-column count of missing values in the raw training frame
# (the one that still contains PassengerId).
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [15]:
# Load the test CSV, then derive a working frame without the PassengerId
# column. The unmodified `test` frame is kept alongside it.
test = pd.read_csv("test.csv")
test_df = test.drop(columns="PassengerId")

In [16]:
# Rich-display the working test frame; in the recorded output pandas shows
# only the first and last rows and elides the middle of the table.
display(test_df)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...
413,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [18]:
# Shared Styler configuration, reused by the other styled tables in this
# notebook. Each dict is one CSS rule for Styler.set_table_styles: a selector
# plus a string of CSS declarations.
cell_hover = {  # for row hover use <tr> instead of <td>
    'selector': 'td:hover',
    'props': 'background-color: #000066; color: white'
}
index_names = {
    'selector': '.index_name',
    'props': 'font-style: italic; color: darkgrey; font-weight:normal;'
}
headers = {
    'selector': 'th:not(.index_name)',
    'props': 'background-color: #000066; color: white;'
}

table_styles = [cell_hover, index_names, headers]
# CSS properties applied to every data cell through Styler.set_properties.
set_properties = {"background-color": "#023e8a","color":"white","border": "1.3px solid white"}

# Missing cells are highlighted through `props=` rather than `null_color=`:
# the `null_color` keyword was deprecated in pandas 1.5 and removed in 2.0,
# while `props` is accepted by both the old and the new signature and yields
# the same background-color rule.
# NOTE(review): the sample is unseeded, so a different set of rows is shown on
# every run; pass random_state=... if a reproducible preview is wanted.
styles = (
    train_df.sample(10)
    .style
    .set_table_styles(table_styles)
    .set_properties(**set_properties)
    .highlight_null(props="background-color: #c600de;")
    .set_caption("10 sample records from the dataset.")
)
styles

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
163,0,3,"Calic, Mr. Jovo",male,17.0,0,0,315093,8.6625,,S
507,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S
19,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C
519,0,3,"Pavlovic, Mr. Stefo",male,32.0,0,0,349242,7.8958,,S
459,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q
790,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q
245,0,1,"Minahan, Dr. William Edward",male,44.0,2,0,19928,90.0,C78,Q
877,0,3,"Petroff, Mr. Nedelio",male,19.0,0,0,349212,7.8958,,S
323,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22.0,1,1,248738,29.0,,S
450,0,2,"West, Mr. Edwy Arthur",male,36.0,1,2,C.A. 34651,27.75,,S


In [19]:
# Print a heading wrapped in ANSI escape codes (\033[1m ... \033[0m: bold on / reset).
print("\033[1mSome basic information about the dataset.\033[0m\n")

# info() prints the index range, per-column non-null counts and dtypes, and memory usage.
train_df.info()

[1mSome basic information about the dataset.[0m

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [21]:
def missing_conditional_coloring(val):
    """
    Build one CSS style string per entry of a Series of missing-value percentages.

    Intended to be passed to ``Styler.apply``, which hands over one
    column (or row) at a time as a Series.

    Parameters
    ----------
    val : pandas.Series
        Numeric values (percentages of missing data).

    Returns
    -------
    list of str
        One CSS declaration string per element, in order:

        * value > 10        -> red background (``#e14258``)
        * 0 < value <= 10   -> magenta background (``#c600de``)
        * anything else     -> default blue background (``#023e8a``)

        NaN compares False against both thresholds and therefore receives
        the default style.
    """
    high_style = 'background-color:#e14258;color:white'
    low_style = 'background-color:#c600de;color:white'
    default_style = 'background-color:#023e8a;color:white'

    def _style_for(pct):
        if pct > 10:
            return high_style
        # Upper bound is inclusive so that exactly 10 lands in the "0 to 10"
        # bucket instead of silently falling through to the default colour.
        if 0 < pct <= 10:
            return low_style
        return default_style

    return [_style_for(pct) for pct in val.tolist()]


# Percentage of missing values per column, computed separately for the train
# and test working frames.
total_missing_df = pd.DataFrame(train_df.isnull().sum() / train_df.shape[0] * 100, columns=["% (df)"])
total_missing_test_df = pd.DataFrame(test_df.isnull().sum() / test_df.shape[0] * 100, columns=["% (test df)"])
# Side-by-side concat aligns on column name; a column present in only one of
# the frames (Survived is absent from the test output) ends up as NaN.
missing = pd.concat([total_missing_df, total_missing_test_df], axis=1)
# `props=` replaces the `null_color=` keyword, which was deprecated in pandas
# 1.5 and removed in 2.0; `props` works with both signatures and produces the
# same background-color rule.
# NOTE(review): the later .apply() also emits a background for NaN cells, so
# confirm the null highlight is the one that ends up visible.
(
    missing.T.style
    .set_table_styles(table_styles)
    .set_properties(**set_properties)
    .highlight_null(props="background-color: #c600de;")
    .apply(missing_conditional_coloring)
    .set_caption("Missing values to every column in terms of %")
)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
% (df),0.0,0.0,0.0,0.0,19.86532,0.0,0.0,0.0,0.0,77.104377,0.224467
% (test df),,0.0,0.0,0.0,20.574163,0.0,0.0,0.0,0.239234,78.229665,0.0


In [22]:
# Styled summary statistics of the numeric columns of the training frame.
# `props=` replaces the `null_color=` keyword, which was deprecated in pandas
# 1.5 and removed in 2.0; `props` works with both signatures and produces the
# same background-color rule.
(
    train_df.describe()
    .style
    .set_table_styles(table_styles)
    .set_properties(**set_properties)
    .highlight_null(props="background-color: #c600de;")
    .set_caption("Some basic statistical values of the dataset.")
)


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292
