In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from statsmodels.stats.proportion import proportions_ztest
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Notebook-wide pandas display settings: no column/row truncation,
# floats rendered with two decimals, and a 500-character wide console.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", None),
    ("display.float_format", lambda x: "%.2f" % x),
    ("display.width", 500),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the two competition CSV files; paths are relative to the Kaggle working directory.
test_df = pd.read_csv("../input/titanic/test.csv")
train_df = pd.read_csv("../input/titanic/train.csv")

**Learning Data**

In [None]:
def data_summary(dataframe):
    """
    Print a quick overview of a dataframe: shape, dtypes, null counts and descriptive statistics.

    Parameters:
    ----------------
        dataframe: dataframe
                dataframe to summarize

    Returns:
    ---------------
    None. Every section is written to standard output.
    """
    print("############## SHAPE ##############")
    # Report (rows, columns): the header announces the shape, not only the row count.
    print(dataframe.shape)
    print("############## TYPES ##############")
    print(dataframe.dtypes)
    print("############## NULL ##############")
    print(dataframe.isnull().sum())
    print("############ DESCRIBE ############")
    # Transposed so that each variable is a row and each statistic a column.
    print(dataframe.describe(percentiles=[0.0, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]).T)

In [None]:
# Overview of the test split (row count, dtypes, nulls, distribution of numeric columns).
data_summary(test_df)

In the test dataset, the variables "Age", "Cabin" and "Fare" have null values. Cabin in particular has 327 nulls; with 418 rows in total, about 78% of its values are unknown, so this variable may be a candidate for removal.

In [None]:
# Same overview for the training split.
data_summary(train_df)

In the train dataset, the variables "Age", "Cabin" and "Embarked" have null values. As in the test dataset, Cabin is mostly missing: 687 of the 891 rows are null, so about 77% of the Cabin values are unknown.

**Feature Engineering**

***Feature Extraction***

In [None]:
# Cabin_NA_FLAG is 0 when "Cabin" is missing and 1 when a cabin value is present.
train_df["Cabin_NA_FLAG"] = train_df["Cabin"].notnull().astype(int)
na_flags = train_df.columns[train_df.columns.str.contains("_NA")]

# For every flag column, show the survival rate and the group size of each flag value.
for col in na_flags:
    survived_by_flag = train_df.groupby(col)["Survived"]
    print(pd.DataFrame({"Target_Mean": survived_by_flag.mean(),
                        "Count": survived_by_flag.count()}))

The survival rate of people whose "Cabin" value is null was found to be 30%, while people with a known cabin number have a 67% survival rate. The difference appears to be significant. Reportedly, people without a cabin number in the Titanic disaster are "usually" ship crew; however, there are also passengers among the people with these null values. We need to examine this variable separately.

In [None]:
# Same flag on the test split: 0 when "Cabin" is missing, 1 when it is present.
test_df["Cabin_NA_FLAG"] = test_df["Cabin"].notnull().astype(int)
test_df.head()

In [None]:
# "Alone" is "Yes" when a row has no siblings/spouses and no parents/children listed, else "No".
train_relatives = train_df["SibSp"] + train_df["Parch"]
train_df.loc[train_relatives > 0, "Alone"] = "No"
train_df.loc[train_relatives == 0, "Alone"] = "Yes"

In [None]:
# Survival rate for each value of "Alone".
train_df.groupby("Alone")["Survived"].mean()

In [None]:
# Same "Alone" feature on the test split.
test_relatives = test_df["SibSp"] + test_df["Parch"]
test_df.loc[test_relatives > 0, "Alone"] = "No"
test_df.loc[test_relatives == 0, "Alone"] = "Yes"

In [None]:
# Two-proportion z-test: does the survival rate differ between "Alone" == "Yes" and "No"?
alone_survived = train_df.loc[train_df["Alone"] == "Yes", "Survived"]
not_alone_survived = train_df.loc[train_df["Alone"] == "No", "Survived"]

test_stat, pvalue = proportions_ztest(
    count = [alone_survived.sum(), not_alone_survived.sum()],
    nobs = [alone_survived.shape[0], not_alone_survived.shape[0]],
)

print("Test Stat = %.4f, p-value = %.4f" % (test_stat, pvalue))

In [None]:
# "Title" is the run of letters in "Name" that follows a space and is followed by a period.
# The pattern is a raw string: "\." inside a plain literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
train_df["Title"] = train_df.Name.str.extract(r" ([A-Za-z]+)\.", expand = False)
train_df.head()

In [None]:
# Per title: survival rate, number of non-null ages and mean age.
train_df[["Title", "Survived", "Age"]].groupby(["Title"]).agg({"Survived": "mean", "Age": ["count", "mean"]})

In [None]:
# Same "Title" extraction on the test split.
# The pattern is a raw string: "\." inside a plain literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
test_df["Title"] = test_df.Name.str.extract(r" ([A-Za-z]+)\.", expand = False)
test_df.head()

**Exploratory Data Analysis**

In [None]:
def categoric_data(dataframe, cat_th = 10, car_th=20):
    """
    Split the columns of a dataframe into categorical, numerical and cardinal groups.

    Parameters:
    ----------------
        dataframe: dataframe
                dataframe whose columns are classified
        cat_th: int, optional
                A numeric column with fewer than cat_th distinct values is treated as categorical
        car_th: int, optional
                A text/category column with more than car_th distinct values is treated as cardinal

    Returns:
    ---------------
    cat_cols: list
            Categorical variable list (bool, category, object/string columns plus the
            numeric-but-categorical ones, minus the cardinal ones)
    num_cols: list
            Numerical variable list (numeric columns that are not in num_but_cat)
    cat_but_car: list
            Categorical but cardinal variable list

    Notes:
    ---------------
    num_but_cat is computed internally, counted in the printed summary and merged into
    cat_cols; it is NOT returned.
    For frames made only of bool, category, object/string and numeric columns:
    cat_cols + num_cols + cat_but_car = total variables.
    Dtypes are classified with pandas.api.types, so every numeric width
    (int32, uint8, nullable Int64, ...) is recognised, not only int64/float64.
    """

    def _kind(dtype):
        # Order matters: category and bool are tested before the numeric check,
        # because is_numeric_dtype() also reports True for bool.
        if isinstance(dtype, pd.CategoricalDtype):
            return "category"
        if pd.api.types.is_bool_dtype(dtype):
            return "bool"
        if pd.api.types.is_numeric_dtype(dtype):
            return "numeric"
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            return "text"
        return "other"

    kinds = {col: _kind(dtype) for col, dtype in dataframe.dtypes.items()}
    n_unique = {col: dataframe[col].nunique() for col in dataframe.columns}

    cat_cols = [col for col in dataframe.columns if kinds[col] in ("category", "bool", "text")]
    num_but_cat = [col for col in dataframe.columns if kinds[col] == "numeric" and n_unique[col] < cat_th]
    cat_but_car = [col for col in dataframe.columns if kinds[col] in ("category", "text") and n_unique[col] > car_th]

    # Low-cardinality numeric columns join the categorical list; cardinal columns leave it.
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in dataframe.columns if kinds[col] == "numeric" and col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")

    return cat_cols, num_cols, cat_but_car

In [None]:
# Column groups of the training frame; these lists are reused by later cells.
cat_cols, num_cols, cat_but_car = categoric_data(train_df)

In [None]:
# Column groups of the test frame, computed independently of the training ones.
test_cat_cols, test_num_cols, test_cat_but_car = categoric_data(test_df)

**Target Variable Analysis**

In [None]:
def target_analysis(dataframe, target="Survived", col_name=None):
    """
    Print the mean of the target variable for every class of one column.

    Parameters:
    ----------------
        dataframe: dataframe
                dataframe that contains both the grouping column and the target
        target: str, optional
                Name of the target column
        col_name: str, optional
                Column to group by. When omitted, the function falls back to a
                module-level variable named `col` (the loop variable that existing
                callers set before calling), so older calls keep working.

    Returns:
    ---------------
    None. The table is written to standard output; nothing is printed when the
    grouping column is the target itself.

    Raises:
    ---------------
    NameError
            col_name is omitted and no module-level `col` exists.
    """
    if col_name is None:
        # Backward compatibility with callers that rely on a global loop variable.
        try:
            col_name = globals()["col"]
        except KeyError:
            raise NameError("name 'col' is not defined; pass col_name explicitly") from None
    if col_name != target:
        print(pd.DataFrame(dataframe.groupby([col_name])[target].mean()))
        print("#################################")

In [None]:
# target_analysis() is called without a column argument: it picks up the grouping column
# from the global loop variable `col`, so the loop variable must keep this exact name.
for col in cat_cols:
        target_analysis(train_df)

Being female and travelling in 1st class both appear to be associated with a higher chance of surviving the Titanic disaster.

***Outlier Detection***

In [None]:
def outlier_detection(dataframe, col_name, q1=0.05, q3=0.95):
    """
    Compute quantile-based outlier limits for one column and collect the rows outside them.

    Parameters:
    ----------------
        dataframe: dataframe
                dataframe that holds the column
        col_name : str
                Name of the column to inspect
              q1 : float, optional
                Lower quantile used as the base of the range
              q3 : float, optional
                Upper quantile used as the base of the range

    Returns:
    ---------------
    low_limit: float
            Lower quantile minus 1.5 times the quantile range
    up_limit : float
            Upper quantile plus 1.5 times the quantile range
    outliers: DataFrame
            Rows whose value is below low_limit or above up_limit

    Notes:
    ---------------
    The range is the distance between the q3 and q1 quantiles; with the defaults
    (0.05 / 0.95) it is wider than the classic inter quartile range.
    A short report is printed as a side effect.
    """
    values = dataframe[col_name]
    lower_quantile, upper_quantile = values.quantile(q1), values.quantile(q3)
    quantile_range = upper_quantile - lower_quantile

    low_limit = lower_quantile - 1.5 * quantile_range
    up_limit = upper_quantile + 1.5 * quantile_range

    outside_limits = (values < low_limit) | (values > up_limit)
    outliers = dataframe[outside_limits]

    report_lines = (
        f"col name: {col_name}",
        f"up limit: {up_limit}",
        f"low limit: {low_limit}",
        f"outliers: {outliers[col_name].count()}",
        "#########################################################",
    )
    print("\n".join(report_lines))
    return low_limit, up_limit, outliers
    

In [None]:
# The passenger identifier is not a measurement, so keep it out of the numeric analysis.
num_cols = [name for name in num_cols if "PassengerId" not in name]

In [None]:
# Print the outlier report for every numeric column of the training frame.
# low_limit / up_limit / outliers are overwritten on each pass, so after the loop
# they describe only the last column in num_cols.
for col in num_cols:
    low_limit, up_limit, outliers = outlier_detection(train_df, col)

In [None]:
# Distribution of "Fare" before any capping.
sns.boxplot(x = train_df["Fare"])
plt.show()

There are 3 passenger classes on the Titanic, so ticket prices naturally vary. However, the boxplot shows a large gap between 200 and 500, so the 3 values at the far end of that gap can be treated as outliers.

In [None]:
# NOTE(review): `outliers` is whatever the last iteration of the detection loop left behind;
# these are the Fare outliers only if "Fare" was the last column in num_cols — confirm.
Fare_Outliers = outliers["Fare"]
Fare_Outliers

To fix this situation we can use the re-assignment method: values above the upper limit are replaced by the upper limit.

In [None]:
# Compute the upper limit for "Fare" explicitly instead of reusing the `up_limit` left over
# from the detection loop, which is only the Fare limit if "Fare" was the last column visited.
# (outlier_detection prints its report again as a side effect.)
fare_up_limit = outlier_detection(train_df, "Fare")[1]
train_df.loc[(train_df["Fare"] > fare_up_limit), "Fare"] = fare_up_limit

In [None]:
# Check the largest fare after capping.
train_df["Fare"].max()

In [None]:
# Distribution of "Fare" after capping, for comparison with the earlier boxplot.
sns.boxplot(x = train_df["Fare"])
plt.show()

In [None]:
# Keep the passenger identifier out of the numeric analysis of the test frame as well.
test_num_cols = [name for name in test_num_cols if "PassengerId" not in name]

In [None]:
# Print the outlier report for every numeric column of the test frame.
# test_low_limit / test_up_limit / test_outliers keep only the values of the last column visited.
for col in test_num_cols:
    test_low_limit, test_up_limit, test_outliers = outlier_detection(test_df, col)

In [None]:
# Compute the upper limit for "Fare" explicitly instead of reusing the `test_up_limit` left over
# from the detection loop, which is only the Fare limit if "Fare" was the last column visited.
# (outlier_detection prints its report again as a side effect.)
test_fare_up_limit = outlier_detection(test_df, "Fare")[1]
test_df.loc[(test_df["Fare"] > test_fare_up_limit), "Fare"] = test_fare_up_limit

***Missing Values***

In [None]:
def missing_values(dataframe, col_name):
    """
    Print the number and the percentage of null values of one column.

    Parameters:
    ----------------
        dataframe: dataframe
                dataframe that holds the column
        col_name: str
                Name of the column to report on

    Returns:
    ---------------
    None. The report is written to standard output.
    """
    null_count = dataframe[col_name].isnull().sum()
    null_percentage = null_count / dataframe.shape[0]*100
    print(
        f"col_name:{col_name}",
        f"null:{null_count}",
        f"Percentage:{null_percentage}",
        "################################################################",
        sep="\n",
    )

In [None]:
# Report every training column that has at least one null value.
# A plain loop is used because missing_values() only prints: a list comprehension would
# build a throwaway list of None values and display it as the cell output.
for null_col in train_df.columns:
    if train_df[null_col].isnull().sum() > 0:
        missing_values(train_df, null_col)

In [None]:
# Report every test column that has at least one null value.
# A plain loop is used because missing_values() only prints: a list comprehension would
# build a throwaway list of None values and display it as the cell output.
for null_col in test_df.columns:
    if test_df[null_col].isnull().sum() > 0:
        missing_values(test_df, null_col)

In [None]:
# Working copy used for imputation: the categorical and numeric columns, one-hot encoded
# with the first level of each encoded column dropped.
dff = pd.get_dummies(train_df[cat_cols + num_cols], drop_first = True)

In [None]:
# Same working copy for the test frame, built from the test frame's own column groups.
test_dff = pd.get_dummies(test_df[test_cat_cols + test_num_cols], drop_first = True)

In [None]:
# Min-max scale every column of dff (fit on dff itself) and rebuild a DataFrame with the
# same column names. From here on dff holds scaled values, not the original units.
scaler = MinMaxScaler()
dff = pd.DataFrame(scaler.fit_transform(dff), columns = dff.columns ) 
dff.head()

In [None]:
# NOTE(review): fit_transform refits the shared `scaler` on the test frame, so test columns
# are scaled with the test split's own min/max and the fit on the training frame is overwritten.
test_dff = pd.DataFrame(scaler.fit_transform(test_dff), columns = test_dff.columns ) 
test_dff.head()

In [None]:
# Fill the missing entries of the scaled training copy with a 5-neighbour KNN imputer.
# NOTE(review): every column of dff takes part in the neighbour search; if cat_cols contains
# the target "Survived", the target is part of the imputer's input — confirm this is intended.
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors = 5)
dff = pd.DataFrame(imputer.fit_transform(dff), columns = dff.columns )  

In [None]:
# NOTE(review): the imputer is refit on the test copy, so test values are filled from
# test-set neighbours only.
test_dff = pd.DataFrame(imputer.fit_transform(test_dff), columns = test_dff.columns )  

In [None]:
# dff["Age"] holds min-max scaled values (dff was scaled with a MinMaxScaler fit on itself),
# so it must be mapped back to the original units before it replaces "Age"; otherwise the
# column ends up in [0, 1]. Min-max scaling is x_scaled = (x - min) / (max - min), and the
# min/max are those of the observed (non-null) ages still stored in train_df["Age"].
age_min, age_max = train_df["Age"].min(), train_df["Age"].max()
train_df["age_knn"] = dff["Age"] * (age_max - age_min) + age_min
train_df.loc[train_df["Age"].isnull(), ["Age", "age_knn"]].head()

In [None]:
# test_dff["Age"] holds min-max scaled values (test_dff was scaled with a scaler fit on
# itself), so it must be mapped back to the original units before it replaces "Age".
# Min-max scaling is x_scaled = (x - min) / (max - min), and the min/max are those of the
# observed (non-null) ages still stored in test_df["Age"].
test_age_min, test_age_max = test_df["Age"].min(), test_df["Age"].max()
test_df["age_knn"] = test_dff["Age"] * (test_age_max - test_age_min) + test_age_min
test_df.loc[test_df["Age"].isnull(), ["Age", "age_knn"]].head()

"Age" is the original column, which contains null values. "age_knn" is the new column, holding the values filled in by the KNN method.

In [None]:
# Move the KNN-filled values into "Age"; pop() returns the helper column and removes it
# from the frame in the same step.
train_df["Age"] = train_df.pop("age_knn")
train_df.head()

In [None]:
# Move the KNN-filled values into "Age"; pop() returns the helper column and removes it
# from the frame in the same step.
test_df["Age"] = test_df.pop("age_knn")
test_df.head()

In [None]:
# Fill missing "Embarked" values with the most frequent value of the column.
train_df["Embarked"] = train_df["Embarked"].fillna(train_df["Embarked"].mode()[0])

In [None]:
# Fill missing "Fare" values in the test frame with the test frame's own median fare.
test_df["Fare"] = test_df["Fare"].fillna(test_df["Fare"].median())

In [None]:
# Remaining null counts per column in the training frame after the fills above.
train_df.isnull().sum()

In [None]:
# Remaining null counts per column in the test frame after the fills above.
test_df.isnull().sum()

In [None]:
# Two-proportion z-test: does the survival rate differ between rows with a cabin value
# (Cabin_NA_FLAG == 1) and rows without one (Cabin_NA_FLAG == 0)?
cabin_known_survived = train_df.loc[train_df["Cabin_NA_FLAG"] == 1, "Survived"]
cabin_missing_survived = train_df.loc[train_df["Cabin_NA_FLAG"] == 0, "Survived"]

test_stat, pvalue = proportions_ztest(
    count = [cabin_known_survived.sum(), cabin_missing_survived.sum()],
    nobs = [cabin_known_survived.shape[0], cabin_missing_survived.shape[0]],
)

print("Test Stat = %.4f, p-value = %.4f" % (test_stat, pvalue))

In [None]:
# The raw "Cabin" column is removed in place; only Cabin_NA_FLAG is kept from it.
del train_df["Cabin"]
train_df.isnull().sum()

In [None]:
# The raw "Cabin" column is removed in place; only Cabin_NA_FLAG is kept from it.
del test_df["Cabin"]
test_df.isnull().sum()

***Encoding and Standardization***

In [None]:
# Encode "Sex" as integers; the last line displays which original labels map to 0 and 1.
lab = LabelEncoder()
train_df["Sex"] = lab.fit_transform(train_df["Sex"])
lab.inverse_transform([0,1])

In [None]:
# Reuse the mapping learned from the training "Sex" column (fit in the previous cell) instead
# of refitting on the test split: a refit could assign different integers if the test labels
# differed, whereas transform() raises on a label that training never saw.
test_df["Sex"] = lab.transform(test_df["Sex"])
lab.inverse_transform([0,1])

In [None]:
# Refit the same encoder on "Alone" (its previous fit is replaced) and encode the column;
# the last line displays which original labels map to 0 and 1.
train_df["Alone"] = lab.fit_transform(train_df["Alone"])
lab.inverse_transform([0,1])

In [None]:
# Reuse the mapping learned from the training "Alone" column (fit in the previous cell) instead
# of refitting on the test split: a refit could assign different integers if the test labels
# differed, whereas transform() raises on a label that training never saw.
test_df["Alone"] = lab.transform(test_df["Alone"])
lab.inverse_transform([0,1])

In [None]:
# Columns to one-hot encode: those with 3 to 8 distinct values in the training frame.
# NOTE(review): the list depends on the values present in this split, so it can differ
# from the list computed for the test frame.
one_hot_train = [col for col in train_df.columns if 8 >= train_df[col].nunique() > 2]
one_hot_train

In [None]:
# Columns to one-hot encode: those with 3 to 8 distinct values in the test frame.
# NOTE(review): computed from the test split alone, so it can differ from one_hot_train.
one_hot_test = [col for col in test_df.columns if 8 >= test_df[col].nunique() > 2]
one_hot_test

In [None]:
# One-hot encode the selected training columns, dropping the first level of each.
train_df = pd.get_dummies(train_df, columns = one_hot_train, drop_first = True)
train_df.head()

In [None]:
# Rows and columns of the training frame after encoding.
train_df.shape

In [None]:
# One-hot encode the selected test columns, dropping the first level of each.
# The resulting dummy columns follow the values present in the test frame.
test_df = pd.get_dummies(test_df, columns = one_hot_test, drop_first = True)
test_df.head()

In [None]:
# Rows and columns of the test frame after encoding, to compare with the training frame.
test_df.shape

In [None]:
# Binary training columns in which one of the two values covers less than 1% of the rows.
# The list is only displayed here; no visible cell removes these columns.
useless_train_cols = [col for col in train_df.columns if train_df[col].nunique() == 2 and (train_df[col].value_counts() / len(train_df) < 0.01).any(axis=None)]
useless_train_cols

In [None]:
# Binary test columns in which one of the two values covers less than 1% of the rows.
# The list is only displayed here; no visible cell removes these columns.
useless_test_cols = [col for col in test_df.columns if test_df[col].nunique() == 2 and (test_df[col].value_counts() / len(test_df) < 0.01).any(axis=None)]
useless_test_cols

**Modelling**

In [None]:
# Target and feature matrix for training; identifiers and free-text columns are left out.
y_train = train_df["Survived"]
x_train = train_df.drop(["PassengerId", "Survived", "Name", "Ticket", "Title"], axis = 1)
# Align the test matrix to the exact training feature set and column order instead of
# hard-coding which test-only dummy to drop ("Parch_9"): columns that training does not have
# are discarded, and training dummies absent from the test frame are added as all-zero columns.
x_test = test_df.reindex(columns = x_train.columns, fill_value = 0)

In [None]:
# Fit a random forest with a fixed random_state on the training matrix, then predict the test rows.
model = RandomForestClassifier(random_state = 46).fit(x_train, y_train)
pred = model.predict(x_test)

In [None]:
# Pair each test PassengerId with its prediction and write the file without the index column.
output = pd.DataFrame({'PassengerId': test_df.PassengerId, 'Survived': pred})
output.to_csv('submission.csv', index=False)

In [None]:
# `pred` was produced by this same model from `x_test`, so model.score(x_test, pred) compares
# the model with itself and is always 1.0. No true labels for the test rows exist in this
# notebook, so accuracy is estimated on a stratified hold-out slice of the training data.
x_fit, x_val, y_fit, y_val = train_test_split(
    x_train, y_train, test_size = 0.2, random_state = 46, stratify = y_train
)
val_model = RandomForestClassifier(random_state = 46).fit(x_fit, y_fit)
score = accuracy_score(y_val, val_model.predict(x_val))
score

In [None]:
# Rank the features by the fitted model's feature_importances_ (highest first).
# The frame is sorted once; it was previously sorted before construction and again for plotting.
feature_importance = (
    pd.DataFrame({"Value": model.feature_importances_, "Feature": x_train.columns})
    .sort_values(by = "Value", ascending = False)
    .reset_index(drop = True)
)
fig, ax = plt.subplots(figsize = (10, 5))
sns.barplot(x = "Value", y = "Feature", data = feature_importance, ax = ax)
ax.set_title("Random forest feature importance")
fig.tight_layout()
plt.show()

Order of importance:
    Age > Fare > Sex > Pclass_3 > Cabin_NA_FLAG > Embarked_S > Alone

As we can see, the flag derived from "Cabin" (a variable with many null values) is the 5th most important feature for predicting survival. Keeping this information instead of discarding the column was the right decision.

In [None]:
# Preview the first 20 rows of the submission frame.
output.head(20)