In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

import seaborn as sns
# Configure the seaborn theme once: each sns.set() call resets all theme
# options, so an earlier style="white" call would simply be overwritten.
# color_codes=True remaps matplotlib's shorthand colours ("b", "g", ...) to
# the seaborn palette.
sns.set(style="whitegrid", color_codes=True)

In [None]:
DATA_DIR = './titanic'

# os.listdir() returns entries in arbitrary order, but everything below splits
# train/test by position and needs the labelled rows first. Sorting the names
# in reverse makes the order deterministic (e.g. "train.csv" before "test.csv"
# -- assumes the usual Kaggle file names; TODO confirm). Non-CSV entries such
# as checkpoint directories are skipped.
data_files = [
    os.path.join(DATA_DIR, filename)
    for filename in sorted(os.listdir(DATA_DIR), reverse=True)
    if filename.lower().endswith(".csv")
]
df_list = [pd.read_csv(filename) for filename in data_files]
df = pd.concat(df_list, sort=False)
df = df.reset_index(drop=True)

# Rows with a label are the training set, rows without one the test set.
number_of_train_dataset = df.Survived.notnull().sum()
number_of_test_dataset = df.Survived.isnull().sum()

# Guard the positional split used for y_true here and for X_train / X_test later.
assert df["Survived"][:number_of_train_dataset].notnull().all(), \
    "labelled (train) rows must come before unlabelled (test) rows"

# Remove the label from the feature frame and keep only the labelled prefix.
y_true = df.pop("Survived")[:number_of_train_dataset]

df.tail()

In [None]:
# Quick structural overview of the combined frame. Only df.info() (which
# prints) and the last expression produce visible output in a notebook cell;
# the other expressions are evaluated and their results discarded.
df.dtypes
df.info()
df.isnull().sum()
df.describe()
df.head(2).T


In [None]:
# Render floats with two decimals in every table displayed from here on.
pd.options.display.float_format = '{:.2f}'.format

# Percentage of missing values per column.
missing_counts = df.isnull().sum()
missing_counts / len(df) * 100

In [None]:
# Mean age per sex, computed over the passengers whose age is known.
age_known = df["Age"].notnull()
df.loc[age_known].groupby("Sex")["Age"].mean()

In [None]:
# Mean age per passenger class, computed over the passengers whose age is known.
age_known = df["Age"].notnull()
df.loc[age_known].groupby("Pclass")["Age"].mean()

In [None]:
# Impute missing ages with the mean age of the passenger's class.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is a chained in-place update, which recent pandas versions
# deprecate and which may leave df itself unchanged.
class_mean_age = df.groupby("Pclass")["Age"].transform("mean")
df["Age"] = df["Age"].fillna(class_mean_age)

# Percentage of missing values per column after the imputation.
df.isnull().sum() / len(df) * 100

In [None]:
df.groupby("Pclass")["Age"].transform("mean")

### One-hot encoding

In [None]:
# Columns that are cast to object dtype (categorical / text values).
object_columns = [
    "PassengerId", "Pclass", "Name", "Sex",
    "Ticket", "Cabin", "Embarked",
]
# Columns that are cast to numeric dtypes.
numeric_columns = [
    "Age", "SibSp", "Parch", "Fare",
]

In [None]:
# Collect the target dtype of every column, then cast in a single pass.
target_dtypes = {col_name: object for col_name in object_columns}
target_dtypes.update({col_name: float for col_name in numeric_columns})
# The two family-size counters end up as integers.
target_dtypes.update({"Parch": int, "SibSp": int})
df = df.astype(target_dtypes)


In [None]:
df.info()

In [None]:
def merge_and_get(ldf, rdf, on=None, how="inner", index=None):
    """Merge two pandas objects on their indexes or on key column(s).

    Parameters
    ----------
    ldf, rdf
        Left and right objects handed to ``pd.merge``.
    on
        Column name(s) to join on. Ignored when ``index`` is True, which is
        why it defaults to None.
    how
        Join type forwarded to ``pd.merge`` (default ``"inner"``).
    index
        When exactly ``True``, join on the indexes of both objects; any other
        value joins on ``on``.

    Returns
    -------
    The merged DataFrame produced by ``pd.merge``.
    """
    if index is True:
        return pd.merge(ldf, rdf, how=how, left_index=True, right_index=True)
    return pd.merge(ldf, rdf, how=how, on=on)

In [None]:
# Fill every missing port of embarkation with "S". This replaces writes to the
# hard-coded row labels 61 and 829, which only address the intended passengers
# for one particular row order of the concatenated frame.
df["Embarked"] = df["Embarked"].fillna("S")

In [None]:
# Append indicator (dummy) columns for each categorical feature, joined on the
# row index. The raw columns stay in the frame.
one_hot_df = df
for col_name in ["Sex", "Pclass", "Embarked"]:
    dummies = pd.get_dummies(df[col_name], prefix=col_name)
    one_hot_df = merge_and_get(one_hot_df, dummies, on=None, index=True)


In [None]:
temp_columns = ["Sex", "Pclass", "Embarked"]

# One count plot per categorical feature; the inner join on the index keeps
# only the rows that have a label.
for col_name in temp_columns:
    temp_df = pd.merge(
        one_hot_df[col_name], y_true,
        left_index=True, right_index=True)
    sns.countplot(data=temp_df, x="Survived", hue=col_name)
    plt.show()

In [None]:
# Embarkation port counts, coloured by class, one panel per survival outcome.
temp_df = pd.merge(
    one_hot_df[temp_columns], y_true,
    left_index=True, right_index=True)
g = sns.catplot(
    data=temp_df, kind="count",
    x="Embarked", hue="Pclass", col="Survived",
    height=4, aspect=.7);

In [None]:
# Passenger class counts, coloured by sex, one panel per survival outcome.
temp_df = pd.merge(
    one_hot_df[temp_columns], y_true,
    left_index=True, right_index=True)
g = sns.catplot(
    data=temp_df, kind="count",
    x="Pclass", hue="Sex", col="Survived",
    height=4, aspect=.7);

In [None]:
# Embarkation port counts, coloured by sex, one panel per survival outcome.
temp_df = pd.merge(
    one_hot_df[temp_columns], y_true,
    left_index=True, right_index=True)
g = sns.catplot(
    data=temp_df, kind="count",
    x="Embarked", hue="Sex", col="Survived",
    height=4, aspect=.7);

In [None]:
# Indicator columns created for Sex / Pclass / Embarked ("<prefix>_<value>"),
# plus the raw "Sex" column.
crosscheck_columns = [
    col_name for col_name in one_hot_df.columns.tolist()
    if "_" in col_name and col_name.split("_")[0] in temp_columns
] + ["Sex"]

temp_df = pd.merge(one_hot_df[crosscheck_columns], y_true, left_index=True, right_index=True)

# The raw "Sex" column holds strings. Older pandas silently dropped such
# columns in corr(); pandas >= 2.0 raises instead, so restrict the correlation
# to numeric / boolean columns explicitly.
corr = temp_df.select_dtypes(include=["number", "bool"]).corr()
sns.set()
ax = sns.heatmap(corr, annot=True, linewidths=.5, cmap="YlGnBu")

### Mean Encoding

In [None]:
# Survival rate per passenger class, over the rows that have a label.
temp_df = pd.merge(one_hot_df["Pclass"], y_true, left_index=True, right_index=True)
survived_by_class = temp_df.groupby("Pclass")["Survived"]
survived_by_class.mean()

In [None]:
# Preview of mean encoding: each class label replaced by that class's survival rate.
class_survival_rate = temp_df.groupby("Pclass")["Survived"].mean()
temp_df["Pclass"].replace(class_survival_rate)

In [None]:
temp_columns = ["Sex", "Pclass", "Embarked"]

# Mean (target) encoding: every category value is replaced by the survival
# rate observed for that value among the labelled rows.
me_list = []
for col_name in temp_columns:
    temp_df = pd.merge(one_hot_df[col_name], y_true, left_index=True, right_index=True)
    survival_rate = temp_df.groupby(col_name)["Survived"].mean()
    temp_df["me_" + col_name] = temp_df[col_name].replace(survival_rate)
    me_list.append(temp_df.drop("Survived", axis=1))

# Keep only the encoded columns and correlate them with the label.
encoded_names = ["me_" + name for name in temp_columns]
encoded_df = pd.concat(me_list, axis=1)[encoded_names]
temp_df = pd.merge(encoded_df, y_true, left_index=True, right_index=True)
corr = temp_df.corr()
sns.set()
ax = sns.heatmap(corr, annot=True, linewidths=.5, cmap="YlGnBu")

### Categorical Combination

In [None]:
temp_columns = ["Sex", "Pclass", "Embarked"]

# Combined categorical features: the string forms of two columns concatenated.
pclass_text = df["Pclass"].map(str)
one_hot_df["Sex-Pclass"] = df["Sex"].map(str) + pclass_text
one_hot_df["Embarked-Pclass"] = df["Embarked"].map(str) + pclass_text

# Indicator columns for both combined features, joined on the row index.
for combined_col, prefix in (("Sex-Pclass", "SexPclass"),
                             ("Embarked-Pclass", "EmbarkedPclass")):
    combined_dummies = pd.get_dummies(one_hot_df[combined_col], prefix=prefix)
    one_hot_df = merge_and_get(one_hot_df, combined_dummies, on=None, index=True)

# Select exactly the indicator columns created above ("<prefix>_<value>").
crosscheck_columns = [
    col_name for col_name in one_hot_df.columns.tolist()
    if "_" in col_name and col_name.split("_")[0] in ["SexPclass", "EmbarkedPclass"]
]

temp_df = pd.merge(one_hot_df[crosscheck_columns], y_true, left_index=True, right_index=True)

corr = temp_df.corr()
plt.subplots(figsize=(20, 15))
ax = sns.heatmap(corr, annot=True, linewidths=.5, cmap="YlGnBu")

In [None]:
corr

### Numeric data types

In [None]:
# Numeric features joined with the label (labelled rows only), plotted pairwise.
numeric_features = one_hot_df[numeric_columns]
temp_df = pd.merge(numeric_features, y_true, left_index=True, right_index=True)

sns.pairplot(temp_df)

In [None]:
# Correlation heatmap of the numeric features and the label.
sns.set()
plt.subplots(figsize=(20, 15))
corr = temp_df.corr()
ax = sns.heatmap(corr, cmap="YlGnBu", annot=True, linewidths=.8)

In [None]:
# Mean fare per number of siblings/spouses, split by survival.
sns.barplot(data=temp_df, x="SibSp", y="Fare", hue="Survived",
            ci=68, capsize=.2)

In [None]:
# Mean fare per number of parents/children, split by survival.
sns.barplot(data=temp_df, x="Parch", y="Fare", hue="Survived",
            ci=68, capsize=.2)

In [None]:
# Mean age per number of siblings/spouses, split by survival.
sns.barplot(data=temp_df, x="SibSp", y="Age", hue="Survived",
            ci=68, capsize=.2)

In [None]:
# Mean age per number of parents/children, split by survival.
# NOTE(review): this cell was an exact copy of the SibSp/Age plot directly
# above it; Parch/Age completes the SibSp/Parch x Fare/Age series -- confirm.
sns.barplot(x="Parch", y="Age", hue="Survived", data=temp_df, ci=68, capsize=.2)

### Binning

In [None]:
one_hot_df["Fare"].value_counts()

In [None]:
one_hot_df["Fare"].hist(bins=50)

In [None]:
one_hot_df

In [None]:
# Histogram of every numeric column on a 2x2 grid.
fig = plt.figure()
fig.set_size_inches(10, 5)  # figure size in inches

ax = []
for i, col_name in enumerate(numeric_columns):
    ax.append(fig.add_subplot(2, 2, i + 1))  # next panel of the 2x2 grid
    X_1 = one_hot_df[col_name]

    ax[i].hist(X_1)
    ax[i].set_title(col_name)

In [None]:
df["Parch"].value_counts()

In [None]:
# Bucket Parch into four labelled intervals and append the indicator columns.
bins = [-1, 0, 2, 5, 9]
group_name = ["l1", "l2", "l3", "l4"]
parch_binned = pd.cut(df["Parch"], bins, labels=group_name)
bin_one_hot_df = merge_and_get(
    one_hot_df, pd.get_dummies(parch_binned, prefix="parch"),
    on=None, index=True)

# Same treatment for SibSp, with its own interval edges.
bins = [-1, 0, 1, 4, 10]
group_name = ["l1", "l2", "l3", "l4"]
sibsp_binned = pd.cut(df["SibSp"], bins, labels=group_name)
bin_one_hot_df = merge_and_get(
    bin_one_hot_df, pd.get_dummies(sibsp_binned, prefix="SibSp"),
    on=None, index=True)

bin_one_hot_df

### Log Transform

In [None]:
# Distribution of every numeric column on a 2x2 grid.
fig = plt.figure()
fig.set_size_inches(10, 5)  # figure size in inches

ax = []
for i, col_name in enumerate(numeric_columns):
    ax.append(fig.add_subplot(2, 2, i + 1))  # next panel of the 2x2 grid
    X_1 = bin_one_hot_df[col_name]

    # Draw on the panel explicitly rather than relying on the implicit
    # "current axes" state.
    ax[i] = sns.distplot(X_1, bins=10, ax=ax[i])
    ax[i].set_title(col_name)

In [None]:
# Distribution of every numeric column after a log10 transform, on a 2x2 grid.
fig = plt.figure()
fig.set_size_inches(10, 5)  # figure size in inches

ax = []
for i, col_name in enumerate(numeric_columns):
    ax.append(fig.add_subplot(2, 2, i + 1))  # next panel of the 2x2 grid
    # The 0.5 offset keeps log10 finite for zero-valued entries.
    X_1 = np.log10(bin_one_hot_df[col_name] + 0.5)

    # Draw on the panel explicitly rather than relying on the implicit
    # "current axes" state.
    ax[i] = sns.distplot(X_1, bins=10, ax=ax[i])
    ax[i].set_title(col_name)

In [None]:
bin_one_hot_df.isnull().sum()

In [None]:
# Replace missing fares with the mean fare of the whole frame.
fare_mean = bin_one_hot_df["Fare"].mean()
bin_one_hot_df["Fare"] = bin_one_hot_df["Fare"].fillna(fare_mean)

In [None]:
# Add "log_fare" = log10(Fare + 0.5) as a new last column; "Fare" itself is kept.
log_bin_one_hot_df = bin_one_hot_df.assign(
    log_fare=np.log10(bin_one_hot_df["Fare"] + 0.5))
log_bin_one_hot_df

In [None]:
object_columns

In [None]:
plot_columns = numeric_columns + object_columns
temp_df = pd.merge(log_bin_one_hot_df[plot_columns], y_true, left_index=True, right_index=True)

# Fare distribution per category, with the two survival outcomes as violin halves.
for col_name in ("Pclass", "Sex", "Embarked"):
    sns.violinplot(data=temp_df, x=col_name, y="Fare", hue="Survived", split=True)
    plt.show()

In [None]:
plot_columns = numeric_columns + object_columns
temp_df = pd.merge(log_bin_one_hot_df[plot_columns], y_true, left_index=True, right_index=True)

# Age distribution per category, with the two survival outcomes as violin halves.
for col_name in ("Pclass", "Sex", "Embarked"):
    sns.violinplot(data=temp_df, x=col_name, y="Age", hue="Survived", split=True)
    plt.show()

In [None]:
plot_columns = numeric_columns + object_columns
temp_df = pd.merge(log_bin_one_hot_df[plot_columns], y_true, left_index=True, right_index=True)

# SibSp distribution per category, with the two survival outcomes as violin halves.
for col_name in ("Pclass", "Sex", "Embarked"):
    sns.violinplot(data=temp_df, x=col_name, y="SibSp", hue="Survived", split=True)
    plt.show()

In [None]:
temp_df = pd.merge(log_bin_one_hot_df[numeric_columns+object_columns], y_true, left_index=True, right_index=True)

# Age of every passenger per category, with the survival outcomes side by side.
# swarmplot's keyword for separating hue levels is `dodge`; `split` is its old
# name and is not accepted by current seaborn releases.
for col_name in ["Pclass","Sex","Embarked" ]:
    sns.swarmplot(x=col_name, y="Age", hue="Survived", data=temp_df, dodge=True)
    plt.show()

In [None]:
temp_df = pd.merge(log_bin_one_hot_df[numeric_columns+object_columns], y_true, left_index=True, right_index=True)

# One swarm plot per numeric feature, grouped by survival outcome.
# The former `split=True` argument is dropped: it is the old name of `dodge`
# (not accepted by current seaborn), and with no `hue` there are no levels to
# separate anyway.
for col_name in ["Age","Fare","SibSp" ]:
    sns.swarmplot(x="Survived", y=col_name, data=temp_df)
    plt.show()

In [None]:
plot_columns = numeric_columns + object_columns
temp_df = pd.merge(log_bin_one_hot_df[plot_columns], y_true, left_index=True, right_index=True)
# Age against each other numeric feature, coloured by survival.
for col_name in ("Parch", "Fare", "SibSp"):
    sns.scatterplot(data=temp_df, x="Age", y=col_name, hue="Survived")
    plt.show()

In [None]:
plot_columns = numeric_columns + object_columns
temp_df = pd.merge(log_bin_one_hot_df[plot_columns], y_true, left_index=True, right_index=True)
# Fare against each other numeric feature, coloured by survival.
for col_name in ("Parch", "Age", "SibSp"):
    sns.scatterplot(data=temp_df, x="Fare", y=col_name, hue="Survived")
    plt.show()

In [None]:
plot_columns = numeric_columns + object_columns
temp_df = pd.merge(log_bin_one_hot_df[plot_columns], y_true, left_index=True, right_index=True)
# SibSp against each other numeric feature, coloured by survival.
for col_name in ("Parch", "Age", "Fare"):
    sns.scatterplot(data=temp_df, x="SibSp", y=col_name, hue="Survived")
    plt.show()

In [None]:
plot_columns = numeric_columns + object_columns
temp_df = pd.merge(log_bin_one_hot_df[plot_columns], y_true, left_index=True, right_index=True)
# Parch against each other numeric feature, coloured by survival.
for col_name in ("SibSp", "Age", "Fare"):
    sns.scatterplot(data=temp_df, x="Parch", y=col_name, hue="Survived")
    plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the numeric columns into polynomial terms up to degree 3.
poly_features = PolynomialFeatures(degree=3)
poly_matrix = poly_features.fit_transform(log_bin_one_hot_df[numeric_columns])
X_poly = pd.DataFrame(poly_matrix)

# Put the label next to the expanded features (aligned on the index) and plot
# the correlation matrix.
temp_df = pd.concat([X_poly, y_true], axis=1)
corr = temp_df.corr()
sns.set()
plt.subplots(figsize=(20, 15))
ax = sns.heatmap(corr, cmap="YlGnBu", linewidths=.8)

### String handling

In [None]:
from collections import Counter

# Three most frequent whitespace-separated tokens across all lower-cased names.
name_tokens = " ".join(log_bin_one_hot_df["Name"].str.lower()).split()
Counter(name_tokens).most_common(3)

In [None]:
# Title flags derived from the lower-cased name.
# str.contains() treats the pattern as a regular expression, where an
# unescaped "." matches any character -- so "mr." also matched "mrs.". The dot
# is escaped and the title is anchored on a word boundary so each flag matches
# only its own title.
lower_names = log_bin_one_hot_df["Name"].str.lower()
log_bin_one_hot_df["is_mr"] = lower_names.str.contains(r"\bmr\.")
log_bin_one_hot_df["is_miss"] = lower_names.str.contains(r"\bmiss\.")
log_bin_one_hot_df["is_mrs"] = lower_names.str.contains(r"\bmrs\.")

In [None]:
log_bin_one_hot_df["Ticket"].str.rsplit(expand=True, )

In [None]:
# All digit runs found in each ticket string. The pattern is a raw string:
# "\d" inside a normal string literal is an invalid escape sequence, which
# Python reports with a warning.
log_bin_one_hot_df["Ticket"].str.extractall(r'(\d+)')

In [None]:
# Thirty most frequent whitespace-separated tokens across all lower-cased tickets.
ticket_tokens = " ".join(log_bin_one_hot_df["Ticket"].str.lower()).split()
Counter(ticket_tokens).most_common(30)

In [None]:
# Ticket-prefix flags derived from the lower-cased ticket string.
# The patterns are matched literally (regex=False). As regular expressions the
# dots matched any character, so e.g. "s.o" also matched the "sto" in "ston".
lower_tickets = log_bin_one_hot_df["Ticket"].str.lower()
log_bin_one_hot_df["is_pc"] = lower_tickets.str.contains("pc", regex=False)
log_bin_one_hot_df["is_ca"] = (
    lower_tickets.str.contains("c.a.", regex=False)
    | lower_tickets.str.contains("ca", regex=False)
)
log_bin_one_hot_df["is_paris"] = lower_tickets.str.contains("paris", regex=False)
log_bin_one_hot_df["is_soton"] = lower_tickets.str.contains("soton", regex=False)
log_bin_one_hot_df["is_ston"] = lower_tickets.str.contains("ston", regex=False)
log_bin_one_hot_df["is_so"] = lower_tickets.str.contains("s.o", regex=False)


In [None]:
log_bin_one_hot_df

In [None]:
log_bin_one_hot_df.isnull().sum()

In [None]:
# Missing cabins get the sentinel string "99999" so the string methods work.
test = log_bin_one_hot_df["Cabin"].fillna("99999")

# Thirty most frequent whitespace-separated tokens across all lower-cased cabins.
cabin_tokens = " ".join(test.str.lower()).split()
Counter(cabin_tokens).most_common(30)

In [None]:
# Rows whose cabin was missing carry the "99999" sentinel.
log_bin_one_hot_df["is_cabin_none"] = test.str.contains(pat = '9999')

# One flag per deck letter. `test` is not lower-cased (only a temporary copy
# was, for the token count), so the letters are matched case-insensitively;
# with upper-case cabin codes the lower-case patterns would never match
# (presumably the codes are upper-case, e.g. "C85" -- verify against the data).
for deck in "abcdefg":
    log_bin_one_hot_df["is_cabin_" + deck] = test.str.contains(deck, case=False, regex=False)

In [None]:
log_bin_one_hot_df.head(2).T

In [None]:
log_bin_one_hot_df.isnull().sum()

In [None]:
def count_cabin(x):
    """Return 0 when ``x`` is an int placeholder, otherwise ``len(x)``."""
    return 0 if type(x) is int else len(x)
# Split each Cabin string on single spaces; missing cabins become the integer
# placeholder 0, which count_cabin maps to a count of 0.
cabin_parts = log_bin_one_hot_df["Cabin"].str.split(" ").fillna(0)
log_bin_one_hot_df["number_of_Cabin"] = cabin_parts.map(count_cabin)

# The 0.01 offset keeps the logarithm finite for passengers with no cabin.
shifted_cabin_count = log_bin_one_hot_df["number_of_Cabin"] + 0.01
log_bin_one_hot_df["log_number_of_Cabin"] = np.log(shifted_cabin_count)

In [None]:
# Number of passengers per cabin count. The series is passed as `x=`
# explicitly: current seaborn releases interpret the first positional argument
# as `data`, not as the x variable.
sns.countplot(x=log_bin_one_hot_df["number_of_Cabin"])

In [None]:
# Cabin-count features joined with the label (labelled rows only).
cabin_features = log_bin_one_hot_df[["number_of_Cabin", "log_number_of_Cabin"]]
temp_df = pd.merge(cabin_features, y_true, left_index=True, right_index=True)

# Mean cabin count per survival outcome, then the correlation table.
sns.barplot(data=temp_df, x="Survived", y="number_of_Cabin")
plt.show()
print(temp_df.corr())

### Feature Elimination

In [None]:
# Names of every column engineered so far.
features = list(log_bin_one_hot_df.columns)
features

In [None]:
# Identifier and free-text columns that are not used as model inputs.
elimination_features = ['PassengerId', 'Name', 'Cabin', 'Ticket']

# drop() returns a new frame, so log_bin_one_hot_df itself is left untouched.
all_df = log_bin_one_hot_df.drop(columns=elimination_features)

In [None]:
all_df

In [None]:
# Drop the raw categorical columns together with the combined label columns.
# Each of them is already represented by indicator columns, and the
# string-valued ones (such as "Sex") cannot be converted to numbers when the
# feature matrix is handed to the scikit-learn estimators below.
# errors="ignore" keeps the cell safe to re-run.
all_df = all_df.drop(
    columns=["Sex", "Pclass", "Embarked", "Sex-Pclass", "Embarked-Pclass"],
    errors="ignore")

In [None]:
# The first number_of_train_dataset rows are the labelled (training) rows;
# the remaining rows form the test set.
train_rows = all_df.iloc[:number_of_train_dataset]
test_rows = all_df.iloc[number_of_train_dataset:]

X_train = train_rows.values
X_test = test_rows.values
y_train = y_true.copy()

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Recursive feature elimination driven by a random forest.
# random_state pins the forest so the selected feature set is the same on
# every run of the notebook.
select = RFE(RandomForestClassifier(n_estimators=100, random_state=0))

select.fit(X_train, y_train)

# Reduce the training matrix to the selected columns.
X_train_selected = select.transform(X_train)
X_train_selected.shape

In [None]:
select.get_support()


In [None]:
all_df.columns[select.get_support()]

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=100, max_depth=20, random_state=0)

# Train on the RFE-selected columns: the prediction cell feeds the classifier
# select.transform(X_test), so a model fitted on the full X_train would see a
# different number of features at predict time.
clf.fit(X_train_selected, y_train)



In [None]:
# Passenger ids of the test rows, read from the data itself rather than being
# reconstructed as index + 1 (which only holds for one specific row order).
idx = log_bin_one_hot_df["PassengerId"][number_of_train_dataset:].astype(int).tolist()

# Predict on the same RFE-selected columns used elsewhere for the test matrix.
y_pre = clf.predict(select.transform(X_test))

submission_columns = ["PassengerId","Survived"]
submission_df = pd.DataFrame({
    "PassengerId": idx,
    # Cast predictions to plain integers for the submission file.
    "Survived": np.asarray(y_pre).astype(int),
})[submission_columns]
submission_df.to_csv("submission.csv", index=False)