In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Notebook-wide pandas display settings: no truncation of columns or rows,
# and a wide console so frames are not wrapped.
for option_name, option_value in (
    ('display.max_columns', None),  # None = show every column
    ('display.max_rows', None),     # None = show every row
    ('display.width', 1000),        # display width in characters
):
    pd.set_option(option_name, option_value)

import pandas_profiling 

# Load the two Titanic splits. `dataset` holds references to both frames so
# later cells can apply the same in-place transformation to each of them.
train = pd.read_csv('../input/titanic/train.csv')
test = pd.read_csv('../input/titanic/test.csv')
dataset = [train, test]

# Summary of the splits: row counts first, then missing values per column.
print('Entries in training set: ', len(train), '\nEntries in testing set: ',len(test))

for frame in dataset:
    missing_per_column = frame.isna().sum()
    print(missing_per_column)

# Row-wise stack of train and test, kept around for exploratory analysis.
train_test_comb = pd.concat((train, test), axis=0)

## Feature Engineering Section

### Fill missing values, encode sex as a number, and derive family size

In [None]:
# Show column dtypes and non-null counts of the test split.
test.info()

In [None]:
# Clean both splits in place (the frames in `dataset` are the same objects as
# `train` and `test`, so later cells see the changes).
for df in dataset:
    # Impute missing Age/Fare with the column mean of the same split.
    # The result is assigned back to the frame: calling
    # `df[col].fillna(..., inplace=True)` operates on a temporary selection,
    # which warns on pandas 2.x and silently leaves the NaNs in place once
    # Copy-on-Write is active.
    for col in ('Age', 'Fare'):
        df[col] = df[col].fillna(df[col].mean())
    # Encode sex numerically: female -> 0, male -> 1.
    # NOTE: `map` turns any value other than these two into NaN.
    df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})
    # Family size = siblings/spouses + parents/children travelling along.
    df['FamSize'] = df['SibSp'] + df['Parch']
    # Drop columns not used as features; SibSp/Parch are now folded into FamSize.
    df.drop(['Cabin', 'Ticket', 'SibSp', 'Parch'], axis=1, inplace=True)

In [None]:
# Preview the first rows of the training frame.
train.head()

### We assume that Cabin, PassengerId and Ticket number have no causal effect on survival

### Identify the title of each passenger

In [None]:
# A title keeps its own category only if it occurs more than this many times
# within the split being processed; everything else becomes 'Misc'.
MIN_TITLE_COUNT = 5

for df in dataset:
    # Take the text between the first ', ' and the following '.' of Name
    # (assumes names are formatted "Surname, Title. Given names" -- TODO confirm).
    df['Title'] = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    print(list(df['Title'].unique()))
    # Replace titles with MIN_TITLE_COUNT or fewer occurrences by 'Misc'.
    # NOTE: counts are computed per split, so train and test can end up with
    # different sets of retained titles.
    title_counts = df['Title'].value_counts()
    is_common = df['Title'].map(title_counts) > MIN_TITLE_COUNT
    df['Title'] = df['Title'].where(is_common, 'Misc')
    df.drop(['Name'], axis=1, inplace=True)
print(train['Title'].value_counts())


In [None]:
# Preview the training frame, which now carries 'Title' instead of 'Name'.
train.head()

In [None]:
# Pairwise correlations between the numeric columns only. `train` also holds
# the string column 'Title' at this point, and recent pandas versions raise
# when `corr()` is called on a frame containing non-numeric columns.
train.select_dtypes(include='number').corr()

In [None]:
# Discretise Fare and Age into four ordinal bins labelled 1..4.
# The bin edges are the quartiles of the TRAINING split and are applied to
# both splits, so a given label means the same value range in train and test.
# (Binning each split with its own quartiles would shift the thresholds.)
for source_col, binned_col in (('Fare', 'Fare_cat'), ('Age', 'Age_cat')):
    _, bin_edges = pd.qcut(train[source_col], q=4, retbins=True)
    # Open the outer edges so values outside the training range still get a bin.
    bin_edges[0], bin_edges[-1] = -np.inf, np.inf
    for df in dataset:
        df[binned_col] = pd.cut(df[source_col], bins=bin_edges, labels=(1,2,3,4))

# The raw columns are replaced by their binned versions.
for df in dataset:
    df.drop(['Fare','Age'],axis=1,inplace=True)

In [None]:
# Preview the training frame with Fare_cat/Age_cat in place of Fare/Age.
train.head()

In [None]:
# Distribution of the training rows over the age bins: absolute counts plus
# the share of each bin formatted as a percentage string with one decimal.
age_bins = train["Age_cat"]
counts = age_bins.value_counts()
age_bin_share = age_bins.value_counts(normalize=True) * 100
percent100 = age_bin_share.round(1).astype(str) + '%'
acc_day = pd.DataFrame({'counts': counts, 'Percent': percent100})
print(acc_day)

Hypothesis: the port of embarkation is not relevant to survival, so the `Embarked` column is dropped below.

In [None]:
# Preview the training frame again; 'Embarked' is still present at this point.
train.head()

In [None]:
# Show column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Remove the port-of-embarkation column from both splits (in place).
for frame in (train, test):
    frame.drop(columns=['Embarked'], inplace=True)

In [None]:
def _family_size_label(relatives):
    """Map a relatives count to 'Alone' (0), 'Small' (1-4), 'Medium' (5-6) or 'Large' (anything else)."""
    if relatives == 0:
        return 'Alone'
    if 0 < relatives < 5:
        return 'Small'
    if 5 <= relatives < 7:
        return 'Medium'
    return 'Large'


for df in dataset:
    # The bin columns are categoricals with labels 1..4; store them as plain int32.
    for binned_col in ('Age_cat', 'Fare_cat'):
        df[binned_col] = df[binned_col].astype(np.int32)
    # Replace the numeric family size by its coarse label.
    df['FamSize'] = df['FamSize'].apply(_family_size_label)

In [None]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

features = ['Age_cat', 'Fare_cat', 'Pclass', 'Sex', 'Title', 'FamSize']
encoded_features = []

# Fit one encoder per feature on the training split only and reuse it for the
# test split. This guarantees that both splits get the same columns in the
# same order; categories that appear only in test encode as an all-zero row.
encoders = {
    feature: OneHotEncoder(handle_unknown='ignore').fit(train[feature].values.reshape(-1, 1))
    for feature in features
}

for df in dataset:
    for feature in features:
        encoder = encoders[feature]
        encoded = encoder.transform(df[feature].values.reshape(-1, 1)).toarray()
        # Name the columns from the encoder's own category order. The encoder
        # emits columns in sorted category order, so naming them from
        # `df[feature].unique()` (order of first appearance) attaches the wrong
        # label to the data.
        cols = [f'{feature}_{category}' for category in encoder.categories_[0]]
        encoded_df = pd.DataFrame(encoded, columns=cols, index=df.index)
        encoded_features.append(encoded_df)

# `encoded_features` holds the train frames first, then the test frames.
n_features = len(features)
train_one = pd.concat([train, *encoded_features[:n_features]], axis=1)
test_one = pd.concat([test, *encoded_features[n_features:]], axis=1)

dataset = [train_one, test_one]

In [None]:
# These columns now exist in one-hot form, so remove the originals from both
# frames in `dataset` (in place).
for frame in dataset:
    frame.drop(columns=['Pclass', 'Sex', 'FamSize', 'Title'], inplace=True)

In [None]:
from sklearn.model_selection import train_test_split

# Columns that must not be fed to the model: the target itself and the row
# identifier, which carries no information about a passenger.
NON_FEATURE_COLUMNS = ('Survived', 'PassengerId')
features = [col for col in train_one.columns if col not in NON_FEATURE_COLUMNS]

x = train_one[features].to_numpy()
y = train_one['Survived'].to_numpy()

# Hold out the last 15% of the rows for validation. With shuffle=False the
# split is positional and `random_state` has no effect on it.
x_train, x_val, y_train, y_val = train_test_split(x, y, train_size = int(0.85*len(train_one)), shuffle=False ,random_state=1912)

print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Shallow random forest; the fixed random_state makes the fit repeatable.
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto' alias
# was deprecated and later removed from scikit-learn, where it now raises.
clf = RandomForestClassifier(criterion='gini',
                        n_estimators=300,
                        max_depth=4,
                        min_samples_split=4,
                        min_samples_leaf=7,
                        max_features='sqrt',
                        oob_score=True,
                        random_state=1400,
                        n_jobs=-1)

clf.fit(x_train, y_train)

# Evaluate on the held-out validation rows.
y_pred = clf.predict(x_val)

cm = confusion_matrix(y_val, y_pred)
print(cm)
print(classification_report(y_val, y_pred))

In [None]:
# The model expects every column listed in `features`. Any of them that the
# test frame lacks (for example a one-hot column of a category that never
# occurs in test) is added as an all-zero column, instead of hard-coding the
# names of the missing columns.
newcols = np.zeros(len(test_one))
for col in features:
    if col not in test_one.columns:
        test_one[col] = newcols
indexes = pd.DataFrame(test_one.index.values)
indexes.head()

In [None]:
# Predict on the test split, using the same feature columns and column order
# as the training matrix.
test_feature_frame = test_one[features]
test_data = test_feature_frame.to_numpy()

prediction_clf = clf.predict(test_data)
n_predictions = len(prediction_clf)
print(n_predictions)

# Write the submission file: one row per test passenger, id plus prediction.
submission_columns = {'PassengerId': test['PassengerId'], 'Survived': prediction_clf}
output = pd.DataFrame(submission_columns)
output.to_csv('/kaggle/working/my_submission.csv', index=False)