In [None]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import GradientBoostingRegressor


In [None]:
# Read the train and test CSV files of the Titanic input folder into
# two separate frames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Preview the first rows of the train frame.
train_df.head()

In [None]:
# Stack train on top of test with a fresh 0..n-1 index and discard the
# PassengerId column, so feature engineering runs once over all rows.
full_df = (
    pd.concat([train_df, test_df], ignore_index=True)
    .drop(columns=['PassengerId'])
)


In [None]:
# Display the combined train + test frame.
full_df

In [None]:
# Column dtypes and non-null counts of the combined frame.
full_df.info()

In [None]:
# Sub-frame holding only the columns whose missing values are examined below.
inspected_columns = ['Age', 'Fare', 'Cabin', 'Embarked']
miss_data = full_df[inspected_columns]

In [None]:
# Count how often each distinct (Age, Fare, Cabin, Embarked) combination occurs.
# NOTE(review): DataFrame.value_counts leaves out rows containing NaN by
# default, so this view presumably says nothing about the missing values
# themselves - confirm this is the intended check.
miss_data.value_counts()

In [None]:
# Summary statistics of the inspected columns.
miss_data.describe()

In [None]:
# Report, for each inspected column of miss_data, the number of missing
# values and their share of all rows (percent, rounded to two decimals).
total_rows = len(miss_data)
report_lines = []
for column in ('Age', 'Fare', 'Cabin', 'Embarked'):
    missing = miss_data[column].isna().sum()
    share = round(missing * 100 / total_rows, 2)
    # Every line carries the '%' unit, Fare included.
    report_lines.append(f'    Count of {column} missings: {missing} or {share} %')

# Same framing as a triple-quoted block: leading blank line, indented last line.
print('\n' + '\n'.join(report_lines) + '\n    ')


In [None]:
# Show the rows whose Fare is exactly zero.
full_df[full_df.Fare == 0]

In [None]:
# Summary statistics restricted to the zero-fare rows.
full_df[full_df.Fare == 0].describe()

In [None]:
# Fill the gaps in Embarked and Fare with each column's most frequent value.
# The result is assigned back to the column: calling fillna(inplace=True) on
# an attribute accessor (full_df.Embarked) acts on an intermediate object and
# does not update full_df when pandas Copy-on-Write is active.
full_df['Embarked'] = full_df['Embarked'].fillna(full_df['Embarked'].mode()[0])
full_df['Fare'] = full_df['Fare'].fillna(full_df['Fare'].mode()[0])

# Cabin and Ticket are not used further. errors='ignore' keeps the cell
# re-runnable once the columns are already gone.
full_df = full_df.drop(columns=['Cabin', 'Ticket'], errors='ignore')

In [None]:
# Distribution plot of the Fare column.
sns.displot(data=full_df, x='Fare')

In [None]:
# Bucket Fare into eight quantile-based bins labelled 1..8, store the bin
# number as an integer feature and plot its distribution split by Survived.
octile_labels = list(range(1, 9))
Fare_class = pd.qcut(full_df['Fare'], q=8, labels=octile_labels)
full_df['Fare_class'] = Fare_class.astype(int)
sns.displot(data=full_df, x='Fare_class', hue='Survived')

In [None]:
# Pairwise correlations between the numeric columns. The string columns
# (e.g. Name, Sex, Embarked) are excluded explicitly because corr() raises
# on non-numeric data in pandas >= 2.0 instead of silently dropping it.
full_df.select_dtypes(include='number').corr()

In [None]:
# Extract the honorific from each name: the run of letters directly in front
# of a full stop. A raw string avoids the invalid "\." escape warning, and
# expand=False returns a Series rather than a one-column DataFrame.
full_df['Title'] = full_df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Fold the rarer honorifics into broader groups. Titles that are not listed
# here are left unchanged.
title_groups = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}
# Assign the result back instead of replace(inplace=True) on the column,
# which does not reliably write through to full_df.
full_df['Title'] = full_df['Title'].replace(title_groups)


In [None]:
# Display the frame after the feature-engineering steps so far.
full_df

In [None]:
# Bar plot of Age for each Title group.
sns.catplot(data=full_df, y='Age', x='Title', kind='bar')

In [None]:
# Number of missing values per column.
full_df.isna().sum()

In [None]:
# Correlation of every numeric column (other than Age itself) with Age.
# Non-numeric columns are filtered out first; corrwith cannot correlate
# string columns and fails on them in recent pandas versions.
(
    full_df
    .select_dtypes(include='number')
    .drop(columns=['Age'])
    .corrwith(full_df['Age'])
)

In [None]:
# Summary statistics of the Age column.
full_df.Age.describe()

In [None]:
# Bar plot of Age against each candidate predictor, one figure per column.
# A single loop replaces four copies of the same call.
for grouping_column in ('Pclass', 'SibSp', 'Parch', 'Fare_class'):
    sns.catplot(data=full_df, y='Age', x=grouping_column, kind='bar')

In [None]:
# Integer-encode the categorical columns into a copy of full_df, leaving
# full_df itself untouched. The single encoder is refit for every column,
# so after the loop `le` only remembers the classes of the last one.
le = LabelEncoder()
categorical_features = ['Sex', 'Title', 'Embarked']
full_df_enc = full_df.copy()
for column_name in categorical_features:
    full_df_enc[column_name] = le.fit_transform(full_df[column_name])


In [None]:
# Display the label-encoded copy of the frame.
full_df_enc

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
# full_df_enc still contains the string column Name, which corr() rejects in
# pandas >= 2.0, so only numeric columns are passed on.
sns.heatmap(data=full_df_enc.select_dtypes(include='number').corr(), annot=True)


In [None]:
# Annotated correlation heatmap of the numeric columns of full_df_enc.
# The string column Name is excluded because corr() raises on non-numeric
# data in pandas >= 2.0.
# NOTE(review): this cell repeats the heatmap cell directly before it -
# confirm whether both are needed.
sns.heatmap(data=full_df_enc.select_dtypes(include='number').corr(), annot=True)


In [None]:
# Move every row whose encoded Title is 0 into encoded Title 4.
# NOTE(review): which titles the codes 0 and 4 stand for depends on the
# alphabetical class order the label encoder produced - presumably this
# merges a rare leftover title into a larger group; verify against the data.
title_code_zero = full_df_enc['Title'].eq(0)
full_df_enc.loc[title_code_zero, 'Title'] = 4


In [None]:
# Correlation of every numeric column of the encoded frame (other than Age)
# with Age. The string column Name is filtered out first because corrwith
# cannot correlate it and fails on it in recent pandas versions.
(
    full_df_enc
    .select_dtypes(include='number')
    .drop(columns=['Age'])
    .corrwith(full_df_enc['Age'])
)


In [None]:
# Training data for the age model: only rows with a known Age.
# Age is the target; Name and Survived are excluded from the features.
age_known = full_df_enc['Age'].notna()
rows_with_age = full_df_enc[age_known]
X = rows_with_age.drop(['Age', 'Name', 'Survived'], axis=1)
y = rows_with_age.Age

# Hold out 10% for evaluation, keeping the Title proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
    stratify=X['Title'],
)


In [None]:
# Gradient-boosted regressor used to predict Age from the other features.
regr_3 = GradientBoostingRegressor(
    loss='huber',
    learning_rate=0.018,
    n_estimators=1200,
    max_depth=4,
    random_state=0,
    warm_start=True,
    verbose=1,
)

# Fit each candidate behind a standard scaler and print its score on the
# training part and on the held-out part. `pipe` keeps the last fitted
# pipeline for the cells that follow.
regressors = [regr_3]
for regressor in regressors:
    pipe = Pipeline(steps=[('scaler', StandardScaler()), ('regr', regressor)])
    pipe.fit(X_train, y_train)

    train_score = pipe.score(X_train, y_train)
    held_out_score = pipe.score(X_test, y_test)

    print(regressor)
    print(train_score)
    print(f' test_score: {held_out_score}')
  

In [None]:
# Summary statistics of the residuals (predicted minus actual Age) on the
# held-out rows.
pd.Series(pipe.predict(X_test)-y_test).describe()


In [None]:
# Column dtypes and non-null counts of the encoded frame.
full_df_enc.info()

In [None]:
# Predict Age for the rows where it is missing, using the same feature
# columns the age pipeline was fitted on (Name, Survived and Age removed).
rows_without_age = full_df_enc[full_df_enc['Age'].isna()]
age_model_inputs = rows_without_age.drop(['Name', 'Survived', 'Age'], axis=1)
predicted_age = pipe.predict(age_model_inputs)


In [None]:
# Write the predicted ages, rounded to one decimal, into the rows whose Age
# is missing.
age_is_missing = full_df_enc['Age'].isna()
full_df_enc.loc[age_is_missing, 'Age'] = np.round(predicted_age, decimals=1)


In [None]:
# Histogram of Age in the encoded frame.
full_df_enc.Age.hist()


In [None]:
# Histogram of Age in full_df, for comparison with the encoded frame.
# full_df_enc was created as a copy, so the Age values written into it are
# not present here.
full_df.Age.hist()


In [None]:
# Data for the survival model: only rows that carry a Survived label.
# Name is excluded from the features; the label is cast to an int array.
has_label = full_df_enc.Survived.notna()
labelled_rows = full_df_enc.loc[has_label]
X_sur = labelled_rows.drop(['Survived', 'Name'], axis=1)
y_sur = np.array(labelled_rows['Survived'].astype(int))

# Hold out 40% for evaluation, stratified on the encoded Title.
X_train_sur, X_test_sur, y_train_sur, y_test_sur = train_test_split(
    X_sur,
    y_sur,
    test_size=0.4,
    random_state=0,
    stratify=X_sur['Title'],
)


In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, PassiveAggressiveClassifier
from sklearn.metrics import confusion_matrix
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, IsolationForest, RandomForestClassifier, RandomTreesEmbedding, StackingClassifier
from sklearn.ensemble import VotingClassifier
# from hpbandster_sklearn import HpBandSterSearchCV


In [None]:
# Gradient-boosted survival classifier with fixed hyperparameters.
# NOTE(review): the values look like the output of a hyperparameter search
# (e.g. the subsample fraction) - confirm their origin.
classifier_10 = GradientBoostingClassifier(
    loss='exponential',
    learning_rate=0.18,
    n_estimators=22,
    subsample=0.9736842105263157,
    criterion='friedman_mse',
    min_samples_split=20,
    min_samples_leaf=3,
    max_depth=5,
    max_leaf_nodes=30,
    random_state=0,
)


In [None]:
# Fit every candidate classifier on the training part and print its score
# on the training part and on the held-out part.
classifiers = [classifier_10]
for classifier in classifiers:
    classifier.fit(X_train_sur, y_train_sur)
    train_accuracy = classifier.score(X_train_sur, y_train_sur)
    test_accuracy = classifier.score(X_test_sur, y_test_sur)
    print(classifier)
    print(f'train score: {train_accuracy}')
    print(f'test score: {test_accuracy}')


In [None]:
# Confusion matrix of the classifier on the held-out rows
# (true labels first, predicted labels second).
confusion_matrix(y_test_sur, classifier_10.predict(X_test_sur))


In [None]:
# Predict survival for the rows that have no Survived label, using the same
# feature columns the classifier was fitted on (Name and Survived removed).
unlabelled_rows = full_df_enc.loc[full_df_enc.Survived.isna()]
survival_model_inputs = unlabelled_rows.drop(['Name', 'Survived'], axis=1)
predicted_survived = classifier_10.predict(survival_model_inputs)


In [None]:
# Display the predicted labels.
predicted_survived


In [None]:
# Attach the predictions to the test frame as integers.
# NOTE(review): assumes the rows without a Survived label in full_df_enc are
# exactly the test_df rows, in their original order - confirm.
test_df['Survived'] = predicted_survived.astype(int)


In [None]:
# Submission table: PassengerId as the index and Survived as the only
# column, so a plain to_csv() call writes "PassengerId,Survived" rows.
# This replaces an empty frame whose two-level index carried all the data.
submission = test_df[['PassengerId', 'Survived']].set_index('PassengerId')



In [None]:
# Write the submission frame, index included, to submission.csv.
submission.to_csv('submission.csv')
