In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [None]:
# Input CSV locations, given relative to the working directory.
train_file, test_file = "train.csv", "test.csv"

In [None]:
# Read both CSVs once; the *_org frames stay untouched and all later work
# happens on the copies.
df_train_org = pd.read_csv(train_file)
df_test_org = pd.read_csv(test_file)

df = df_train_org.copy()
df_test = df_test_org.copy()
print(df.head())

In [None]:
# Tag every row with its origin so the two sets can be separated again after
# they are concatenated. Assigning a scalar broadcasts it to every row whatever
# the frame's index is, so no helper list (and no length arithmetic) is needed.
df['train'] = 1
df_test['train'] = 0.0

In [None]:
# Use the PassengerId column as the row index of both frames.
df, df_test = (frame.set_index('PassengerId') for frame in (df, df_test))


In [None]:
# Stack train and test rows into one frame; ignore_index=True discards the
# existing index and renumbers the rows from 0.
frames = [df, df_test]
all_data = pd.concat(frames, ignore_index=True)
print(all_data)

In [None]:
# Lower-case every column label, then give three of them shorter names.
short_names = {"survived": "y", "embarked": "from", "pclass": "c"}
all_data = all_data.rename(columns=str.lower).rename(columns=short_names)

In [None]:
# Family size = number of rows (train + test) that share a family name.
# The 'fam_name' column is created by a cell further down; derive it here when
# it does not exist yet so that this cell also works on a fresh kernel run from
# top to bottom. The family name is the text before the first comma of 'name'.
if 'fam_name' not in all_data.columns:
    all_data['fam_name'] = all_data['name'].apply(
        lambda x: re.sub('[.;:?]', ",", x).strip().lower().split(",")[0]
    )

fam_sizes = all_data['fam_name'].value_counts()

# Look up each row's family name in the counts.
all_data['fam_size'] = all_data['fam_name'].map(fam_sizes)


In [None]:
# In the fare-based exploration, females have an extremely high probability of
# surviving, yet among the male survivors the determining factor is still
# unclear. Let's derive a family-name column from the name column so that
# passengers can be grouped by family.

# What matters in the name is its first part (before the first comma): the
# family name. The separators are written as a character class so that each of
# '.', ';', ':' and '?' is treated as a delimiter, matching the title
# extraction used elsewhere in this notebook.
all_data['fam_name'] = all_data['name'].apply(lambda x: re.sub('[.;:?]', ",", x).strip().lower().split(",")[0])

# Per family name: count of non-null tickets, one column per survival outcome.
fam_sizes = all_data.pivot_table(columns='y', index='fam_name', values='ticket', aggfunc='count')
# print(fam_sizes)

In [None]:
# Work on an independent copy of the training rows only.
is_train = all_data['train'] == 1
df = all_data[is_train].copy()

In [None]:
# Split the training columns into a numeric view and a categorical view.
X_num_names = ["y", "c", "age", "sibsp", "parch", "fare"]
X_cat_names = ["name", "sex", "ticket", "cabin", "from"]

df_num = df[X_num_names]
df_cat = df[X_cat_names]

In [None]:
# Summary statistics of the numeric columns (shown as the cell output).
df_num.describe()

In [None]:
# One histogram per numeric column, titled with the column name.
for col_name, col_values in df_num.items():
    plt.hist(col_values)
    plt.title(col_name)
    plt.show()

In [None]:
# Describe survivors and non-survivors separately. The 'train' flag is dropped
# first: df only holds training rows, so the column carries no information here.
labelled = df.drop(columns='train')

df_survive = labelled[df['y'] == 1]
print(df_survive.describe())
print("#" * 50)

df_dead = labelled[df['y'] == 0]
print(df_dead.describe())

In [None]:
# Fare of each survivor plotted against its row label.
survivor_points = df_survive.reset_index()
survivor_points.plot(kind='scatter', x='index', y='fare', title='fare variation for survivors')

In [None]:
# Fare of each non-survivor plotted against its row label.
dead_points = df_dead.reset_index()
dead_points.plot(kind='scatter', x='index', y='fare', title='fare variation for the dead')

In [None]:
# Passengers who paid a fare of at least 200: survivors first, then the rest.
ultra_rich_hidden = ['ticket', 'name', 'train', 'cabin', 'from', 'fam_name']
df_ultra_rich = df[df['fare'] >= 200].drop(columns=ultra_rich_hidden)

print(df_ultra_rich[df_ultra_rich['y'] != 0])
print("**" * 100)
print(df_ultra_rich[df_ultra_rich['y'] == 0])

## Observation: among the ultra rich, a female has quite a high probability of surviving,
## and the males who survive are the ones with the most expensive fares.

In [None]:
# Second fare band: passengers paying a fare in [100, 200).
rich_hidden = ['ticket', 'name', 'train', 'cabin', 'from', 'fam_name']
in_rich_band = (df['fare'] >= 100) & (df['fare'] < 200)
df_rich = df[in_rich_band].drop(columns=rich_hidden)

print(df_rich[df_rich['y'] == 1])
print("*" * 100)
print(df_rich[df_rich['y'] == 0])

In [None]:
# Third fare band: fares in [50, 100), plotted overall and then split by outcome.
band_3_hidden = ['ticket', 'name', 'train', 'cabin', 'from', 'fam_name']
in_band_3 = (df['fare'] >= 50) & (df['fare'] < 100)
df_3 = df[in_band_3].drop(columns=band_3_hidden)
df_3.reset_index().plot(kind='scatter', x='index', y='fare', title='third category fare')

df_3_sur = df_3[df_3['y'] == 1]
df_3_dead = df_3[df_3['y'] == 0]
df_3_sur.reset_index().plot(kind='scatter', x='index', y='fare', title = 'third category fare survivors')
df_3_dead.reset_index().plot(kind='scatter', x='index', y='fare', title = 'third category fare dead')

In [None]:
# Summary statistics of the third fare band, survivors first.
for band_part in (df_3_sur, df_3_dead):
    print(band_part.describe())

In [None]:
# Males of the third fare band, largest families first and survivors first
# within each family size.
males_3 = df_3[df_3['sex'] == 'male']
report_cols = ['y', 'fam_size', 'fare', 'age']
print(males_3[report_cols].sort_values(['fam_size', 'y'], ascending=[False, False]))

In [None]:
# Fourth fare band: fares in [40, 50).
band_4_hidden = ['ticket', 'name', 'train', 'cabin', 'from', 'fam_name']
in_band_4 = (df['fare'] < 50) & (df['fare'] >= 40)
df_4 = df[in_band_4].drop(columns=band_4_hidden)
print(df_4)

In [None]:
# Fifth fare band: fares below 40, with the class distribution per outcome.
band_5_hidden = ['ticket', 'name', 'train', 'cabin', 'from', 'fam_name']
df_5 = df[df['fare'] < 40].drop(columns=band_5_hidden)
df_5.reset_index().plot(kind='scatter', x='index', y='fare')

for outcome in (1, 0):
    print(df_5[df_5['y'] == outcome]['c'].value_counts())


In [None]:
# Fare statistics per class, computed separately for survivors and non-survivors.
fun_list = [pd.Series.count, np.sum, np.mean, np.std, np.max, np.min]
fare_stats = {"fare": fun_list}

df_survived_class = df_survive.groupby("c").agg(fare_stats)
df_dead_class = df_dead.groupby("c").agg(fare_stats)

print(df_survived_class)
print(df_dead_class)


In [None]:
# Pairwise correlation of the numeric columns (shown as the cell output).
df_num.corr()

In [None]:
# Count and mean of class, age and fare for each survival outcome.
outcome_summary = df.pivot_table(index='y', values=['c','age', 'fare'], aggfunc=[pd.Series.count, np.mean])
print(outcome_summary)

In [None]:
# Compare the survival outcome against each categorical variable.
print(df.pivot_table(index='y', columns=['c'], values='ticket', aggfunc='count'))
print(df.pivot_table(index='y', columns=['sex'], values='ticket', aggfunc='count'))
print(df.pivot_table(index='y', columns='from', values='ticket', aggfunc='count'))
# At first glance passengers embarking from "C" seem more likely to survive. Yet it is
# worth looking at the social class of the people coming from the different stations.

print(df.pivot_table(index='c', columns=['from'], values='name', aggfunc='count'))
# This last table does not provide enough evidence to completely rule out a positive
# correlation between the embarkation point and survival; more investigation is needed.


In [None]:

# Encode sex as an integer -- 0: male, 1: female. Any other value raises KeyError.
genre_mapper = {"male":0, "female":1}
all_data["sex"] = all_data["sex"].map(lambda label: genre_mapper[label])

# Encode the embarkation port; values missing from the mapper (e.g. NaN) pass through unchanged.
from_mapper = {"C":1, "Q":2, "S":3}
all_data["from"] = all_data["from"].map(lambda port: from_mapper[port] if port in from_mapper else port)
all_data['from'] = all_data['from'].astype(float)

print(all_data[["y", "from"]].corr())

In [None]:
# Refresh the training copy so it picks up the newly encoded columns.
train_rows = all_data['train'] == 1
df = all_data[train_rows].copy()

In [None]:
# Relation between the embarkation station and the passenger class.
station_stats = {"c": ['count', 'mean'], "fare": 'mean'}
from_class_ana = df.groupby("from").agg(station_stats)
print(from_class_ana)
# Author's reading: the embarkation station has little to no correlation with the social class.

print(df.pivot_table(index='from', columns='c', values='ticket', aggfunc='count'))


In [None]:
# # understanding the cabin

# print(df["cabin"].isna().sum())
# print(df["cabin"].copy().dropna().count())
# so only 204 passengers bought cabins

In [None]:
# # we assume that the cabins are indeed separated by spaces
# df['num_cabins'] = df.cabin.apply(lambda x: 0 if pd.isna(x) else len(x.split(" ")))
# print(df.num_cabins.value_counts())
# # let's check the relation between number of cabins and social class
# print(pd.pivot_table(df, index='num_cabins', columns='c', values='ticket', aggfunc='count'))

In [None]:
# Understanding the fare column.
fare_s = df['fare']
fare_df = df.loc[:, ["fare"]]
print (fare_df.describe())

# Number of missing fares among the training rows.
print(fare_s.isna().sum())

# Number of fares that are exactly zero.
print((fare_s == 0).sum())

In [None]:
# we can see that the survival probability is higher for upper social classes. However, It might be worth noting
# that the ticket's price: fare is as well influencial. In other words, a 3rd class passenger who paid more than a 1st passenger
# might be more likely to survive, let's consider this subtle detail: it might lead to a helpful feature

# df_no_fare = df[df['fare'] == 0]
# df_fare = df[df['fare'] != 0]
# fare_np = df_fare["fare"].values

# quantiles_values = [0, 0.25, 0.5, 0.75, 1]
# fare_quantiles = {}
# for i in range(1, 4):
#     fare_quantiles["fare_q_c" + str(i)] = np.quantile(df_fare[df_fare["c"] == i]["fare"].values, quantiles_values)

# for key, value in fare_quantiles.items():
#     print(str(key) + ": " + str(value)) 

# def quartile_number(value, quantiles):
#     # value assumed to be at least larger or equal then the lowest value
#     assert (value >= quantiles_values[0])
#     for i in range(len(quantiles) - 1):
#         if value >= quantiles[i] and value < quantiles[i + 1]:
#             return i + 1
#     return len(quantiles) - 1 

# def classify_passenger(row):
#     return quartile_number(row['fare'], fare_quantiles["fare_q_c" + str(int(row['c']))])    


# df_fare['quartile_class'] = df_fare.loc[:, ['fare', 'c']].apply(lambda row: classify_passenger(row), axis=1)
# print(df_fare.loc[:, ['fare', 'c', 'quartile_class']].head(15))



In [None]:

# fare_class_quartile_effect = pd.pivot_table(df_fare, index='y', columns=['c','quartile_class'], values='name', aggfunc='count')
# print(fare_class_quartile_effect)
# print("#" * 50)
# fare_class_quartile_effect.loc[2] = fare_class_quartile_effect.loc[1] / fare_class_quartile_effect.loc[0]
# print(fare_class_quartile_effect)

In [None]:
# The quartile-class results looked promising, so the fare column is cleaned on
# the full data set. Zero (and missing) fares will be imputed with the mean of
# the strictly positive fares of the passenger's class, computed here.
positive_fares = all_data[all_data['fare'] > 0]
fare_by_class_mean = positive_fares.groupby("c").agg({"fare": np.mean}).squeeze()
print(fare_by_class_mean)


def fill_up_fare(row):
    """Replace a zero or NaN fare with the mean fare of the row's class.

    `row` is a mapping-like record with 'fare' and 'c' entries. It is modified
    in place and also returned. Rows whose fare is neither zero nor NaN are
    returned untouched. The replacement comes from the module-level
    `fare_by_class_mean`, indexed by class.
    """
    fare = row['fare']
    if not (fare == 0 or np.isnan(fare)):
        return row
    row['fare'] = fare_by_class_mean[row['c']]
    return row

# Impute the fare row by row: the class mean seems like a reasonable choice.
all_data = all_data.apply(fill_up_fare, axis=1)

# Sanity checks: number of NaN fares left, and the sum of the fares that are still <= 0.
print(all_data['fare'].isna().sum())
print(all_data.loc[all_data['fare'] <= 0, 'fare'].sum())



In [None]:
# now the quartile class should be added to the add_data DF
# quantiles_values = [0, 0.25, 0.5, 0.75, 1]
# fare_quantiles = {}
# for i in range(1, 4):
#     fare_quantiles["fare_q_c" + str(i)] = np.quantile(all_data[all_data["c"] == i]["fare"].values, quantiles_values)

# all_data['quartile_class'] = all_data.loc[:, ['fare', 'c']].apply(lambda row: classify_passenger(row), axis=1)

In [None]:
# print(df.loc[:, ["y", "num_cabins"]].corr())

# There are 3 features that might reflect in a passenger's social image:
# * class * fare * num_cabins
# let's consider each individually

# print((pd.pivot_table(df, index='y', columns=[ 'num_cabins', 'c'], values='ticket', aggfunc=['count'])))

In [None]:
## the number of cabins is quite a helpful feature as well so it seems reasonable to add it to the all_data df
# all_data['num_cabins'] = all_data.cabin.apply(lambda x: 0 if pd.isna(x) else len(x.split(" ")))


In [None]:
# Extract the title from each name: treat ':', '?', ';' and '.' as commas,
# split on commas and keep the second piece, stripped of surrounding spaces.
df['title'] = [re.sub('[:?;.]', ",", full_name).split(",")[1].strip() for full_name in df['name']]

In [None]:
# Distinct titles found in the training rows, in sorted order.
title_counts = df['title'].value_counts()
print(title_counts.index.sort_values())

In [None]:
# Uncommon titles and how their holders split by survival outcome.
rare_titles = ['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major','Rev', 'Sir', 'the Countess']
non_comm_title = df[df['title'].isin(rare_titles)]
print(non_comm_title.pivot_table(index='y', columns='title', values='ticket', aggfunc='count'))

In [None]:
# comm_title = df[df['title'].isin(['Master','Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms'])]
# print(pd.pivot_table(comm_title, index='y', columns='title', values='ticket', aggfunc='count'))

In [None]:
# Before imputing the age: number of missing ages versus number of training rows.
print(df['age'].isna().sum())
print(df.shape[0])

In [None]:
# Known ages of the third-class training passengers: spread and range.
known_age_class_3 = df['age'].notna() & (df['c'] == 3)
class_3_age = df.loc[known_age_class_3, 'age'].values
print(np.nanstd(class_3_age))
print(class_3_age.max(), class_3_age.min())

In [None]:
# Number of missing ages among the rows that are not flagged as training data.
not_train = all_data['train'] != 1
df_test = all_data[not_train]
print(df_test['age'].isna().sum())

In [None]:
# Mean and median age per class; only the first aggregate column (the mean) is kept.
age_by_class = all_data.groupby('c').agg({"age":[np.mean, np.median]})
age_class = age_by_class.iloc[:, 0]
print(age_class)

# Number of rows with a missing age in each class.
missing_age_rows = all_data[all_data['age'].isna()]
print(missing_age_rows.groupby('c').agg({"name":'count'}))
# Only a few ages are missing in first and second class, so filling them with the
# class mean age should not be harmful. The third class has a large number of
# missing values, so a more careful imputation might be needed there.

As we can see, there are 4 main titles associated with passengers from the 3rd class.
According to the following [link](https://prowritingaid.com/art/968/mr%2c-mrs%2c-ms-and-miss%3a-everything-you-need-to-know-about-titles.aspx), the title is generally associated with 
an age category or a marital status (which in turn correlates with age). It might be a good idea to relate these two features.


In [None]:
# Extract the title from each name on the full data set: treat ':', '?', ';'
# and '.' as commas, split on commas and keep the second piece, stripped.
all_data['title'] = [re.sub('[:?;.]', ",", full_name).split(",")[1].strip() for full_name in all_data['name']]


In [None]:
# Third-class missing ages will be filled with the mean age of the passenger's title.
third_class = all_data[all_data['c'] == 3]
print(third_class['title'].value_counts())

# Mean age per title among third-class rows.
title_age_class_3 = third_class.groupby('title').agg({"age":np.nanmean}).iloc[:, 0]
print(title_age_class_3)

In [None]:
def fill_up_age_class_3(row):
    """Impute a missing age for a third-class passenger.

    `row` is a mapping-like record with 'age', 'title' and 'c' entries. It is
    modified in place and also returned.

    A row whose age is already known is returned untouched, without consulting
    the lookup tables. Otherwise the age becomes the rounded mean age of the
    passenger's title (module-level `title_age_class_3`). When that mean is NaN,
    or the title is absent from the table, the rounded mean age of the class
    (module-level `age_class`) is used instead. The original author notes that
    this fallback is needed for a single passenger titled Ms.
    """
    if not np.isnan(row['age']):
        return row

    # .get avoids a KeyError for a title that has no entry in the table.
    title_mean = np.round(title_age_class_3.get(row['title'], np.nan))
    row['age'] = np.round(age_class[row['c']]) if np.isnan(title_mean) else title_mean
    return row

def fill_up_age_class_1_2(row):
    """Impute a missing age with the rounded mean age of the row's class.

    `row` is a mapping-like record with 'age' and 'c' entries. It is modified
    in place and also returned. Rows with a known age are returned untouched.
    The class means come from the module-level `age_class`.
    """
    if not np.isnan(row['age']):
        return row
    row['age'] = np.round(age_class[row['c']])
    return row

def fill_up_age(row):
    """Dispatch the age imputation on the passenger class.

    Class 3 rows go through `fill_up_age_class_3`; every other row goes
    through `fill_up_age_class_1_2`. The chosen helper's result is returned.
    """
    filler = fill_up_age_class_3 if row['c'] == 3 else fill_up_age_class_1_2
    return filler(row)

# Run the age imputation on every row (axis=1 passes one row at a time).
all_data = all_data.apply(fill_up_age, axis=1)

In [None]:
# It is time to drop the unnecessary columns; list what is currently there first.
print(all_data.columns)

In [None]:
# Remove the columns that are not fed to the models.
unused_cols = ['name', 'ticket', 'cabin', 'title','sibsp', 'parch', 'fam_name']
all_data = all_data.drop(columns=unused_cols)

In [None]:
# Separate the rows again using the 'train' flag.
train_mask = all_data['train'] == 1
df_train = all_data[train_mask].copy()
df_test = all_data[~train_mask].copy()

print(df_train.columns, df_test.columns)

# Training rows that still lack an embarkation port or an age are discarded.
df_train = df_train.dropna(subset=['from', 'age'])

# Feature matrices and target vector; the flag and the target are not features.
non_features = ['train', 'y']
X_train = df_train.drop(columns=non_features).to_numpy()
y_train = df_train['y'].to_numpy()
X_test = df_test.drop(columns=non_features).to_numpy()

In [None]:
# time to scale the data
# we can use the sklearn class for this

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics (mean / standard deviation) from the training
# features only, then apply those same statistics to the test features.
# Re-fitting on the test set would scale the two sets differently, so the
# models would be evaluated on inputs that do not match their training space.
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# try all the models baseline models I currently know
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Shared cross-validation splitter: 6 shuffled folds with a fixed seed.
n_splits, random_state = 6, 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

# Baseline: logistic regression, mean score over the folds.
solver = 'liblinear'
lr = LogisticRegression(solver=solver)
cv_scores = cross_val_score(lr, X_train, y_train, cv=kf)
print(cv_scores.mean())

In [None]:
from sklearn.svm import SVC

# Baseline: SVC with its default settings, scored on the shared `kf` folds.
svm = SVC() # the non-linear SVM
cv_scores = cross_val_score(svm, X_train, y_train, cv=kf)
# Per-fold scores (not averaged).
print(cv_scores)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline: k-nearest neighbours with default settings, scored on the shared
# `kf` folds. The estimator under evaluation must be `knn` itself.
knn = KNeighborsClassifier()
cv_scores = cross_val_score(knn, X_train, y_train, cv=kf)
print(cv_scores)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline: decision tree with a fixed seed. It is scored on the same `kf`
# splitter as the other baselines so that the fold scores are comparable.
dt = DecisionTreeClassifier(random_state=2)
cv_scores = cross_val_score(dt, X_train, y_train, cv=kf)
print(cv_scores)

In [None]:
from sklearn.metrics import accuracy_score
def model_best_version(model, model_name):
    """Print the best hyper-parameters and best CV score of a fitted search.

    Parameters
    ----------
    model : object
        Fitted search object exposing ``best_params_`` and ``best_score_``
        (for example a ``GridSearchCV``). If it has a string ``scoring``
        attribute, that name is used to label the score.
    model_name : str
        Heading printed before the report.

    Returns
    -------
    None
    """
    # The searches in this notebook do not all optimise the same metric, so the
    # label is taken from the search object rather than hard-coded.
    metric = getattr(model, "scoring", None)
    metric_label = metric if isinstance(metric, str) else "estimator default"

    print(model_name)
    print("parameters \n" + str(model.best_params_))
    print("best CV score (" + metric_label + ") \n" + str(model.best_score_))



In [None]:
from sklearn.model_selection import GridSearchCV

# tune the parameters
# Regularisation strengths: 10**x for x on np.linspace's default grid over [-5, 0.1].
lamda = np.array([10 ** exponent for exponent in np.linspace(-5, 0.1)])

# Logistic regression search space; C is the inverse of the regularisation strength.
lr_params = dict(max_iter=[2000], penalty=['l2'], C=1 / lamda, solver=['liblinear'])

# 6-fold grid search on all cores, scored with F1.
lr_best = GridSearchCV(lr, param_grid=lr_params, cv=6, n_jobs=-1, scoring='f1')
lr_best.fit(X_train, y_train)

model_best_version(lr_best, "LogisticRegression")

In [None]:
# KNN search space: neighbourhood size, vote weighting, index structure and Minkowski power.
knn_params = dict(
    n_neighbors=range(5, 20),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree','kd_tree'],
    p=[1,2],
)

# 6-fold grid search on all cores with the estimator's default scoring.
knn_best = GridSearchCV(knn, param_grid=knn_params, cv=6, n_jobs=-1)
knn_best.fit(X_train, y_train)
model_best_version(knn_best, "KNN")

In [None]:
from sklearn.svm import SVC

# RBF-kernel search space (a polynomial-kernel grid was considered and left out):
#   {'kernel': ['poly'], 'degree' : [2,3,4,5], 'C': 1 / lamda}
rbf_grid = {'kernel': ['rbf'], 'gamma': [.1,.5,1,2,5,10], 'C': 1 / lamda}
svm_params = [rbf_grid]

# 6-fold grid search on all cores with the estimator's default scoring.
svc_best = GridSearchCV(SVC(), param_grid=svm_params, cv=6, n_jobs=-1)
svc_best.fit(X_train, y_train)
model_best_version(svc_best, "NON-linear SVM")

In [None]:
# Decision-tree search space: depth, minimum leaf fraction and features tried per split.
dt_params = dict(
    max_depth=[2, 3,4,5,6],
    min_samples_leaf=[0.02, 0.04, 0.05, 0.1, 0.12, 0.15],
    max_features=["log2", "sqrt", None],
)

dt_best = GridSearchCV(dt, param_grid=dt_params, cv=6, n_jobs=-1)
dt_best.fit(X_train, y_train)
model_best_version(dt_best, "DecisionTreeClassifier")

# Cross-validate the whole search on the shared folds (the grid search is re-run per fold).
nested_scores = cross_val_score(dt_best, X_train, y_train, cv=kf)
print(nested_scores.mean())


In [None]:
from sklearn.metrics import confusion_matrix

# Predictions of the tuned tree on the data it was trained on, and the resulting confusion matrix.
X_train_pred = dt_best.predict(X_train)
train_confusion = confusion_matrix(y_train, X_train_pred)
print(train_confusion)


In [None]:
# Absolute difference between prediction and label: 1 marks a misclassified training row.
abs_err = np.abs(X_train_pred - y_train)
y_diff = pd.DataFrame(abs_err, columns=['diff'])
y_diff = y_diff.loc[y_diff['diff'] == 1]

# Fetch the misclassified rows by position (df_train rows are in the same order as X_train).
train_mis = df_train.drop(columns=['train']).iloc[y_diff.index, :]

# Survivors that the tree predicted as non-survivors.
missed_survivors = train_mis[train_mis['y'] == 1]
print(missed_survivors.head(20))


In [None]:
# Gather test-set predictions from the tuned models.
lr_pred = lr_best.predict(X_test)
knn_pred = knn_best.predict(X_test)
dt_pred = dt_best.predict(X_test)

# Submission frames pair the original test PassengerId with the integer prediction.
# (The logistic-regression submission is currently not written.)
passenger_ids = df_test_org['PassengerId']
sub_knn = pd.DataFrame({"PassengerId": passenger_ids, "Survived": knn_pred}).astype(int)
sub_dt = pd.DataFrame({"PassengerId": passenger_ids, "Survived": dt_pred}).astype(int)

sub_knn.to_csv("sub_knn.csv", index=False)
sub_dt.to_csv("sub_dt.csv", index=False)


In [None]:
# let's consider more complicated models such as Random Forests model.
from sklearn.ensemble import RandomForestClassifier
# rf_basic = RandomForestClassifier()
from sklearn.model_selection import RandomizedSearchCV

# let's try to tune a RandomForest model
# Instantiate a forest with default settings just to list the tunable parameters.
rf = RandomForestClassifier()
print(rf.get_params())


In [None]:

# Random-forest search space. max_samples must be given as floats: a float is
# read as a fraction of the training set, whereas the integer 1 would mean
# "draw a single sample per tree". 1.0 is the intended "use every sample".
rf_params = {
    'max_depth': [4, 5, 6, 7, 8],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [0.02, 0.03, 0.04, 0.05, 0.08, 0.1],
    'max_samples': [0.8, 0.85, 0.9, 1.0],
}

rf_basic = RandomForestClassifier(random_state=8)
num_folds=6
# Exhaustive search over the grid, scored by accuracy on `num_folds` folds, using all cores.
rf_searcher = GridSearchCV(estimator=rf_basic,
                            param_grid=rf_params,
                            n_jobs=-1,
                            scoring='accuracy',
                            cv=num_folds)
rf_searcher.fit(X_train, y_train)


In [None]:

# Best forest found by the grid search, scored on its own training data.
rf_best = rf_searcher.best_estimator_
print(rf_best.score(X_train, y_train))

# New shared splitter: 5 shuffled folds with a fixed seed (replaces the earlier `kf`).
splits, random_state = 5, 3
kf = KFold(n_splits=splits, random_state=random_state, shuffle=True)
fold_scores = cross_val_score(rf_best, X_train, y_train, cv=kf)
print(fold_scores.mean())
# Author's reading: the estimator does not seem to overfit the data (not too badly, either way).
    

In [None]:
# Sweep the number of trees and record train accuracy and mean CV accuracy for each size.
train_accs, cv_accs = [], []
estimators = range(20, 251, 10)
for n_trees in estimators:
    rf_best.set_params(n_estimators=n_trees)
    rf_best.fit(X_train, y_train)
    train_accs.append(rf_best.score(X_train, y_train))
    cv_accs.append(cross_val_score(rf_best, X_train, y_train, cv=kf).mean())


In [None]:

# Train versus CV accuracy as a function of the number of trees.
fig, ax = plt.subplots()
for curve, line_fmt, curve_label in ((train_accs, '-b', 'train'), (cv_accs, '--r', 'CV')):
    ax.plot(estimators, curve, line_fmt, label=curve_label)
leg = ax.legend();

In [None]:
# The overall performance looks more promising for 20 to 50 trees, so the
# search focuses on that range.
# Goal: among the models whose train accuracy reaches the threshold, keep the
# one with the smallest absolute gap between train accuracy and CV score.
min_train_performance = 0.82

from copy import deepcopy

train_accs = []
cv_accs = []
estimators = range(20, 51)
best_performance = 1   # smallest absolute train/CV gap found so far
best_model = None      # fitted copy of the model achieving that gap
best_t_score = 0       # train accuracy of that model
performances = []      # best gap so far, recorded after each seed
t_scores = []          # matching train accuracy, recorded after each seed

for r in range(0, 100):
    rf_best.random_state = r
    for i in estimators:
        rf_best.n_estimators = i
        rf_best.fit(X_train, y_train)

        t_score = rf_best.score(X_train, y_train)
        cv_score = cross_val_score(rf_best, X_train, y_train, cv=kf).mean()
        # The gap is compared AND stored as an absolute value. Storing the
        # signed difference would, after one negative gap, make the comparison
        # below impossible to satisfy for every later candidate.
        gap = abs(t_score - cv_score)

        if t_score >= min_train_performance and gap <= best_performance:
            best_t_score = t_score
            best_performance = gap
            best_model = deepcopy(rf_best)
    performances.append(best_performance)
    t_scores.append(best_t_score)


fig, ax = plt.subplots()
ax.plot(range(0, 100), performances, '-b', label='per')
ax.plot(range(0, 100), t_scores, '--r', label='score')
leg = ax.legend();

# Fail here, with an explicit message, rather than later on a None model.
if best_model is None:
    raise RuntimeError(
        "no random forest reached the minimum train accuracy of "
        + str(min_train_performance)
    )



In [None]:
# Predict with the selected forest and write its submission file.
rf_pred = best_model.predict(X_test)

rf_submission = {"PassengerId": df_test_org['PassengerId'], "Survived": rf_pred}
sub_rf = pd.DataFrame(rf_submission).astype(int)
sub_rf.to_csv('sub_rf.csv', index=False)

In [None]:
import xgboost as xgb

# Gradient-boosted trees. A classifier is trained with the binary logistic
# objective, which is the objective XGBoost documents for two-class problems.
xgc = xgb.XGBClassifier(seed=123, objective="binary:logistic")

# Search space. The L2 regularisation list contains 0.5; it was previously
# written as `0,5`, which Python reads as the two separate values 0 and 5.
params = {'n_estimators': range(50, 100),
    'max_depth': [3, 4, 5],
    "eta": [0.001, 0.01, 0.05, 0.1, 0.2],
    "subsample": [0.8, 0.9, 1],
    "lambda": [0.01, 0.05, 0.1, 0.5, 0.8],
}

# 4-fold exhaustive search, scored with negative mean squared error of the predictions.
grid_mse = GridSearchCV(xgc, param_grid=params, scoring='neg_mean_squared_error', cv=4, verbose=1)

grid_mse.fit(X_train, y_train)




In [None]:
# Predict with the best XGBoost model found by the grid search. `best_model`
# is the random forest selected earlier, so using it here would only write the
# forest's predictions under the XGBoost file name.
xgb_pred = grid_mse.best_estimator_.predict(X_test)

# Submission frame: original test PassengerId paired with the integer prediction.
sub_xgb = pd.DataFrame({"PassengerId": df_test_org['PassengerId'],"Survived": xgb_pred}).astype(int)

sub_xgb.to_csv('sub_xgb.csv', index=False)

SVM: {'C': 6.866488450042998, 'gamma': 0.1, 'kernel': 'rbf'}  
RF: {{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 6, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': 0.9, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 0.02, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 68, 'n_jobs': None, 'oob_score': False, 'random_state': 6, 'verbose': 0, 'warm_start': False}}

XGBOOST: xgb
parameters 
{'eta': 0.001, 'lambda': 0.01, 'max_depth': 5, 'n_estimators': 90, 'subsample': 1}
best f1score 
0.8369389568941139
None

update: 

XGboost
parameters 
{'eta': 0.001, 'lambda': 0.01, 'max_depth': 3, 'n_estimators': 81, 'subsample': 0.8}
best f1score 
0.8335555286227931
