In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
train_data = pd.DataFrame(pd.read_csv('train.csv'))
test_data = pd.DataFrame(pd.read_csv('test.csv'))

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [25]:
c = test_data.columns.tolist().index('Name')

for i, j in test_data.head().iterrows():
    h = [k for k in j]
    h = h + [i]
    h.remove(c)
    print([k for k in j] + [i])

    


ValueError: list.remove(x): x not in list

In [None]:
train_data['Title'] = train_data.Name.str.extract(r',\s*([^\.]*)\s*\.', expand=False)
test_data['Title'] = test_data.Name.str.extract(r',\s*([^\.]*)\s*\.', expand=False)

In [None]:
def title_transform(x):
    if x == 'Mr':
        return x
    elif x in ['Mrs', 'Miss', 'Mme','Ms','Lady', 'Mlle', 'the Countess']:
        return 'Ms'
    elif x == 'Master':
        return x
    else:
        return 'Rare'

In [None]:
train_data['Title'] = train_data.Title.apply(title_transform)
test_data['Title'] = test_data.Title.apply(title_transform)

In [None]:
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch']
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch']

In [None]:
train_data['IsAlone'] = train_data['FamilySize'].eq(0).astype(int)
test_data['IsAlone'] = test_data['FamilySize'].eq(0).astype(int)

In [None]:
train_data['HasCabin'] = train_data.Cabin.notna().astype(int)
test_data['HasCabin'] = test_data.Cabin.notna().astype(int)

In [None]:
train_data.isnull().sum()

In [None]:
test_data.isnull().sum()

In [None]:
obj_features = ['Sex', 'Embarked', 'Title']
enc_features = ['Sex_enc', 'Embarked_enc', 'Title_enc']
train_data[obj_features].nunique()

In [None]:
imp_obj = SimpleImputer(strategy='most_frequent')
train_data[obj_features] = train_data[obj_features].fillna('missed')
test_data[obj_features] = test_data[obj_features].fillna('missed')

encoder = OrdinalEncoder()
train_data[enc_features] = pd.DataFrame([[np.nan, np.nan, np.nan]])
test_data[enc_features] = pd.DataFrame([[np.nan, np.nan, np.nan]])

train_data[enc_features] = encoder.fit_transform(train_data[obj_features])
test_data[enc_features] = encoder.transform(test_data[obj_features])

train_data[enc_features] = train_data[enc_features].astype(int)
test_data[enc_features] = test_data[enc_features].astype(int)

In [None]:
train_data.isnull().sum()

In [None]:
test_data.isnull().sum()

In [None]:
test_data['Fare'] = test_data.Fare.fillna(0)

In [None]:
test_age = test_data.Age.mean()
test_data['Age'] = train_data.Age.fillna(test_age)

In [None]:
train_data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
test_data.drop(['Name', "Ticket", 'Cabin'], axis=1, inplace=True)

In [None]:
train_data.info()

In [None]:
test_data.info()

In [None]:
OH_train = pd.get_dummies(train_data)
OH_test = pd.get_dummies(test_data)
OH_train.head()
OH_test.head()

In [None]:
features = [col for col in OH_train.columns]
del features[0:2]
# del features[9:11]
# del features[2:4]
# del features[3]

# ls = ['PassengerId', 'Survived', 'SibSp', 'Parch', 'FamilySize']
# for i in ls:
#     del features[features.index(i)]
features

In [None]:
X = OH_train[features].copy()
y = train_data.Survived.copy()

In [None]:
X.head()

In [None]:
y.head()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                    test_size=0.2, random_state=1)

In [None]:
train_age = x_train.Age.mean()
test_age = x_test.Age.mean()
x_train['Age'] = x_train.Age.fillna(train_age)
x_test['Age'] = x_test.Age.fillna(test_age)

In [None]:
from xgboost import XGBRegressor, XGBClassifier

In [None]:
def mae_xg(i, j):
    model = XGBClassifier(random_state=1, learning_rate=i, n_estimators=100000, n_jobs=4)
    model.fit(x_train, y_train,
             early_stopping_rounds=j, 
             eval_set=[(x_test, y_test)],
             verbose=False)
    return mean_absolute_error(model.predict(x_test), y_test)

In [None]:
mae_dict = {}
digit_dict = {}
i_range = [i/1000 for i in range(1, 101)]
for i in i_range:
    print(i)
    for j in range(1, 201):
        tmp = mae_xg(i, j)
        if i not in mae_dict:
            mae_dict[i] = tmp
            digit_dict[i] = j
        elif tmp < mae_dict[i]:
            mae_dict[i] = tmp
            digit_dict[i] = j

In [None]:
l1 = min(mae_dict, key=mae_dict.get)
l2 = digit_dict[l1]
l1, l2, mae_dict[l1]

In [None]:
model = XGBClassifier(random_state=1, learning_rate=l1, n_estimators=40000, n_jobs=4)
model.fit(x_train, y_train,
         early_stopping_rounds=l2, 
         eval_set=[(x_test, y_test)],
         verbose=False)
OH_test['Embarked_missed'] = 0
pred = model.predict(OH_test[features])

In [None]:
mean_absolute_error(model.predict(x_test), y_test)

In [None]:
from sklearn.pipeline import Pipeline
pipe_model = Pipeline(steps=[('preprocessor', SimpleImputer()),
                              ('model', XGBClassifier(random_state=1, learning_rate=l1,
                                                      n_estimators=40000, n_jobs=4)),])

In [None]:
from sklearn.model_selection import cross_val_score

# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(pipe_model, X, y,
                              cv=5,
#                               fit_params={'verbose': False},
                              scoring='neg_mean_absolute_error')

print("MAE score:\n", scores.mean())

In [None]:
output = pd.DataFrame(
    {'PassengerId': test_data.PassengerId,
     'Survived': pred}).to_csv('submission.csv', index=False)

In [None]:
!kaggle competitions submit -c titanic -f submission.csv -m "sklearn try"