# In [None]:
# This script shows you how to make a submission using a few
# useful Python libraries.
# It gets a public leaderboard score of 0.76077.
# Maybe you can tweak it and do better...?

import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Read the training and test sets; row 0 of each CSV file is the header.
train_df, test_df = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)


# We'll impute missing values using the median for numeric columns and the most
# common value for string columns.
# This is based on some nice code by 'sveitser' at http://stackoverflow.com/a/25562948
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    """Impute missing values in a DataFrame, one column at a time.

    Object-dtype columns are filled with their most frequent value; every
    other column is filled with its median. The fill values are computed by
    ``fit`` and applied by ``transform``.
    """

    def fit(self, X, y=None):
        """Compute the per-column fill values from ``X``.

        Args:
            X: DataFrame from which the fill values are learned.
            y: Unused.

        Returns:
            ``self``, with the fill values stored in ``self.fill`` (a Series
            indexed by the columns of ``X``).
        """
        fill_values = []
        for column in X.columns:
            series = X[column]
            if series.dtype == np.dtype('O'):
                counts = series.value_counts()
                # value_counts() drops NaN, so a column with no observed
                # values gives an empty result. Leave such a column unfilled
                # rather than raising IndexError on index[0].
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(series.median())
        self.fill = pd.Series(fill_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values replaced.

        Args:
            X: DataFrame to impute; ``fit`` must have been called first.
            y: Unused.

        Returns:
            The result of ``X.fillna`` using the fill values from ``fit``.
        """
        return X.fillna(self.fill)

# Columns fed to the model, and the subset of them holding non-numeric values
# that must be integer-encoded before training.
feature_columns_to_use = ['AnimalType', 'SexuponOutcome']
nonnumeric_columns = ['AnimalType', 'SexuponOutcome']

# Join the features from train and test together before imputing missing values,
# in case their distribution is slightly different.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; like
# append, it keeps the original row order (train rows first, then test rows).
big_X = pd.concat([train_df[feature_columns_to_use],
                   test_df[feature_columns_to_use]])
big_X_imputed = DataFrameImputer().fit_transform(big_X)

# XGBoost doesn't (yet) handle categorical features automatically, so we need to change
# them to columns of integer values.
# See http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing for more
# details and options
le = LabelEncoder()
for feature in nonnumeric_columns:
    # fit_transform refits the encoder on every call, so one instance can be reused.
    big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])

# Prepare the inputs for the model.
# The first n_train rows of the combined frame are the training rows, the rest
# are the test rows. .iloc makes the split explicitly positional, and .values
# replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
n_train = train_df.shape[0]
train_X = big_X_imputed.iloc[:n_train].values
test_X = big_X_imputed.iloc[n_train:].values

# Encode the target labels as integers; after this line `le` holds the
# OutcomeType classes (le.classes_), not the feature categories.
train_df['OutcomeType'] = le.fit_transform(train_df['OutcomeType'])
train_y = train_df['OutcomeType']

# Booster parameters for xgb.train; see the XGBoost parameter documentation
# for the meaning of each entry.
# This example uses the current build of XGBoost, from https://github.com/dmlc/xgboost
param = {"objective": "multi:softprob",
         # NOTE(review): must equal the number of distinct encoded OutcomeType
         # labels in train_y -- confirm that this is 5 for the data in use.
         "num_class": 5,
         "eta": 0.05,
         "max_depth": 8,
         "min_child_weight": 8,
         # Multi-class log loss matches the multi:softprob objective. The
         # original "auc" is, as far as I recall, rejected for multi-class
         # objectives by older XGBoost releases -- confirm for the version in use.
         "eval_metric": "mlogloss",
         "silent": 1,
         "seed": 1,
         "subsample": 0.5,
         "colsample_bytree": 0.5,
         "alpha": 4}

print(train_X)
num_round = 100

# sklearn.cross_validation was removed in scikit-learn 0.20; its replacement
# sklearn.model_selection exists since 0.18. Fall back only for older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 1% of the training rows so the metric can be monitored on data the
# booster is not fitted on.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_X, train_y, test_size=0.01, random_state=10)
xgmat = xgb.DMatrix(X_train, label=y_train)
xgmat_valid = xgb.DMatrix(X_valid, label=y_valid)
xgmat_test = xgb.DMatrix(test_X)
watchlist = [(xgmat_valid, 'eval'), (xgmat, 'train')]

# NOTE(review): early_stopping_rounds=0 is kept from the original. How a value
# of 0 is interpreted appears to differ between XGBoost releases -- confirm
# whether early stopping is meant to be disabled here.
gbm = xgb.train(param, xgmat, num_round, watchlist, early_stopping_rounds=0)

# With multi:softprob the booster predicts per-class probabilities for each test row.
predictions = gbm.predict(xgmat_test)

predictions
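# # Kaggle needs the submission to have a certain format;
# # see https://www.kaggle.com/c/titanic-gettingStarted/download/gendermodel.csv
# # for an example of what it's supposed to look like.
# # NOTE: the snippet below uses the Titanic column names (PassengerId / Survived)
# # and expects a single prediction per row, whereas `predictions` comes from a
# # multi:softprob model and holds one probability per class. Adapt the column
# # names and the shape handling before re-enabling it.
# submission = pd.DataFrame({ 'PassengerId': test_df['PassengerId'],
#                             'Survived': np.round(predictions,0) })
# submission.to_csv("submission.csv", index=False)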

