### Open train and test data set

In [13]:
#%pip install pandas
#%pip install scikit-learn
#%pip install xgboost

In [28]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
import pandas as pd

We start from the baseline code.

In [2]:
def transform_data(train_data_fname, test_data_fname):
    """Build per-visit feature matrices for the train and test sets.

    Both CSV files are read, tagged with an ``is_train_set`` marker and
    concatenated so that one-hot encoding produces the same columns for both
    sets. The rows are then collapsed to one row per
    ``(VisitNumber, Weekday)`` by summing every remaining column, and
    ``Weekday`` itself is one-hot encoded afterwards.

    Parameters
    ----------
    train_data_fname
        Anything ``pd.read_csv`` accepts (path, URL or buffer) for the
        training data. Must contain a ``TripType`` column.
    test_data_fname
        Anything ``pd.read_csv`` accepts for the test data.

    Returns
    -------
    X : pandas.DataFrame
        Training features, one row per visit, indexed 0..n-1.
    y : pandas.Series
        ``TripType`` of each training visit, in the same row order and with
        the same index as ``X``.
    XX : pandas.DataFrame
        Test features with the same columns as ``X``, indexed 0..m-1.
    yy : None
        Placeholder for the (unknown) test labels.
    """
    visit_keys = ["VisitNumber", "Weekday"]

    df_train = pd.read_csv(train_data_fname)
    df_train["is_train_set"] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test["is_train_set"] = 0

    # One label per visit. TripType is selected *before* aggregating so that
    # max() is not also evaluated on unrelated columns such as the string
    # DepartmentDescription, which is wasted work and can fail on missing values.
    y = df_train.groupby(visit_keys, as_index=False)["TripType"].max().TripType

    # Remove the label and stack train on top of test so that one-hot
    # encoding yields identical columns for both sets.
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test], ignore_index=True)

    # Upc and FinelineNumber are not dropped; they are summed per visit
    # together with every other column below.
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)

    # Collapse the item-level rows into one row per visit.
    df = df.groupby(visit_keys, as_index=False).sum()

    # Weekday is a grouping key, so it is encoded only after the group-by.
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # After the sum, is_train_set holds the number of training rows in the
    # visit: non-zero means train, zero means test.
    train_rows = df[df.is_train_set != 0]
    test_rows = df[df.is_train_set == 0]

    # Reset the row labels so X shares y's 0..n-1 index; otherwise any
    # index-aligned operation between X and y would silently misalign.
    X = train_rows.drop(["is_train_set"], axis=1).reset_index(drop=True)
    XX = test_rows.drop(["is_train_set"], axis=1).reset_index(drop=True)
    yy = None

    return X, y, XX, yy

In [3]:
# Base URL of the raw data directory on GitHub; the CSV file names are
# appended to it when the data is loaded.
raw = "https://raw.githubusercontent.com/DiploDatos/AprendizajeSupervisado/master/practico/data/"

In [4]:
# Load both CSVs and build the per-visit matrices: X / y are the training
# features and TripType labels, XX the test features, and yy a None
# placeholder for the test labels.
X, y, XX, yy = transform_data(raw+'train.csv', raw+'test.csv')

In [6]:
# Wrap the training features and labels in an XGBoost DMatrix.
# NOTE(review): `dmatrix` is not referenced by any later cell shown here; the
# model is fit through XGBClassifier on X_train / y_train instead.
dmatrix = xgb.DMatrix(data = X, label = y)

In [7]:
# Keep 80% of the training visits for fitting and hold out the rest for
# validation; the fixed seed makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=42,
)

In [8]:
# Shapes of the four pieces produced by the split, in the order
# (X_train, X_valid, y_train, y_valid).
tuple(part.shape for part in (X_train, X_valid, y_train, y_valid))

((53623, 81), (13406, 81), (53623,), (13406,))

In [9]:
# Number of distinct TripType classes in the full label vector and in each
# split, to check that no class is missing from either side.
y.nunique(), y_train.nunique(), y_valid.nunique()

(38, 38, 38)

In [10]:
# Multi-class gradient-boosted trees that output per-class probabilities.
# NOTE(review): per the XGBoost parameter docs, sampling_method="gradient_based"
# is only supported by the GPU hist tree method and only matters when
# subsample < 1. The fitted-model repr in this notebook reports
# tree_method='exact' and subsample=1, so this setting likely has no effect
# here -- confirm.
model_xgb = xgb.XGBClassifier(
    objective="multi:softprob",
    random_state=0,
    max_depth=8,
    learning_rate=0.1,
    sampling_method="gradient_based",
)

In [11]:
# Train on the 80% split; the cell's displayed value is whatever fit() returns.
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, sampling_method='gradient_based',
              scale_pos_weight=None, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [14]:
# Predicted TripType for every visit in the held-out validation split.
y_valid_pred = model_xgb.predict(X_valid)

In [29]:
# Fraction of validation visits whose predicted TripType matches the true one.
accuracy_score(y_true=y_valid, y_pred=y_valid_pred)

0.7195285692973296


And finally, we predict the unknown label for the testing set

In [30]:
# The model was fit on X's columns, so the test matrix must expose the same
# columns in the same order before predicting. Comparing shapes alone would
# not catch a renamed or reordered column, so check the names explicitly.
assert list(X.columns) == list(XX.columns), "train/test feature columns differ"
X.shape, XX.shape

((67029, 81), (28645, 81))

In [31]:
# Predict the label of every test-set visit; this replaces the None
# placeholder that transform_data returned for yy.
yy = model_xgb.predict(XX)

The last step is to generate the file to be submitted to Kaggle.

In [32]:
# Two-column table pairing each test visit with its predicted TripType,
# built column by column.
submission = pd.DataFrame(
    {
        "VisitNumber": list(XX.VisitNumber),
        "TripType": list(yy),
    }
)


In [33]:
# Write the predictions to CSV with a header row and without the DataFrame index.
submission.to_csv("submission_xgb_gradsamp.csv", header=True, index=False)
