In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from sklearn.metrics import confusion_matrix    # 生成混淆矩阵函数
import matplotlib.pyplot as plt 

In [2]:
# Directory prefix (with trailing slash) that is prepended to every CSV file name below.
data_dir = 'data/'

In [3]:
# Read the two base tables found under `data_dir`.
train = pd.read_csv(data_dir + "train.csv")
test = pd.read_csv(data_dir + "test.csv")

# The sorted distinct labels fix the label <-> integer-id correspondence.
category_list = np.sort(train["category"].unique())
num_mapping = dict(enumerate(category_list))  # id -> label
category_mapping = {name: code for code, name in num_mapping.items()}  # label -> id

# Keep the integer-encoded target next to the original label column.
train['category_num'] = train['category'].map(category_mapping)

# Column labels for the two 27-column prediction tables:
# "image_0" .. "image_26" and "xgb_category_0" .. "xgb_category_26".
column_names = ["image_%d" % k for k in range(27)]

xgb_column_names = ["xgb_category_%d" % k for k in range(27)]

def _row_min_max_scale(frame):
    """Rescale every row of ``frame`` to the [0, 1] range.

    Each value becomes ``(value - row_min) / (row_max - row_min)``, where the
    minimum and maximum are taken across the columns of that row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric frame to scale; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``frame``. A row whose
        values are all equal has a zero range, so its scaled values are NaN
        (0 / 0); this matches the original column-by-column arithmetic.
    """
    row_min = frame.min(axis=1)
    row_range = frame.max(axis=1) - row_min
    # axis=0 aligns the per-row Series with the frame's index, which is the
    # same alignment the per-column Series arithmetic performed.
    return frame.sub(row_min, axis=0).div(row_range, axis=0)


# Image-model predictions, relabelled to image_0 .. image_26.
# (An earlier pair of files, train_pred_image.csv / test_pred_image.csv,
# used to be loaded here and is currently disabled.)
train_image_df2 = pd.read_csv(data_dir + "image2_train_pred.csv")
train_image_df2.columns = column_names

test_image_df2 = pd.read_csv(data_dir + "image2_test_pred.csv")
test_image_df2.columns = column_names

# Row-wise min-max normalised copies; the *_df2 frames stay untouched.
train_image_df3 = _row_min_max_scale(train_image_df2)
test_image_df3 = _row_min_max_scale(test_image_df2)
    
# fastText predictions.
# NOTE(review): only the first 21628 rows of the test file are read --
# presumably the number of rows in test.csv; confirm.
train_fasttext_df = pd.read_csv(data_dir + "train_pred_fast_text.csv")
test_fasttext_df = pd.read_csv(data_dir + "test_pred_fast_text.csv", nrows=21628)

# First set of LSTM features; four attribute columns are dropped from the
# train side only. The feature concatenation in the next cell uses the
# "new_*" files below, not these two frames.
train_lstm_df = (
    pd.read_csv(data_dir + "train_lstm_X.csv")
    .drop(columns=["gender_", "baseColour_", "season_", "usage_"])
)
test_lstm_df = pd.read_csv(data_dir + "test_lstm_X.csv")

# Second ("new") set of LSTM features.
train_lstm_df1 = pd.read_csv(data_dir + "new_train_lstm_X.csv")
test_lstm_df1 = pd.read_csv(data_dir + "new_test_lstm_X.csv")

# category_xgb_* tables, relabelled to xgb_category_0 .. xgb_category_26.
category_xgb_train = pd.read_csv(data_dir + "category_xgb_train.csv")
category_xgb_test = pd.read_csv(data_dir + "category_xgb_test.csv")
category_xgb_train.columns = xgb_column_names
category_xgb_test.columns = xgb_column_names

In [4]:
# Concatenate the four feature frames column-wise (axis=1); pandas aligns the
# rows on the index.
train_total = pd.concat(
    [category_xgb_train, train_fasttext_df, train_lstm_df1, train_image_df3], axis=1
)
test_total = pd.concat(
    [category_xgb_test, test_fasttext_df, test_lstm_df1, test_image_df3], axis=1
)

# Positional hold-out split: the first N_TRAIN_ROWS rows are used for fitting,
# every remaining row for evaluation.
N_TRAIN_ROWS = 17000
xgb_train_X = train_total[:N_TRAIN_ROWS]
xgb_eval_X = train_total[N_TRAIN_ROWS:]

# Integer-encoded labels. `category_mapping` (label -> id) is built once in the
# data-loading cell from the sorted unique categories and is reused here rather
# than being recomputed.
train_total_Y = train['category'].map(category_mapping)
train_Y = train_total_Y[:N_TRAIN_ROWS]
eval_Y = train_total_Y[N_TRAIN_ROWS:]

xg_train = xgb.DMatrix(xgb_train_X, label=train_Y)
xg_eval = xgb.DMatrix(xgb_eval_X, label=eval_Y)

# XGBoost parameters shared by every training run in this notebook.
param = {
    # multi-class classification; predict() returns the predicted class
    'objective': 'multi:softmax',
    # learning rate (step-size shrinkage applied after each boosting round)
    'eta': 0.1,
    'max_depth': 3,
    'nthread': 4,
    # NOTE(review): matches the 27 generated prediction columns; confirm it
    # also equals len(category_list).
    'num_class': 27,
}

# Evaluation sets reported after each boosting round, and the round count, for
# an ad-hoc run such as
#     xgb.train(param, xg_train, num_round, evals=watchlist)
watchlist = [(xg_train, 'train'), (xg_eval, 'eval')]
num_round = 200

In [5]:
def xgb_train(X_train, Y_train, X_val, Y_val, params=None, num_round=200):
    """Fit an XGBoost booster and report its error on a validation split.

    Parameters
    ----------
    X_train, Y_train
        Features and labels used for fitting; passed to ``xgb.DMatrix``.
    X_val, Y_val
        Features and labels of the validation split. The split is reported
        as ``'eval'`` after every boosting round and scored at the end.
    params : dict, optional
        XGBoost training parameters. When omitted, the notebook-level
        ``param`` dict is used, which is what the function always did before
        this argument existed.
    num_round : int, default 200
        Number of boosting rounds.

    Returns
    -------
    tuple
        ``(bst, pred)``: the trained booster and its predictions for
        ``X_val``.

    Notes
    -----
    The printed error compares ``pred`` element-wise with ``Y_val``, so it is
    only meaningful when ``predict`` returns one class id per row (e.g. the
    ``multi:softmax`` objective).
    """
    if params is None:
        # Fall back to the shared notebook configuration.
        params = param

    dtrain = xgb.DMatrix(X_train, label=Y_train)
    dval = xgb.DMatrix(X_val, label=Y_val)

    # Keyword arguments: newer xgboost releases deprecate passing the
    # evaluation list positionally.
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=num_round,
        evals=[(dtrain, 'train'), (dval, 'eval')],
    )

    pred = bst.predict(dval)

    # Fraction of validation rows whose predicted class differs from the label.
    error_rate = float(np.mean(np.asarray(Y_val) != pred))
    print('predicting, classification error=%f' % error_rate)
    return bst, pred

In [7]:
# Final model: refit on every labelled row instead of only the training split.
xg_train = xgb.DMatrix(train_total, label=train_total_Y)

# NOTE(review): xg_eval is built from the tail rows of train_total, which are
# now part of xg_train as well, so the eval-merror printed during this run is
# not a held-out estimate.
num_round = 150
watchlist = [(xg_train, 'train'), (xg_eval, 'eval')]
bst_FINAL = xgb.train(param, xg_train, num_boost_round=num_round, evals=watchlist)

[0]	train-merror:0.026032	eval-merror:0.023557
[1]	train-merror:0.023628	eval-merror:0.022477
[2]	train-merror:0.022472	eval-merror:0.021396
[3]	train-merror:0.021917	eval-merror:0.020316
[4]	train-merror:0.021177	eval-merror:0.019235
[5]	train-merror:0.020715	eval-merror:0.018803
[6]	train-merror:0.020206	eval-merror:0.01837
[7]	train-merror:0.01979	eval-merror:0.01837
[8]	train-merror:0.019513	eval-merror:0.017938
[9]	train-merror:0.019235	eval-merror:0.01837
[10]	train-merror:0.018773	eval-merror:0.017506
[11]	train-merror:0.018357	eval-merror:0.017074
[12]	train-merror:0.018079	eval-merror:0.016209
[13]	train-merror:0.017894	eval-merror:0.016209
[14]	train-merror:0.017617	eval-merror:0.015345
[15]	train-merror:0.017524	eval-merror:0.015561
[16]	train-merror:0.017293	eval-merror:0.015345
[17]	train-merror:0.016785	eval-merror:0.014912
[18]	train-merror:0.016738	eval-merror:0.015129
[19]	train-merror:0.016553	eval-merror:0.014912
[20]	train-merror:0.016322	eval-merror:0.014912
[21]	t

In [8]:
# Score the concatenated test features with the final booster.
xg_test = xgb.DMatrix(test_total)
pred_xg_test = bst_FINAL.predict(xg_test)

# Store the predicted class ids on `test` and translate them back to labels.
# NOTE(review): predict() presumably returns float-valued ids; the lookup
# relies on them matching the integer keys of num_mapping -- confirm.
test["class_num"] = pred_xg_test
test["category"] = test["class_num"].map(num_mapping)

# Submission table: only the id and the predicted label, in that order.
submit_df = test.reindex(columns=["id", "category"])

In [9]:
# Write the submission file to the working directory, without the index column.
submit_df.to_csv(path_or_buf="submit.csv", index=False)