In [1]:
from utils import stacking_zoo
import pickle
import pandas as pd
import numpy as np

In [2]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been read, instead of being left open until garbage
    collection.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Load the preprocessed train/test documents, the label matrix, and the
# already-predicted test rows that are appended to the submission at the end.
train = _load_pickle('../data/train_preprocessed.pkl')
test = _load_pickle('../data/test_preprocessed.pkl')
test_pre_pred = _load_pickle('../data/test_predicted.pkl')
labels = _load_pickle('../data/labels_preprocessed.pkl')

# TF-IDF feature matrices (10k columns) for train and test.
tfidf10k_train = _load_pickle('../data/tfidf_10k_train.pkl')
tfidf10k_test = _load_pickle('../data/tfidf_10k_test.pkl')

In [3]:
# Sanity check: print the shape of every loaded object on a single line.
shape_report = (
    ("train:", train),
    ("test:", test),
    ("test_pre_pred:", test_pre_pred),
    ("labels:", labels),
    ("tfidf-10k-train:", tfidf10k_train),
    ("tfidf-10k-test:", tfidf10k_test),
)
# Flatten to tag, shape, tag, shape, ... so print() joins them with spaces.
print(*(part for tag, obj in shape_report for part in (tag, obj.shape)))

train: (18660, 1) test: (18533, 1) test_pre_pred: (298, 29) labels: (18660, 29) tfidf-10k-train: (18660, 10000) tfidf-10k-test: (18533, 10000)


In [4]:
# Build the stacking model through the project helper in utils.stacking_zoo.
# NOTE(review): presumably an unfitted scikit-learn-style estimator (the notebook
# only calls .fit and .predict on it) — confirm against stacking_zoo.
model = stacking_zoo.build_StackingModelCV()

In [5]:
# Submission skeleton: the test frame's index without the raw text column,
# plus one zero-filled uint8 column per label.
sub = test.drop(columns=['doc_text'])
for label_name in labels.columns:
    sub[label_name] = np.uint8(0)

In [6]:
# Densify the TF-IDF matrices once, before the loop: the conversion does not
# depend on the label, so repeating it per label only wastes time.
# .toarray() returns a plain ndarray (rather than the np.matrix produced by
# .todense()), which current scikit-learn estimators require.
# Trade-off: both dense matrices now stay in memory for the whole loop.
# NOTE(review): assumes tfidf10k_* are scipy sparse matrices — confirm.
X_train = tfidf10k_train.toarray()
X_test = tfidf10k_test.toarray()

# Train one binary model per label column and store its test predictions
# in the matching column of the submission frame.
for label in labels.columns:
    print('Fitting label "%s" now...' % label)
    model.fit(X_train, labels[label].values)
    y = model.predict(X_test)
    sub[label] = y

Fitting label "information_and_communication_technologies" now...
Fitting label "governance" now...
Fitting label "urban_development" now...
Fitting label "law_and_development" now...
Fitting label "public_sector_development" now...
Fitting label "agriculture" now...
Fitting label "communities_and_human_settlements" now...
Fitting label "health_and_nutrition_and_population" now...
Fitting label "culture_and_development" now...
Fitting label "environment" now...
Fitting label "social_protections_and_labor" now...
Fitting label "industry" now...
Fitting label "macroeconomics_and_economic_growth" now...
Fitting label "international_economics_and_trade" now...
Fitting label "conflict_and_development" now...
Fitting label "finance_and_financial_sector_development" now...
Fitting label "science_and_technology_development" now...
Fitting label "rural_development" now...
Fitting label "poverty_reduction" now...
Fitting label "private_sector_development" now...
Fitting label "informatics" now..

In [7]:
# Per-column tally of test rows whose prediction reaches the 0.5 cut-off.
for column in sub.columns:
    hits = int((sub[column].values >= 0.5).sum())
    print('Total for "%s" is %s' % (column, hits))

Total for "information_and_communication_technologies" is 122
Total for "governance" is 566
Total for "urban_development" is 693
Total for "law_and_development" is 874
Total for "public_sector_development" is 1182
Total for "agriculture" is 449
Total for "communities_and_human_settlements" is 571
Total for "health_and_nutrition_and_population" is 2081
Total for "culture_and_development" is 26
Total for "environment" is 1630
Total for "social_protections_and_labor" is 967
Total for "industry" is 703
Total for "macroeconomics_and_economic_growth" is 5634
Total for "international_economics_and_trade" is 1541
Total for "conflict_and_development" is 314
Total for "finance_and_financial_sector_development" is 5958
Total for "science_and_technology_development" is 238
Total for "rural_development" is 439
Total for "poverty_reduction" is 1019
Total for "private_sector_development" is 3824
Total for "informatics" is 24
Total for "energy" is 947
Total for "social_development" is 395
Total for "w

In [8]:
# Count, for each test row, how many labels are predicted positive (>= 0.5),
# then show the rows where that count is zero, i.e. rows with NO prediction.
pred_counts = pd.DataFrame(
    {'cnt': (sub.values >= 0.5).sum(axis=1)},
    index=sub.index,
)
no_pred_mask = pred_counts['cnt'] == 0

# Index values of the rows without any positive label, kept for reuse.
msg_idx = pred_counts.index[no_pred_mask].values

# Last expression of the cell -> rendered as the cell output.
pred_counts[no_pred_mask]

Unnamed: 0_level_0,cnt
row_id,Unnamed: 1_level_1
8,0
24,0
27,0
35,0
62,0
82,0
84,0
107,0
118,0
129,0


In [9]:
# Output locations for the raw (un-thresholded) values and the final 0/1 file.
PROBA_CSV = '../submissions/proba/StackCV_xgb_lgbm_lr_etc_rfc.csv'
SUBMISSION_CSV = '../submissions/StackCV_xgb_lgbm_lr_etc_rfc.csv'

# Append the previously predicted test rows, order everything by row_id and
# keep the un-thresholded values on disk for later stacking.
# NOTE(review): the values in `sub` come from model.predict — confirm that
# this returns probabilities rather than hard class labels.
frames = [sub, test_pre_pred]
sub = pd.concat(frames).sort_values(by='row_id')
sub.to_csv(PROBA_CSV)

# Threshold at 0.5 into 0/1 flags and write the submission file.
sub[:] = (sub.values >= 0.5).astype(np.uint8)
sub.to_csv(SUBMISSION_CSV)