In [1]:
import pickle
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

import lightgbm as lgb
#from wordbatch.extractors import WordBag, WordHash

In [2]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been read.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): unpickling can execute arbitrary code; only load these
# artefacts when they come from a trusted source.
train = _load_pickle('../data/train_preprocessed.pkl')
test = _load_pickle('../data/test_preprocessed.pkl')
labels = _load_pickle('../data/labels_preprocessed.pkl')
test_pre_pred = _load_pickle('../data/test_predicted.pkl')

# Target columns: every column of `labels` except the `row_id` identifier.
cols_target = [col for col in labels.columns if col != 'row_id']

# Submission frame: the test rows without the raw text, plus one zero-filled
# uint8 column per target. Only the target columns are initialised, so a
# `row_id` column in `labels` (if there is one) can never overwrite the ids
# carried by `sub`.
sub = test.drop(columns=['doc_text'])
for _col in cols_target:
    sub[_col] = np.uint8(0)

# Hold-out split, currently disabled:
# train, val, labels, val_labels = train_test_split(train, labels, random_state=42,
#                                                   test_size=0.1, shuffle=True)

## Utility Functions

In [3]:
def GetMicroAccuracy(labels, predictions):
    """Print the micro-averaged F1, precision and recall of ``predictions``.

    Parameters
    ----------
    labels
        Ground-truth labels, forwarded as the first argument of the
        scikit-learn metric functions.
    predictions
        Predicted labels, forwarded as the second argument.

    Returns
    -------
    None
        The three scores are written to stdout on a single line.
    """
    _val_f1 = f1_score(labels, predictions, average='micro')
    _val_recall = recall_score(labels, predictions, average='micro')
    _val_precision = precision_score(labels, predictions, average='micro')
    # The tag names the averaging mode actually used above (average='micro').
    print("[Micro] F1 Score: %f,  Precision: %f,  Recall: %f" % (_val_f1, _val_precision, _val_recall))

## TF-IDF Data Prep

In [4]:
# Learn the TF-IDF vocabulary and weights from the training documents only,
# then encode the test documents with that same fitted vectorizer.
vect = TfidfVectorizer(
    ngram_range=(1, 3),    # unigrams, bigrams and trigrams
    stop_words='english',
    min_df=5e-4,           # a float min_df is a proportion of documents
    max_features=5000,     # cap on vocabulary size
)
x = vect.fit_transform(train['doc_text'].values)
test_x = vect.transform(test['doc_text'].values)

## <font color="red">LightGBM</font> Model

In [5]:
# Hyper-parameters shared by every per-label binary classifier.
params = {
    'boosting_type': 'gbdt',
    'random_state': 736283,
    'objective': 'binary',
    'metric': 'auc',
    'max_bin': 255,
    'num_leaves': 200,
    'learning_rate': 0.1,
    'tree_learner': 'feature',
    'n_estimators': 300,
    # `nthread` is an alias of the same thread-count setting, so the value is
    # given once, under the scikit-learn style name.
    'n_jobs': -1,
    'verbosity': -1,
    'reg_lambda': 0.001,
    'feature_fraction': 0.9,
}

model = lgb.LGBMClassifier(**params)

# Fit one binary model per target column and store its test-set predictions in
# the matching column of `sub`. Iterating over `cols_target` (rather than every
# column of `labels`) keeps a `row_id` column, if present, from being treated
# as a label.
for cls in cols_target:
    print('learning "%s"...' % cls)
    model.fit(x, labels[cls].values)
    # predict() returns class labels, so only the dtype cast is required.
    sub[cls] = model.predict(test_x).astype(np.uint8)
    

learning "information_and_communication_technologies"...
learning "governance"...
learning "urban_development"...
learning "law_and_development"...
learning "public_sector_development"...
learning "agriculture"...
learning "communities_and_human_settlements"...
learning "health_and_nutrition_and_population"...
learning "culture_and_development"...
learning "environment"...
learning "social_protections_and_labor"...
learning "industry"...
learning "macroeconomics_and_economic_growth"...
learning "international_economics_and_trade"...
learning "conflict_and_development"...
learning "finance_and_financial_sector_development"...
learning "science_and_technology_development"...
learning "rural_development"...
learning "poverty_reduction"...
learning "private_sector_development"...
learning "informatics"...
learning "energy"...
learning "social_development"...
learning "water_resources"...
learning "education"...
learning "transport"...
learning "water_supply_and_sanitation"...
learning "gen

In [6]:
# Stack the freshly predicted rows on top of the rows loaded from
# test_predicted.pkl, then order the combined frame by `row_id`.
# NOTE(review): `sub` is rebuilt from itself here, so executing this cell a
# second time appends `test_pre_pred` again; run it once per kernel session.
_frames = [sub, test_pre_pred]
sub = pd.concat(_frames).sort_values(by='row_id')

In [7]:
# Write the submission file. No `index` argument is passed, so pandas' default
# applies and the DataFrame index is written as the first CSV column.
sub.to_csv(path_or_buf='../submissions/lightgbm_model_5k.csv')