In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

import os, re


import pandas as pd  # dataframes
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier


np.random.seed(234)

In [2]:
# Load the three input tables from the current working directory.
# The raw training file is left here for reference; the notebook works from
# train_clean.csv instead (presumably a pre-cleaned copy of it -- TODO confirm).
#train = pd.read_csv("Devex_train.csv", encoding = "latin-1")
train = pd.read_csv("train_clean.csv")
# The test questions are decoded as ISO-8859-1 (Latin-1).
test = pd.read_csv("Devex_test_questions.csv",encoding = "ISO-8859-1")
# Submission format file; its column layout is displayed in the next cell.
sub = pd.read_csv("Devex_submission_format.csv")

In [3]:
# Preview the submission format.
# NOTE(review): the id column here is named `ID`, whereas the submission frame
# built later in this notebook uses `Unique ID` -- confirm which header is expected.
sub.head()

Unnamed: 0,ID,3.1.1,3.1.2,3.2.1,3.2.2,3.3.1,3.3.2,3.3.3,3.3.4,3.3.5,...,3.8.2,3.9.1,3.9.2,3.9.3,3.a.1,3.b.1,3.b.2,3.b.3,3.c.1,3.d.1
0,11437,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,11474,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,11475,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
3,11476,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
4,11486,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [4]:
# Preview the test questions; at this point `Text` still contains HTML markup
# (e.g. `<p>` in the rows shown below).
test.head()

Unnamed: 0,Unique ID,Type,Text
0,49848,Organization,4th Sector Health: <p>4th Sector Health is a U...
1,52348,Organization,Action for Global Health: <p>Action for Global...
2,103541,Organization,Scottish Association for Mental Health (SAMH):...
3,52382,Organization,Singapore Immunology Network: <p>The Singapore...
4,47212,Organization,Coastal Conservation and Education Foundation ...


In [5]:
# Preview the training rows: `Unique ID`, `Type`, `Text`, followed by one column
# per label (0/1 in the rows shown below).
train.head()

Unnamed: 0,Unique ID,Type,Text,3.1.1,3.1.2,3.2.1,3.2.2,3.3.1,3.3.2,3.3.3,...,3.8.2,3.9.1,3.9.2,3.9.3,3.a.1,3.b.1,3.b.2,3.b.3,3.c.1,3.d.1
0,12555.0,Grant,Centers of Biomedical Research Excellence (COB...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,14108.0,Grant,Research on Regenerative Medicine Introduction...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,23168.0,Organization,Catholic Health Association of India (CHAI): T...,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
3,219512.0,Contract,Quality Improvement Initiatives for Diabetes,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,274093.0,Tender,Provision of Thalassemia Drugs and Disposables...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Number of training rows.
len(train)

2995

In [7]:
# Matches one HTML tag, non-greedily. '.' does not match a newline here, so a
# tag that is split across lines is left in place.
cleanr = re.compile('<.*?>')

def remove_html(raw_html):
    """Strip HTML tags and a few entity markers from a piece of text.

    Parameters
    ----------
    raw_html : str or missing value
        Text to clean. ``None`` and float NaN (what pandas produces for an
        empty CSV cell) are treated as an empty document instead of raising
        ``TypeError``. Any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The text with every ``<...>`` tag, every ``&nbsp;`` and ``&bull;``
        entity, and every remaining ``&`` character removed. Nothing is
        inserted in their place, so words separated only by a tag are joined,
        and other entities keep their name (``&amp;`` becomes ``amp;``).
        Behaviour for string input is deliberately unchanged.
    """
    # Missing cell: None, or NaN (the only float that is not equal to itself).
    if raw_html is None or (isinstance(raw_html, float) and raw_html != raw_html):
        return ''
    if not isinstance(raw_html, str):
        raw_html = str(raw_html)

    cleantext = cleanr.sub('', raw_html)
    # Order matters: the named entities must go before the bare '&' is dropped.
    for token in ('&nbsp;', '&bull;', '&'):
        cleantext = cleantext.replace(token, '')
    return cleantext

In [8]:
# Clean every test document. Series.map applies the function element-wise and
# keeps the existing index, so this does not depend on `test` having the
# default 0..n-1 index the way a positional `.at[i, 'Text']` loop does.
test['Text'] = test['Text'].map(remove_html)

In [9]:
# Replace every carriage return (regex \x0D) found in the frame's string
# values with a single space.
test = test.replace(to_replace=r'\x0D', value=' ', regex=True)

In [10]:
# Raw documents fed to the vectorizer: the `Text` column of each table.
X_train, X_test = train['Text'], test['Text']

In [11]:
# Word-level TF-IDF features.
word = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),        # unigrams, bigrams and trigrams
    stop_words='english',
    strip_accents='unicode',
    min_df=5,                  # drop terms seen in fewer than 5 documents
    max_df=0.8,                # drop terms seen in more than 80% of documents
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    # Must be a real bool: recent scikit-learn validates parameter types when
    # fitting and rejects the integer 1 that was passed here before.
    sublinear_tf=True,         # use 1 + log(tf) instead of the raw term count
)

In [12]:
# Learn the vocabulary and IDF weights from the training text and vectorize it.
train_1 = word.fit_transform(X_train)

In [13]:
# Vectorize the test text with the vocabulary fitted on the training text
# (transform only, nothing is re-fitted on the test set).
test_1 = word.transform(X_test)

In [14]:
# Label frame: every column from the fourth onwards (the first three are
# `Unique ID`, `Type` and `Text` in the preview above).
label_columns = train.columns[3:]
labels = train.loc[:, label_columns]

In [15]:
# Names of the label columns as a plain Python list.
lablist = list(train.columns[3:])

In [16]:
# Targets (one column per label) and the matching ids, each kept as a DataFrame.
target = train.loc[:, lablist]
tr_ids = train.loc[:, ['Unique ID']]

In [17]:
folds = 5

# One binary decision tree is trained per label column of `target`.
label_names = list(target.columns)

# Validation AUCs: `scores` is the flat list of every fold score and
# `scores_classes[j][i]` is the score of label j on fold i.
scores = []
scores_classes = np.zeros((len(label_names), folds))

# Averaged test-set probabilities, one column per label.
submission = pd.DataFrame.from_dict({'Unique ID': test['Unique ID']})

kfold = KFold(n_splits=folds, shuffle=True, random_state=1001)

# Train each label at a time.
for j, label in enumerate(label_names):

    classifier = DecisionTreeClassifier()
    y = target[label].values

    # Out-of-fold probabilities for the training rows of the current label.
    lr_avpred = np.zeros(train_1.shape[0])
    # Running sum of the test-set probabilities over the folds.
    lr_pred = np.zeros(test_1.shape[0])

    for i, (train_index, val_index) in enumerate(kfold.split(train_1)):
        # KFold returns positional indices, so both the feature matrix and the
        # label array are indexed by position. Fold-local names are used so the
        # notebook-level X_train (the raw text) is not overwritten.
        X_tr, X_val = train_1[train_index], train_1[val_index]
        y_tr, y_val = y[train_index], y[val_index]

        classifier.fit(X_tr, y_tr)

        # Probability of the positive class on the held-out rows and on the test set.
        scores_val = classifier.predict_proba(X_val)[:, 1]
        lr_avpred[val_index] = scores_val
        lr_pred += classifier.predict_proba(test_1)[:, 1]

        # ROC AUC on the held-out fold.
        scores_classes[j][i] = roc_auc_score(y_val, scores_val)
        scores.append(scores_classes[j][i])

        print('\n Fold %02d class %s AUC: %.6f' % ((i+1), label, scores_classes[j][i]))

    print('\n Average class %s AUC:\t%.6f' % (label, np.mean(scores_classes[j])))

    # Average of the per-fold test probabilities for this label.
    submission[label] = lr_pred / folds

# Round the averaged probabilities to integer labels once, after all labels
# are done. Note that this rebinds `sub`, which held the submission format.
sub = np.around(submission).astype('int')

print('\n Overall AUC:\t%.6f' % (np.mean(scores)))

# Hard 0/1 predictions and the underlying probabilities, as separate files.
sub.to_csv('submissiondt.csv', index=False)
submission.to_csv('submissiondtp.csv', index=False)


 Fold 01 class 3.1.1 AUC: 0.809794

 Fold 02 class 3.1.1 AUC: 0.768659

 Fold 03 class 3.1.1 AUC: 0.798850

 Fold 04 class 3.1.1 AUC: 0.748929

 Fold 05 class 3.1.1 AUC: 0.795326

 Average class 3.1.1 AUC:	0.784312

 Fold 01 class 3.1.2 AUC: 0.564978

 Fold 02 class 3.1.2 AUC: 0.584857

 Fold 03 class 3.1.2 AUC: 0.570290

 Fold 04 class 3.1.2 AUC: 0.644243

 Fold 05 class 3.1.2 AUC: 0.649559

 Average class 3.1.2 AUC:	0.602785

 Fold 01 class 3.2.1 AUC: 0.803803

 Fold 02 class 3.2.1 AUC: 0.772008

 Fold 03 class 3.2.1 AUC: 0.794399

 Fold 04 class 3.2.1 AUC: 0.744990

 Fold 05 class 3.2.1 AUC: 0.776656

 Average class 3.2.1 AUC:	0.778371

 Fold 01 class 3.2.2 AUC: 0.804325

 Fold 02 class 3.2.2 AUC: 0.802935

 Fold 03 class 3.2.2 AUC: 0.758570

 Fold 04 class 3.2.2 AUC: 0.779513

 Fold 05 class 3.2.2 AUC: 0.786589

 Average class 3.2.2 AUC:	0.786386

 Fold 01 class 3.3.1 AUC: 0.877997

 Fold 02 class 3.3.1 AUC: 0.884887

 Fold 03 class 3.3.1 AUC: 0.942466

 Fold 04 class 3.3.1 AUC: 0