In [1]:
import re
import numpy as np
import pandas as pd

First, let's load original datasets

In [2]:
# Please download original Devex_train.csv and Devex_submission_format.csv from Zindi Website

df_train = pd.read_csv('Devex_train.csv', low_memory=False, encoding='latin1')
df_submission = pd.read_csv('Devex_submission_format.csv', low_memory=False, encoding='latin1')

In [3]:
df_train.head()

Unnamed: 0,Unique ID,Type,Text,Label 1,Label 2,Label 3,Label 4,Label 5,Label 6,Label 7,Label 8,Label 9,Label 10,Label 11,Label 12
0,12555,Grant,Centers of Biomedical Research Excellence (COB...,3.b.2 - Total net official development assista...,3.c.1 - Health worker density and distribution,,,,,,,,,,
1,14108,Grant,Research on Regenerative Medicine <h2><strong>...,3.b.2 - Total net official development assista...,,,,,,,,,,,
2,23168,Organization,Catholic Health Association of India (CHAI): <...,3.d.1 - International Health Regulations (IHR)...,3.8.1 - Coverage of essential health services ...,3.8.2 - Proportion of population with large ho...,3.b.3 - Proportion of health facilities that h...,,,,,,,,
3,219512,Contract,Quality Improvement Initiatives for Diabetes,3.4.1 - Mortality rate attributed to cardiovas...,,,,,,,,,,,
4,274093,Tender,Provision of Thalassemia Drugs and Disposables...,3.3.5 - Number of people requiring interventio...,3.4.1 - Mortality rate attributed to cardiovas...,,,,,,,,,,


Let's prepare the structure for a new, clean version of the train set

In [4]:
df_train.fillna(0, inplace=True)
df_train_clean = df_train.drop(columns=df_train.columns[3:15])

In [5]:
labels = df_submission.columns[1:]
df_train_clean = pd.concat([pd.DataFrame(columns=labels),df_train_clean])
df_train_clean.fillna(0, inplace=True)

Rearranging an order of columns

In [6]:
unique_id_col = df_train_clean.pop('Unique ID')
type_col = df_train_clean.pop('Type')
text_col = df_train_clean.pop('Text')

df_train_clean.insert(0, 'Unique ID', unique_id_col)
df_train_clean.insert(1, 'Type', type_col)
df_train_clean.insert(2, 'Text', text_col)

In [7]:
df_train_clean.head()

Unnamed: 0,Unique ID,Type,Text,3.1.1,3.1.2,3.2.1,3.2.2,3.3.1,3.3.2,3.3.3,...,3.8.2,3.9.1,3.9.2,3.9.3,3.a.1,3.b.1,3.b.2,3.b.3,3.c.1,3.d.1
0,12555.0,Grant,Centers of Biomedical Research Excellence (COB...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,14108.0,Grant,Research on Regenerative Medicine <h2><strong>...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,23168.0,Organization,Catholic Health Association of India (CHAI): <...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,219512.0,Contract,Quality Improvement Initiatives for Diabetes,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,274093.0,Tender,Provision of Thalassemia Drugs and Disposables...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Removing HTML mark-ups

In [8]:
cleanr = re.compile('<.*?>')

def remove_html(raw_html):
    cleantext = re.sub(cleanr, '', raw_html)
    cleantext = re.sub('&nbsp;', ' ', cleantext)
    cleantext = re.sub('&bull;', ' ', cleantext)
    return cleantext

In [9]:
for i in range(len(df_train_clean)):
    df_train_clean.at[i,'Text'] = remove_html(df_train_clean.at[i,'Text'])

In [10]:
df_train_clean = df_train_clean.replace({r'\x0D': ' '}, regex=True) #removing carriage returns

# Extracting labels

In [11]:
for i in range(len(df_train)):
    for j in range(3,15):
        if df_train.iloc[i,j]!=0:
            label = df_train.iloc[i,j][0:5] #first 5 characters of the string is a label  (e.g. 3.8.1)
            df_train_clean.at[i,label] = 1            

In [12]:
df_train_clean.head()

Unnamed: 0,Unique ID,Type,Text,3.1.1,3.1.2,3.2.1,3.2.2,3.3.1,3.3.2,3.3.3,...,3.8.2,3.9.1,3.9.2,3.9.3,3.a.1,3.b.1,3.b.2,3.b.3,3.c.1,3.d.1
0,12555.0,Grant,Centers of Biomedical Research Excellence (COB...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,14108.0,Grant,Research on Regenerative Medicine Introduction...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,23168.0,Organization,Catholic Health Association of India (CHAI): T...,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
3,219512.0,Contract,Quality Improvement Initiatives for Diabetes,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,274093.0,Tender,Provision of Thalassemia Drugs and Disposables...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Saving to a CSV file

In [13]:
df_train_clean.to_csv('train_clean.csv', index=False, encoding='utf-8')