## Import required libraries

In [1]:
import os
from urllib import quote_plus
from warnings import simplefilter

from numpy import unique
from pandas import read_sql_table
from sklearn.metrics import classification_report
from sqlalchemy import create_engine

from processing.post import PostProcessor
from processing.pre import PreProcessor

## Ignore warnings

In [2]:
simplefilter("ignore")

## Select language and category level (one or two)

In [3]:
# Menu of supported language codes, keyed 1..3.
lang_list = dict(enumerate(["en", "bn", "banglish"], 1))

In [4]:
# Pick the language to model: key 1 is "en"; use 2 for "bn" or 3 for "banglish".
lang = lang_list[1]

In [5]:
# Category level: 1 selects the "tags_1" column, any other value selects
# "tags_2" (see the if/else in the next cell).
level = 2

In [6]:
# Name of the tag column for the chosen level; anything other than 1 falls
# back to the second-level column.
tags = "tags_1" if level == 1 else "tags_2"

## Load questions from database

In [8]:
# Connection settings come from the environment so that no credentials are
# stored in (or leaked through) the notebook. A missing variable raises
# KeyError immediately instead of producing a half-built URL.
db_username = os.environ["DB_USERNAME"]
db_password = os.environ["DB_PASSWORD"]
db_hostname = os.environ["DB_HOSTNAME"]
db_database = os.environ["DB_DATABASE"]

# quote_plus escapes characters such as "@", ":" or "/" in the user name and
# password that would otherwise be parsed as URL delimiters.
engine = create_engine(
    "mysql+pymysql://%s:%s@%s/%s?charset=utf8"
    % (quote_plus(db_username), quote_plus(db_password), db_hostname, db_database),
    encoding="utf-8"
)

In [10]:
# Read the whole questions table into a DataFrame.
# NOTE: <table> is a redaction placeholder, not valid Python; replace it with
# the table name (a string) before running this cell.
# Both the connection and the transaction started by con.begin() are managed
# by the with statement, so they are released when the block exits.
with engine.connect() as con, con.begin():
    questions = read_sql_table(<table>, con)

## Select answered questions, relevant columns and drop incomplete rows

In [11]:
# Keep only rows whose status is "answered". Bracket indexing is used for the
# column lookup because attribute access can be shadowed by DataFrame
# attributes or methods of the same name.
questions = questions[questions["status"] == "answered"]

In [12]:
# Keep only rows written in the selected language. Bracket indexing is used
# for the column lookup because attribute access can be shadowed by DataFrame
# attributes or methods of the same name.
questions = questions[questions["lang"] == lang]

In [13]:
# Narrow the frame to the id, the question text and the tag column chosen for
# the selected level (`tags` holds "tags_1" or "tags_2").
questions = questions[["question_id", "body", tags]]

In [14]:
# Discard every row that has a missing value in any of the kept columns.
questions = questions.dropna(axis=0, how="any")

In [15]:
# Preview the first five remaining rows.
questions.head(n=5)

Unnamed: 0,question_id,body,tags_2
0,240,hi i am 27 my height is 5 2 bt my weight is on...,Basic Health
10,429,what is the normal weight lenght of 27weeks ag...,Basic Health
11,427,hello apu i have a question how much solid foo...,Basic Health
17,212,dear maya apa how much vegetable protein vitam...,Basic Health
18,261,apa i am a 18years old boy but my height is on...,Basic Health


## Keep a list of all possible categories

In [17]:
# Sorted array of the distinct category names present in the tag column.
unique_tags = unique(questions[tags].values)

In [18]:
# Report how many categories exist and list them. A single parenthesised
# argument is used so the line prints the same text under the Python 2 print
# statement and is also valid as a Python 3 print() call.
print("%d Categories : %s" % (len(unique_tags), unique_tags))

45 Categories : [u'ASK' u'BLAST' u'BMC' u'BRAC IED' u'Basic Health' u'Basic sex education '
 u'Beauty and Care' u'Cardiology' u'Career' u'Child/Forced Marriage'
 u'Communicable Diseases ' u'Contraception and Family Planning'
 u'Cybercrime' u'Dermatology' u'ENT' u'Elopement' u'Endocrinology'
 u'Family Law' u'Fitness' u'Gastroenterology' u'Gender Violence'
 u'Geriatric' u'Marie Stopes' u"Men's Health" u'Mental Health' u'Neurology'
 u'Oncology' u'Ophthalmology' u'Orthopedics ' u'Others' u'Parenting'
 u'Pediatrics/Child Care' u'Property Law' u'Relationships' u'Respiratory'
 u'STIs/STDs' u'Sajida Foundation' u'Sexuality' u'Technical Query'
 u'Teen Health' u'Urology/Nephrology' u'User Query'
 u'Womens Health - Labour and Post Pregnancy' u'Womens Health - Pregnancy'
 u'Womens Health and Physiology']


## Process questions for model fitting

In [19]:
# Build the project's PreProcessor (processing.pre) for the selected language
# code; it is used below for cleaning, tag processing and feature extraction.
pre = PreProcessor(lang)

In [20]:
# Replace the raw question text with the output of pre.clean. assign() builds
# a new frame instead of writing a column into one that was produced by
# filtering/dropna, which is the pattern behind pandas' SettingWithCopy
# warning (hidden in this notebook by the global "ignore" filter).
questions = questions.assign(body=pre.clean(questions["body"]))

In [21]:
# Replace the frame with the result of pre.process_tag. In the recorded output
# of the next cell the level-specific tag column is gone and a list-valued
# "tags" column appears instead.
# NOTE(review): presumably this merges the categories of duplicate questions
# into one multi-label row — confirm in processing.pre.
questions = pre.process_tag(questions)

In [22]:
# Preview the first five processed rows.
questions.head(n=5)

Unnamed: 0,question_id,body,tags
0,240,hi i am my height is bt my weight is only kg i...,[Basic Health]
1,429,what is the normal weight lenght of weeks aged...,"[Basic Health, Womens Health - Pregnancy]"
3,427,hello apu i have a question how much solid foo...,"[Basic Health, Pediatrics/Child Care]"
5,212,dear maya apa how much vegetable protein vitam...,"[Basic Health, Parenting, Pediatrics/Child Care]"
8,261,apa i am a years old boy but my height is only...,[Basic Health]


## Extract features

In [23]:
# Seed handed to pre.extract_features and post.model_fit in the cells below.
seed = 101

In [24]:
# Passed to pre.extract_features below.
# NOTE(review): presumably the fraction of rows held out for testing (20%) — confirm.
test_per = 0.2

In [25]:
# Model inputs: the cleaned question text and the per-question tag lists.
data = questions["body"]
labels = questions["tags"]

In [26]:
# Hand the text, the tag lists, the category vocabulary, test_per and seed to
# pre.extract_features; its four return values are unpacked as the train/test
# features and the train/test targets.
# NOTE(review): presumably this vectorises the text, encodes the labels over
# unique_tags and performs the train/test split — confirm in processing.pre.
train_data, test_data, train_target, test_target = pre.extract_features(data, labels, unique_tags, test_per, seed)

## Model fitting

In [27]:
# Build the project's PostProcessor (processing.post); it is used below for
# fitting, prediction and the accuracy score.
post = PostProcessor()

In [28]:
# Fit on the training split; the return value is kept in `model`.
# NOTE(review): seed is passed through, presumably to make fitting
# reproducible — confirm in processing.post.
model = post.model_fit(train_data, train_target, seed)

## Make predictions

In [29]:
# Predict labels for the held-out features.
# NOTE(review): the call goes through `post`, not `model`; presumably
# PostProcessor keeps the fitted estimator internally — confirm.
prediction = post.predict(test_data)

## Evaluate

In [31]:
# Show the accuracy rounded to two decimals. A single parenthesised argument
# is used so the line prints the same text ("Accuracy:", two spaces, value)
# under the Python 2 print statement and is also valid as a Python 3 print()
# call.
print("Accuracy:  %s" % round(post.accuracy(test_target, prediction), 2))

 Accuracy:  59.39


In [32]:
# Per-category precision/recall/F1, labelled with the category names.
# Parenthesised so the line works both as a Python 2 print statement and as a
# Python 3 print() call.
print(classification_report(test_target, prediction, target_names=unique_tags))

                                           precision    recall  f1-score   support

                                      ASK       0.00      0.00      0.00         1
                                    BLAST       0.00      0.00      0.00         0
                                      BMC       0.00      0.00      0.00         1
                                 BRAC IED       0.00      0.00      0.00         0
                             Basic Health       0.44      0.72      0.54       109
                     Basic sex education        0.60      0.76      0.67        96
                          Beauty and Care       0.65      0.87      0.75        39
                               Cardiology       0.00      0.00      0.00         1
                                   Career       0.80      0.40      0.53        10
                    Child/Forced Marriage       0.00      0.00      0.00         0
                   Communicable Diseases        0.00      0.00      0.00         2
   