## Import required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer

## Load Pre and Post Processor Classes

In [2]:
from preprocess import PreProcessor
from postprocess import PostProcessor

## Select language and category level (one or two)

In [3]:
# Numeric option -> language identifier. The chosen identifier is compared with the
# "lang" column of the questions table and passed to PreProcessor further down.
lang_list = {1: "en", 2: "bn", 3: "banglish"}

In [4]:
# Pick the language by its numeric option in lang_list (1 -> "en").
lang = lang_list[1]

In [5]:
# Category level: 1 selects the "tags_1" column, any other value selects "tags_2".
# The same value is later handed to PostProcessor.
level = 2

In [6]:
# Name of the label column that corresponds to the chosen category level.
tags = "tags_1" if level == 1 else "tags_2"

## Load "merged_table" from database and store it in a pandas dataframe

In [7]:
# NOTE(review): as written this hands the literal string "merged_table" to the
# DataFrame constructor; it does not read from any database, and pandas is expected
# to reject a bare string here. It looks like a placeholder for the real loader
# (e.g. pd.read_sql_table or pd.read_csv) - TODO replace before Restart & Run All.
# The following cells rely on the columns "status", "lang", "question_id", "body"
# and the selected tag column being present.
questions = pd.DataFrame("merged_table")

## Select answered questions, relevant columns and drop incomplete rows

In [8]:
# Keep only the rows whose "status" column equals "answered".
questions = questions[questions["status"] == "answered"]

In [9]:
# Keep only the rows whose "lang" column equals the selected language.
questions = questions[questions["lang"] == lang]

In [10]:
# Narrow the frame to the question id, the question body and the chosen tag column.
questions = questions[[
    "question_id",
    "body",
    tags
]]

In [11]:
# Discard every row that has a missing value in any of the remaining columns
# (axis=0 / how="any" are the dropna defaults, spelled out for the reader).
questions = questions.dropna(axis=0, how="any")

In [12]:
questions.head()

Unnamed: 0,question_id,body,tags_2
0,240,hi i am 27 my height is 5 2 bt my weight is on...,Basic Health
10,429,what is the normal weight lenght of 27weeks ag...,Basic Health
11,427,hello apu i have a question how much solid foo...,Basic Health
17,212,dear maya apa how much vegetable protein vitam...,Basic Health
18,261,apa i am a 18years old boy but my height is on...,Basic Health


## Keep a list of all possible categories

In [13]:
# Sorted, de-duplicated category names taken from the raw tag column.
# NOTE(review): this runs before pre.process_tag, and the printed list contains names
# with trailing spaces (e.g. 'Basic sex education ', 'Orthopedics '). These exact
# strings are later used as the MultiLabelBinarizer classes - confirm that
# process_tag leaves the tag text unchanged, otherwise those labels would not match.
unique_tags = np.unique(questions[tags])

In [14]:
# Report how many categories exist and list them.
# The message is pre-formatted into a single string so this line is valid, and prints
# the same text, both as a Python 2 print statement and as a Python 3 print() call
# (the original bare print statement is a SyntaxError on Python 3).
print("%d Categories : %s" % (len(unique_tags), unique_tags))

45 Categories : ['ASK' 'BLAST' 'BMC' 'BRAC IED' 'Basic Health' 'Basic sex education '
 'Beauty and Care' 'Cardiology' 'Career' 'Child/Forced Marriage'
 'Communicable Diseases ' 'Contraception and Family Planning' 'Cybercrime'
 'Dermatology' 'ENT' 'Elopement' 'Endocrinology' 'Family Law' 'Fitness'
 'Gastroenterology' 'Gender Violence' 'Geriatric' 'Marie Stopes'
 "Men's Health" 'Mental Health' 'Neurology' 'Oncology' 'Ophthalmology'
 'Orthopedics ' 'Others' 'Parenting' 'Pediatrics/Child Care' 'Property Law'
 'Relationships' 'Respiratory' 'STIs/STDs' 'Sajida Foundation' 'Sexuality'
 'Technical Query' 'Teen Health' 'Urology/Nephrology' 'User Query'
 'Womens Health - Labour and Post Pregnancy' 'Womens Health - Pregnancy'
 'Womens Health and Physiology']


## Process questions for model fitting

In [15]:
# Preprocessor built for the selected language; the cells below use it for text
# cleaning, tag processing, the stop-word list and tokenisation.
pre = PreProcessor(lang)

In [16]:
# Replace the raw question text with the output of PreProcessor.clean.
# NOTE(review): judging only by the displayed rows, cleaning removes digits among
# other things - see preprocess.py for the actual rules.
questions["body"] = pre.clean(questions["body"]) 

In [17]:
# Rebuild the label column via PreProcessor.process_tag.
# NOTE(review): inferred from the displayed output only - the result exposes a "tags"
# column holding a list of categories per question in place of the single-valued
# tags_N column; confirm against preprocess.py.
questions = pre.process_tag(questions)

In [18]:
questions.head()

Unnamed: 0,question_id,body,tags
0,240,hi i am my height is bt my weight is only kg i...,[Basic Health]
1,429,what is the normal weight lenght of weeks aged...,"[Basic Health, Womens Health - Pregnancy]"
3,427,hello apu i have a question how much solid foo...,"[Basic Health, Pediatrics/Child Care]"
5,212,dear maya apa how much vegetable protein vitam...,"[Basic Health, Parenting, Pediatrics/Child Care]"
8,261,apa i am a years old boy but my height is only...,[Basic Health]


## Splitting data into "train" and "test" sets

In [19]:
# Model inputs (the cleaned question text) ...
data = questions["body"]
# ... and targets (the "tags" column produced by process_tag).
labels = questions["tags"]

In [20]:
# Hold out 20% of the questions for evaluation; the fixed random_state keeps the
# split identical from run to run.
train_data, test_data, train_target, test_target = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=101
)

## Converting the labels into a binary indicator matrix

In [21]:
# Binarizer with the class list fixed up front to unique_tags, so the label matrix
# has one column per known category (45 in the saved run) in a fixed order,
# independent of which categories happen to occur in a given split.
mlb = MultiLabelBinarizer(classes = unique_tags)

In [22]:
# Encode the training labels as an indicator matrix: one row per question,
# one column per category.
train_mlb = mlb.fit_transform(train_target)

In [23]:
# Encode the test labels with the same, already-fitted binarizer so that the
# columns line up with the training matrix.
test_mlb = mlb.transform(test_target)

In [24]:
# Sanity check: show the (rows, categories) shape of both label matrices.
# Pre-formatted into one string so the line runs, with identical output, both as a
# Python 2 print statement and as a Python 3 print() call (the original bare print
# statement is a SyntaxError on Python 3).
print("Train : %s & Test : %s" % (train_mlb.shape, test_mlb.shape))

Train : (3353, 45) & Test : (839, 45)


## Feature Extraction

In [25]:
# TF-IDF over word tokens. The stop-word list and the tokenizer both come from the
# PreProcessor, and the vectorizer's own lowercasing is switched off.
tfidf_vect = TfidfVectorizer(
    analyzer="word",
    stop_words=pre.stopwords(),
    tokenizer=pre.tokenize,
    lowercase=False
)

In [26]:
# Fit the vectorizer on the training text only, and turn that text into the
# training document-term matrix.
train_dtm = tfidf_vect.fit_transform(train_data)

In [27]:
# Vectorize the test text with the vectorizer fitted on the training split, so both
# matrices share the same feature columns.
test_dtm = tfidf_vect.transform(test_data)

In [28]:
# Sanity check: show the (documents, features) shape of both document-term matrices.
# Pre-formatted into one string so the line runs, with identical output, both as a
# Python 2 print statement and as a Python 3 print() call (the original bare print
# statement is a SyntaxError on Python 3).
print("Train : %s & Test : %s" % (train_dtm.shape, test_dtm.shape))

Train : (3353, 6035) & Test : (839, 6035)


## Model fitting

In [29]:
# One-vs-rest wrapper: a separate logistic regression is trained per category,
# each with balanced class weights and the newton-cg solver.
clf = OneVsRestClassifier(
    LogisticRegression(class_weight="balanced", solver="newton-cg")
)

In [30]:
# Train the one-vs-rest model on the training features and label matrix.
# NOTE(review): the two truncated "str(classes[c]))" lines in the saved output look
# like warnings emitted during fitting (possibly about label columns that are
# constant in the training split) - re-run to see the full message and confirm.
clf.fit(train_dtm, train_mlb)

  str(classes[c]))
  str(classes[c]))


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False),
          n_jobs=1)

## Make Predictions

In [31]:
# Post-processor configured with the chosen category level; used below to produce
# predictions and to score them.
post = PostProcessor(level)    

In [32]:
# Predict labels for the test matrix via PostProcessor.predict.
# NOTE(review): the result is passed to mlb.inverse_transform in the evaluation cell,
# so it is presumably an indicator matrix with one column per category - confirm
# against postprocess.py.
prediction = post.predict(clf, test_dtm)

## Evaluate

In [33]:
# Score the predictions: the true and the predicted indicator matrices are mapped back
# to category names with mlb.inverse_transform, compared by PostProcessor.accuracy, and
# the result is rounded to two decimals.
# Pre-formatted into one string (keeping the original double space after the colon) so
# the line runs, with identical output, both as a Python 2 print statement and as a
# Python 3 print() call (the original bare print statement is a SyntaxError on Python 3).
print("Accuracy:  %s" % (
    np.round(
        post.accuracy(
            mlb.inverse_transform(test_mlb),
            mlb.inverse_transform(prediction)
        ),
        2
    ),
))

Accuracy:  53.02
