In [None]:
# Mount Google Drive at /content/gdrive (Colab only); the train/test CSVs
# loaded later in this notebook are read from this mount.
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [None]:
# Install the third-party `contractions` package imported by this notebook.
# NOTE(review): the version is not pinned, so results may vary between runs.
!pip install contractions



In [None]:
## import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

import contractions
from bs4 import BeautifulSoup
import numpy as np
import re
import tqdm
import unicodedata
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus and keep the English stopword list; it is
# used by pre_process_corpus to drop stopwords from each document.
nltk.download('stopwords')
stop_words = stopwords.words('english')

Basic Text Pre-processing
Steps include the following:


*   Removing stopwords
*   Fixing contractions
*   Removing special characters
*   Converting accented characters

Note: some models, such as BERT, are used without any of this pre-processing!

In [None]:
def remove_accented_chars(text):
  """Return `text` reduced to plain ASCII characters.

  The string is NFKD-normalised, which splits an accented letter into its
  base letter plus combining marks; every character that cannot be encoded
  as ASCII is then dropped. Example: 'é' becomes 'e'.
  """
  decomposed = unicodedata.normalize('NFKD', text)
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

def pre_process_corpus(docs):
  """Normalise an iterable of raw text documents.

  Each document is processed in this order: newline / tab / carriage-return
  characters become spaces, the text is lower-cased, accented characters are
  reduced to ASCII, contractions are expanded, every character that is not a
  letter, digit or whitespace is removed, and English stopwords (the
  module-level `stop_words` list) are dropped.

  Args:
    docs: iterable of str documents.

  Returns:
    list of str: one cleaned document per input document, in input order.
  """
  # Build the lookup structures once instead of once per document / per word.
  whitespace_table = str.maketrans("\n\t\r", "   ")
  stop_word_set = set(stop_words)

  norm_docs = []
  for doc in tqdm.tqdm(docs):
    doc = doc.translate(whitespace_table)
    doc = doc.lower()
    doc = remove_accented_chars(doc)
    doc = contractions.fix(doc)
    # The flags must be passed by keyword: the 4th positional argument of
    # re.sub is `count`, so passing re.I|re.A there capped the number of
    # replacements per document instead of setting any flag.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I|re.A)
    # split() with no argument already discards leading/trailing whitespace.
    doc = ' '.join(w for w in doc.split() if w not in stop_word_set)
    norm_docs.append(doc)

  return norm_docs

In [None]:
## load the training and testing datasets from the mounted Drive folder
TRAIN_CSV = '/content/gdrive/My Drive/lily/train.csv'
TEST_CSV = '/content/gdrive/My Drive/lily/test.csv'
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [None]:
### Map each disease category name to an integer class id (26 classes, ids 0-25).
### Note: the ids do not follow the alphabetical order of the names; they are
### kept exactly as given.
output_map = {'Animal Diseases': 0,
 'Bacterial Infections and Mycoses': 3,
 'Cardiovascular Diseases': 1,
 'Chemically-Induced Disorders': 2,
 'Congenital Hereditary and Neonatal Diseases and Abnormalities': 4,
 'Digestive System Diseases': 5,
 'Disorders of Environmental Origin': 22,
 'Endocrine System Diseases': 6,
 'Eye Diseases': 7,
 'Female Urogenital Diseases and Pregnancy Complications': 8,
 'Hemic and Lymphatic Diseases': 9,
 'Immune System Diseases': 10,
 'Male Urogenital Diseases': 11,
 'Musculoskeletal Diseases': 24,
 'Neoplasms': 12,
 'Nervous System Diseases': 13,
 'Nutritional and Metabolic Diseases': 14,
 'Occupational Diseases': 15,
 'Otorhinolaryngologic Diseases': 16,
 'Parasitic Diseases': 17,
 'Pathological Conditions and Signs and Symptoms': 18,
 'Respiratory Tract Diseases': 19,
 'Skin and Connective Tissue Diseases': 25,
 'Stomatognathic Diseases': 20,
 'Virus Diseases': 21,
 'Wounds and Injuries': 23}

# Encode the labels of both splits in place. Columns that are already numeric
# are skipped so that re-running this cell is a no-op: mapping already-encoded
# integers through output_map would turn every label into NaN.
for split_name, frame in (('train', train), ('test', test)):
  if pd.api.types.is_numeric_dtype(frame['categories']):
    continue
  encoded = frame['categories'].map(output_map)
  # Series.map yields NaN for any value missing from the dict; fail here with
  # the offending labels rather than later inside a model's fit().
  unmapped = frame.loc[encoded.isna(), 'categories'].unique()
  if len(unmapped):
    raise ValueError("Unmapped categories in %s: %r" % (split_name, list(unmapped)))
  frame['categories'] = encoded

In [None]:
%%time
# Clean the abstract text of both splits with the same pipeline.
train_abstracts = train['abstract'].tolist()
test_abstracts = test['abstract'].tolist()
norm_train_reviews = pre_process_corpus(train_abstracts)
norm_test_reviews = pre_process_corpus(test_abstracts)

100%|██████████| 43916/43916 [00:24<00:00, 1818.09it/s]
100%|██████████| 10862/10862 [00:05<00:00, 1828.70it/s]

CPU times: user 29.9 s, sys: 263 ms, total: 30.2 s
Wall time: 30.2 s





In [None]:
# Fit a TF-IDF vectoriser on the cleaned training abstracts only: uni- to
# tri-grams, terms seen in fewer than 3 documents dropped, no vocabulary cap.
tv = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    min_df=3,
    max_features=None,
)
tv_train_features = tv.fit_transform(norm_train_reviews)

### Building Random Forest classifier

In [None]:
# Exhaustive 5-fold cross-validated grid search over random-forest
# hyper-parameters, fitted on the TF-IDF training features.
rfc=RandomForestClassifier(random_state=42)
param_grid = { 
    'n_estimators': [200, 500],
    # 'auto' was dropped from this list: for RandomForestClassifier it is an
    # alias of 'sqrt' (so it only duplicated a third of the grid), and newer
    # scikit-learn releases reject the value.
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8,9,10,12],
    'criterion' :['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(tv_train_features, np.array(train['categories']))

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=42,
                                  

In [None]:
# Show the hyper-parameter combination selected by the grid search.
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 12,
 'max_features': 'auto',
 'n_estimators': 500}

In [None]:
# Fit a random forest with the best hyper-parameters reported by the grid
# search. max_features='sqrt' is what the reported 'auto' means for a
# classifier; it is spelled out because newer scikit-learn releases no longer
# accept 'auto'.
rfc = RandomForestClassifier(
    random_state=42,
    criterion='gini',
    max_depth=12,
    max_features='sqrt',
    n_estimators=500,
)
rfc.fit(tv_train_features,train['categories'])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=12, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

### Evaluating on test dataset

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# Vectorise the cleaned test abstracts with the vocabulary fitted on the
# training set, then report the random forest's accuracy as a percentage.
test_features = tv.transform(norm_test_reviews)
predictions = rfc.predict(test_features)
rf_accuracy_pct = accuracy_score(test['categories'], predictions) * 100
print("Accuracy: %.2f%%" % rf_accuracy_pct)

Accuracy: 47.42%


In [None]:
from sklearn.metrics import f1_score
# F1 on the test set under the three averaging schemes, each printed as a
# percentage with two decimals. This scores whatever `predictions` currently
# holds (the random forest's predictions when the cells are run in order).
test_labels = list(test['categories'])
for average in ('macro', 'micro', 'weighted'):
  f1_pct = f1_score(test_labels, predictions, average=average) * 100
  print("%s: %.2f%%" % (average, f1_pct))


## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model with default settings on the same
# TF-IDF training features used for the random forest.
nb_model = MultinomialNB()
nb_model.fit(tv_train_features,train['categories'])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# Score the Naive Bayes model on the already-vectorised test features.
predictions = nb_model.predict(test_features)
nb_accuracy_pct = accuracy_score(test['categories'], predictions) * 100
print("Accuracy: %.2f%%" % nb_accuracy_pct)

Accuracy: 50.12%


## XGBoost

In [None]:
from xgboost.sklearn import XGBClassifier
# Initial (untuned) XGBoost model. The settings are gathered in one dict so
# they can be read and adjusted in a single place; num_class=26 matches the
# 26 category ids defined in output_map.
xgb_params = dict(
    learning_rate=0.1,
    n_estimators=300,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    nthread=4,
    num_class=26,
    seed=27,
)
xgb1 = XGBClassifier(**xgb_params)
xgb1.fit(tv_train_features, train['categories'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=300, n_jobs=1,
              nthread=4, num_class=26, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=27, silent=None, subsample=0.8, verbosity=1)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# Re-vectorise the cleaned test abstracts and report XGBoost's accuracy as a
# percentage.
test_features = tv.transform(norm_test_reviews)
predictions = xgb1.predict(test_features)
xgb_accuracy_pct = accuracy_score(test['categories'], predictions) * 100
print("Accuracy: %.2f%%" % xgb_accuracy_pct)

Accuracy: 76.63%


## Best performing model: XGBoost
## Accuracy: 76.63%
## F1-score (micro): 76%

#### Out of 10862 observations, 8255 test data points are correctly predicted; the error rate is 24%.

### Why XGBoost?
##### XGBoost is a scalable and accurate implementation of the gradient boosting algorithm, and it is really fast compared to other implementations of gradient boosting.
##### Evidence of this is that it is the go-to algorithm for competition winners on the Kaggle competitive data science platform.
##### It works well for imbalanced datasets.
##### When in doubt, use XGBoost: it works well in almost every case.
