In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Labelled training corpus, fetched straight from the project's GitHub repository.
TRAIN_DATA_URL = 'https://raw.githubusercontent.com/aatishsuman/health-advice/master/data/pubmed_causal_language_use.csv'

train_df = pd.read_csv(TRAIN_DATA_URL)
# Trailing expression: shows (rows, columns) as the cell output.
train_df.shape

(3061, 2)

In [3]:
# Hold out 10% of the labelled corpus for validation; the fixed seed keeps the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    train_df['sentence'],
    train_df['label'],
    test_size=0.1,
    random_state=42,
)

# The test set lives in its own CSV and is never used for fitting in this cell.
test_df = pd.read_csv('https://raw.githubusercontent.com/aatishsuman/health-advice/master/data/test.csv')
X_test = test_df['sentence']
y_test = test_df['label']

# Sanity check: print the shape of every split on a single line.
print(*(part.shape for part in (X_train, X_validation, y_train, y_validation, X_test, y_test)))

(2754,) (307,) (2754,) (307,) (915,) (915,)


In [4]:
def build_model(alpha=1.0, min_df=1, max_df=1.0, train_data=None, validation_data=None):
    """Fit a term-frequency + multinomial naive Bayes classifier and print its validation accuracy.

    Parameters
    ----------
    alpha : float
        Smoothing parameter forwarded to ``MultinomialNB``.
    min_df, max_df : int or float
        Document-frequency cut-offs forwarded to ``TfidfVectorizer``.
    train_data : tuple of (texts, labels), optional
        Data to fit on. When omitted, the notebook-level ``X_train`` /
        ``y_train`` are used, so existing calls behave as before.
    validation_data : tuple of (texts, labels), optional
        Data to score on. When omitted, the notebook-level ``X_validation`` /
        ``y_validation`` are used.

    Returns
    -------
    tuple
        ``(fitted classifier, training feature matrix, fitted vectorizer)``.
    """
    # The notebook globals are only looked up when no data is passed in, which
    # keeps the function usable outside this notebook's namespace.
    fit_texts, fit_labels = (X_train, y_train) if train_data is None else train_data
    val_texts, val_labels = (X_validation, y_validation) if validation_data is None else validation_data

    # use_idf=False switches off IDF weighting; English stop words are dropped.
    vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=False, min_df=min_df, max_df=max_df, stop_words='english')
    mnb = MultinomialNB(alpha=alpha)

    train_vectors = vectorizer.fit_transform(fit_texts)
    print('Train vector dimensions: ', train_vectors.shape)
    mnb_model = mnb.fit(train_vectors, fit_labels)

    # Validation text is only transformed (never fitted), so it is encoded
    # with the vocabulary learned from the training text.
    print('Validation accuracy: ', mnb_model.score(vectorizer.transform(val_texts), val_labels))

    return mnb_model, train_vectors, vectorizer

In [5]:
def perform_grid_search(param_grid):
    """Grid-search the vectorizer + naive Bayes pipeline on the training split.

    Runs a 3-fold cross-validated search over ``param_grid`` using the
    notebook-level ``X_train`` / ``y_train``, prints the winning parameter
    combination and its score, and returns the fitted search.

    Parameters
    ----------
    param_grid : dict
        Candidate values keyed by pipeline parameter name. Vectorizer
        parameters use the ``tf__`` prefix and classifier parameters the
        ``nb__`` prefix, matching the step names below.

    Returns
    -------
    GridSearchCV
        The fitted search, so callers can read ``best_params_`` or
        ``best_estimator_`` directly instead of copying values from the
        printed output. When the call is the last expression of a cell,
        Jupyter displays the returned object; assign it or end the line
        with ``;`` to suppress that.
    """
    pipeline = Pipeline([('tf', TfidfVectorizer(encoding='latin-1', use_idf=False, stop_words='english')),
                         ('nb', MultinomialNB())])
    # n_jobs=-1 uses every available core; verbose=3 logs progress per fit.
    grid_cv = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=3)
    grid_cv.fit(X_train, y_train)

    print("Best parameters set:")
    print("Best Model's Params: ", grid_cv.best_params_)
    print('Best Score: ', grid_cv.best_score_)

    return grid_cv

In [6]:
# baseline model: build_model's own default arguments, spelled out explicitly
model, vectors, vectorizer = build_model(
    alpha=1.0,
    min_df=1,
    max_df=1.0,
)

Train vector dimensions:  (2754, 6559)
Validation accuracy:  0.7003257328990228


In [7]:
# grid search - Model 1
# Coarse sweep over the document-frequency cut-offs and the smoothing strength.
model_1_grid = {
    'tf__max_df': (0.01, 0.1, 0.2, 1.0),
    'tf__min_df': (5, 10, 20, 1),
    'nb__alpha': (1e-2, 1e-3, 1.0),
}
perform_grid_search(model_1_grid)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    3.6s


Best parameters set:
Best Model's Params:  {'nb__alpha': 0.01, 'tf__max_df': 0.2, 'tf__min_df': 10}
Best Score:  0.6746550472040669


[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:    4.1s finished


In [8]:
# Model 1: refit with the best combination reported by the first grid search
ALPHA = 0.01
MIN_DF = 10
MAX_DF = 0.2

model, vectors, vectorizer = build_model(
    alpha=ALPHA,
    min_df=MIN_DF,
    max_df=MAX_DF,
)

Train vector dimensions:  (2754, 653)
Validation accuracy:  0.7296416938110749


In [9]:
# grid search - Model 2
# Finer sweep of min_df: every integer from 5 to 10 inclusive.
model_2_grid = {
    'tf__max_df': (0.01, 0.1, 0.2, 1.0),
    'tf__min_df': tuple(range(5, 11)),
    'nb__alpha': (1e-2, 1e-3, 1.0),
}
perform_grid_search(model_2_grid)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s


Best parameters set:
Best Model's Params:  {'nb__alpha': 0.01, 'tf__max_df': 0.2, 'tf__min_df': 8}
Best Score:  0.6782861292665214


[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:    2.5s finished


In [10]:
# Model 2: refit with the best combination reported by the second grid search
model, vectors, vectorizer = build_model(
    alpha=0.01,
    min_df=8,
    max_df=0.2,
)

Train vector dimensions:  (2754, 811)
Validation accuracy:  0.7198697068403909


In [11]:
# best model
# Model 1's settings scored higher on the validation split than Model 2's
# (0.7296 vs 0.7199), so refit with them before the final evaluation.
best_settings = dict(alpha=0.01, min_df=10, max_df=0.2)
model, vectors, vectorizer = build_model(**best_settings)

Train vector dimensions:  (2754, 653)
Validation accuracy:  0.7296416938110749


In [12]:
def _print_report(split_name, y_true, features):
    """Print the per-class precision/recall report of the current ``model`` for one split."""
    report = classification_report(y_true,
                                   model.predict(features),
                                   target_names=['0','1','2','3'])
    print(split_name + ' classification report:\n', report, '\n')


# Training features were already computed when the model was built; the test
# sentences still have to be encoded with the fitted vectorizer.
_print_report('Train', y_train, vectors)
_print_report('Test', y_test, vectorizer.transform(X_test))

Train classification report:
               precision    recall  f1-score   support

           0       0.80      0.89      0.84      1218
           1       0.81      0.56      0.66       446
           2       0.88      0.32      0.47       197
           3       0.79      0.89      0.84       893

    accuracy                           0.80      2754
   macro avg       0.82      0.67      0.70      2754
weighted avg       0.80      0.80      0.78      2754
 

Test classification report:
               precision    recall  f1-score   support

           0       0.84      0.58      0.69       636
           1       0.17      0.13      0.15        61
           2       0.33      0.11      0.16        19
           3       0.39      0.83      0.53       199

    accuracy                           0.60       915
   macro avg       0.43      0.41      0.38       915
weighted avg       0.69      0.60      0.61       915
 

