In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

In [2]:
# Labelled training sentences, read straight from the project's GitHub repository.
train_df = pd.read_csv(
    'https://raw.githubusercontent.com/aatishsuman/health-advice/master/data/pubmed_causal_language_use.csv'
)
# Display (rows, columns) as the cell output.
train_df.shape

(3061, 2)

In [3]:
# Hold out 10% of the labelled sentences for validation; the fixed seed keeps
# the split identical across re-runs.
X_train, X_validation, y_train, y_validation = train_test_split(
    train_df['sentence'],
    train_df['label'],
    test_size=0.1,
    random_state=42,
)

# The test set lives in its own CSV and is never used for fitting.
test_df = pd.read_csv(
    'https://raw.githubusercontent.com/aatishsuman/health-advice/master/data/test.csv'
)
X_test = test_df['sentence']
y_test = test_df['label']

# Sanity check: print the shape of every split on one line.
print(*[part.shape for part in (X_train, X_validation, y_train, y_validation, X_test, y_test)])

(2754,) (307,) (2754,) (307,) (915,) (915,)


In [4]:
def build_model(penalty='l2', dual=True, max_iter=1000, C=1, min_df=1, max_df=1.0, max_features=None):
    """Fit a TF-IDF + LinearSVC classifier on the training split and print its scores.

    The function reads the notebook-level variables ``X_train``, ``y_train``,
    ``X_validation`` and ``y_validation``, so the cell that creates the split
    must have been executed first.

    Parameters
    ----------
    penalty, dual, max_iter, C
        Passed through unchanged to ``LinearSVC`` (which is always created
        with ``random_state=42``).
    min_df, max_df, max_features
        Passed through unchanged to ``TfidfVectorizer`` (which is always
        created with ``encoding='latin-1'`` and ``stop_words='english'``).

    Returns
    -------
    tuple
        ``(fitted classifier, TF-IDF matrix of X_train, fitted vectorizer)``.

    Side effects
    ------------
    Prints the shape of the training matrix, the classifier's score on the
    validation split, and the mean of the 3-fold cross-validation scores
    computed on the training split.
    """
    tfidf = TfidfVectorizer(
        encoding='latin-1',
        stop_words='english',
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
    )
    classifier = LinearSVC(
        penalty=penalty,
        dual=dual,
        max_iter=max_iter,
        C=C,
        random_state=42,
    )

    # Vocabulary and IDF weights are learned from the training sentences only.
    X_train_tfidf = tfidf.fit_transform(X_train)
    print('Train vector dimensions:', X_train_tfidf.shape)
    fitted_classifier = classifier.fit(X_train_tfidf, y_train)

    # Validation sentences are projected with the already-fitted vectorizer.
    X_validation_tfidf = tfidf.transform(X_validation)
    print('Validation accuracy:', fitted_classifier.score(X_validation_tfidf, y_validation))

    # Cross-validate the whole pipeline on the raw sentences so the vectorizer
    # is re-fitted inside every fold rather than seeing the held-out fold.
    cv_pipeline = Pipeline(steps=[('tfidf', tfidf), ('svc', classifier)])
    fold_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=3)
    print('Cross validation score:', sum(fold_scores) / len(fold_scores))

    return fitted_classifier, X_train_tfidf, tfidf

In [5]:
# Baseline model: build_model() with every hyperparameter left at its default
# (penalty='l2', dual=True, max_iter=1000, C=1, min_df=1, max_df=1.0, max_features=None).
model, vectors, vectorizer = build_model()

Train vector dimensions: (2754, 6559)
Validation accuracy: 0.7263843648208469
Cross validation score: 0.6964415395787945


In [6]:
# Model 1: smaller C (stronger regularization in LinearSVC) and a vocabulary
# restricted to terms that occur in at least 2 training documents.
model, vectors, vectorizer = build_model(
    C=0.3,
    min_df=2,
)

Train vector dimensions: (2754, 3230)
Validation accuracy: 0.745928338762215
Cross validation score: 0.7073347857661583


In [7]:
# Model 2: L1-penalized LinearSVC. The L1 penalty is only supported in the
# primal formulation, hence dual=False.
model, vectors, vectorizer = build_model(
    penalty='l1',
    dual=False,
)

Train vector dimensions: (2754, 6559)
Validation accuracy: 0.762214983713355
Cross validation score: 0.7156862745098039


In [8]:
# Per-class precision/recall/F1 for the most recently built model, first on the
# data it was fitted on, then on the separate test set.
train_predictions = model.predict(vectors)
train_report = classification_report(y_train, train_predictions, target_names=['0','1','2','3'])
print('Train classification report:\n', train_report, '\n')

# Test sentences must go through the same fitted vectorizer as the training data.
test_predictions = model.predict(vectorizer.transform(X_test))
test_report = classification_report(y_test, test_predictions, target_names=['0','1','2','3'])
print('Test classification report:\n', test_report, '\n')

Train classification report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.96      1218
           1       0.96      0.87      0.91       446
           2       0.97      0.77      0.86       197
           3       0.96      0.97      0.96       893

    accuracy                           0.95      2754
   macro avg       0.95      0.90      0.92      2754
weighted avg       0.95      0.95      0.94      2754
 

Test classification report:
               precision    recall  f1-score   support

           0       0.86      0.71      0.78       636
           1       0.27      0.33      0.30        61
           2       0.28      0.26      0.27        19
           3       0.49      0.74      0.59       199

    accuracy                           0.68       915
   macro avg       0.48      0.51      0.48       915
weighted avg       0.73      0.68      0.69       915
 

