### Preparation

In [1]:
import numpy as np
import pandas as pd
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
downloaded = drive.CreateFile({'id':"1FGPFuPhDm7TP4ldlONogjAUtoskOay1J"})
downloaded.GetContentFile('train.csv') 
train = pd.read_csv('train.csv') 
downloaded = drive.CreateFile({'id':"1hB6XLrMXVgyFIjZ-tpS2x0u0aegIh5rY"})
downloaded.GetContentFile('val.csv') 
val = pd.read_csv('val.csv') 
downloaded = drive.CreateFile({'id':"1ZDTPRJjZgOdkFOngjuN7RKd8-PMWCTnx"})
downloaded.GetContentFile('test.csv') 
test = pd.read_csv('test.csv') 

In [3]:
train

Unnamed: 0.1,Unnamed: 0,index,review,condition
0,0,270,dentist prescribe really rough wisdom tooth re...,0
1,1,529,issue every day almost issue notice side effect,1
2,2,517,medication mg x daily chest pain shortness bre...,1
3,3,808,start take medication migraine yesterday altho...,0
4,4,970,work admirably immediate control reflux sympto...,1
...,...,...,...,...
689,689,156,medicine okay certainly get rid paranoid thoug...,0
690,690,903,make long story short miss year quality time c...,0
691,691,662,im diagnose bipolar disorder psychotic feature...,0
692,692,912,ive use week far skin seem peel burn however c...,0


In [4]:
val

Unnamed: 0.1,Unnamed: 0,index,review,condition
0,0,814,struggle acne half life try counter remedy ben...,0
1,1,494,nutshell even though want drug work hate take ...,0
2,2,887,intense pain along bottom rib second round car...,1
3,3,426,sooo im unwillingly pretty prone yi didnt opti...,0
4,4,642,personally terrible experience dexilant acid r...,1
...,...,...,...,...
293,293,658,start take loseasonique live busy lifestyle qu...,0
294,294,843,stuff really awful within week tear bicep hand...,1
295,295,292,va prescribe medication use year find excessiv...,1
296,296,539,side effect increase time arrythmia chest pain...,1


In [5]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

X_train = train['review']
y_train = train['condition']
X_val = val['review']
y_val = val['condition']

In [6]:
X_test = test['review']
y_test = test['condition']

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfvec = TfidfVectorizer(ngram_range=(1,1))
x_train = tfidfvec.fit_transform(X_train)
x_train

<694x3392 sparse matrix of type '<class 'numpy.float64'>'
	with 21716 stored elements in Compressed Sparse Row format>

In [8]:
x_train.todense().shape

(694, 3392)

In [9]:
features_taken = x_train.todense().shape[1]
features_taken

3392

### Train

#### linear

In [10]:
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,1),max_features=features_taken)), ('clf', SVC(C=1, kernel='linear'))])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_train)

In [11]:
text_clf

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=3392,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 SVC(C=1, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degre

In [12]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, predicted)

array([[348,   2],
       [  5, 339]])

In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train, predicted)

0.9899135446685879

In [14]:
from sklearn.metrics import recall_score
recall_score(y_train, predicted)

0.9854651162790697

In [15]:
from sklearn.metrics import precision_score
precision_score(y_train, predicted)

0.9941348973607038

In [16]:
from sklearn.metrics import f1_score
f1_score(y_train, predicted)

0.9897810218978101

In [17]:
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_train, predicted)
metrics.auc(fpr, tpr)

0.9898754152823921

#### rbf

In [18]:
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,1),max_features=features_taken)), ('clf', SVC(C=1000, gamma='auto', kernel='rbf'))])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_train)

In [19]:
text_clf

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=3392,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 SVC(C=1000, break_ties=False, cache_size=200,
                     class_weight=None, coef0=0.0,
                     decision_fun

In [20]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, predicted)

array([[346,   4],
       [ 16, 328]])

In [21]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train, predicted)

0.9711815561959655

In [22]:
from sklearn.metrics import recall_score
recall_score(y_train, predicted)

0.9534883720930233

In [23]:
from sklearn.metrics import precision_score
precision_score(y_train, predicted)

0.9879518072289156

In [24]:
from sklearn.metrics import f1_score
f1_score(y_train, predicted)

0.9704142011834319

In [25]:
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_train, predicted)
metrics.auc(fpr, tpr)

0.9710299003322258

#### poly

In [26]:
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,1),max_features=features_taken)), ('clf', SVC(C=1, degree=2, gamma='scale', kernel='poly'))])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_train)

In [27]:
text_clf

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=3392,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 SVC(C=1, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degre

In [28]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, predicted)

array([[350,   0],
       [  0, 344]])

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train, predicted)

1.0

In [30]:
from sklearn.metrics import recall_score
recall_score(y_train, predicted)

1.0

In [31]:
from sklearn.metrics import precision_score
precision_score(y_train, predicted)

1.0

In [32]:
from sklearn.metrics import f1_score
f1_score(y_train, predicted)

1.0

In [33]:
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_train, predicted)
metrics.auc(fpr, tpr)

1.0

### Validation

#### linear

In [34]:
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,1),max_features=features_taken)), ('clf', SVC(C=1, kernel='linear'))])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_val)

In [35]:
text_clf

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=3392,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 SVC(C=1, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degre

In [36]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, predicted)

array([[139,   7],
       [ 21, 131]])

In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(y_val, predicted)

0.9060402684563759

In [38]:
from sklearn.metrics import recall_score
recall_score(y_val, predicted)

0.8618421052631579

In [39]:
from sklearn.metrics import precision_score
precision_score(y_val, predicted)

0.9492753623188406

In [40]:
from sklearn.metrics import f1_score
f1_score(y_val, predicted)

0.903448275862069

In [41]:
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_val, predicted)
metrics.auc(fpr, tpr)

0.906948449891853

#### rbf

In [42]:
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,1),max_features=features_taken)), ('clf', SVC(C=1000, gamma='auto', kernel='rbf'))])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_val)

In [43]:
text_clf

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=3392,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 SVC(C=1000, break_ties=False, cache_size=200,
                     class_weight=None, coef0=0.0,
                     decision_fun

In [44]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, predicted)

array([[141,   5],
       [ 23, 129]])

In [45]:
from sklearn.metrics import accuracy_score
accuracy_score(y_val, predicted)

0.9060402684563759

In [46]:
from sklearn.metrics import recall_score
recall_score(y_val, predicted)

0.8486842105263158

In [47]:
from sklearn.metrics import precision_score
precision_score(y_val, predicted)

0.9626865671641791

In [48]:
from sklearn.metrics import f1_score
f1_score(y_val, predicted)

0.9020979020979021

In [49]:
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_val, predicted)
metrics.auc(fpr, tpr)

0.9072188175919251

#### poly

In [50]:
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,1),max_features=features_taken)), ('clf', SVC(C=1, degree=2, gamma='scale', kernel='poly'))])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_val)

In [51]:
text_clf

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=3392,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 SVC(C=1, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degre

In [52]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, predicted)

array([[141,   5],
       [ 24, 128]])

In [53]:
from sklearn.metrics import accuracy_score
accuracy_score(y_val, predicted)

0.9026845637583892

In [54]:
from sklearn.metrics import recall_score
recall_score(y_val, predicted)

0.8421052631578947

In [55]:
from sklearn.metrics import precision_score
precision_score(y_val, predicted)

0.9624060150375939

In [56]:
from sklearn.metrics import f1_score
f1_score(y_val, predicted)

0.8982456140350876

In [57]:
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_val, predicted)
metrics.auc(fpr, tpr)

0.9039293439077144

### Test

#### linear

In [58]:
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,1),max_features=features_taken)), ('clf', SVC(C=1, kernel='linear'))])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

In [59]:
text_clf

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=3392,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 SVC(C=1, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degre

In [60]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predicted)

array([[200,  19],
       [ 22, 197]])

In [61]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predicted)

0.906392694063927

In [62]:
from sklearn.metrics import recall_score
recall_score(y_test, predicted)

0.8995433789954338

In [63]:
from sklearn.metrics import precision_score
precision_score(y_test, predicted)

0.9120370370370371

In [64]:
from sklearn.metrics import f1_score
f1_score(y_test, predicted)

0.9057471264367817

In [65]:
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, predicted)
metrics.auc(fpr, tpr)

0.9063926940639269

#### rbf

In [66]:
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,1),max_features=features_taken)), ('clf', SVC(C=1000, gamma='auto', kernel='rbf'))])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

In [67]:
text_clf

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=3392,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 SVC(C=1000, break_ties=False, cache_size=200,
                     class_weight=None, coef0=0.0,
                     decision_fun

In [68]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predicted)

array([[208,  11],
       [ 27, 192]])

In [69]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predicted)

0.91324200913242

In [70]:
from sklearn.metrics import recall_score
recall_score(y_test, predicted)

0.8767123287671232

In [71]:
from sklearn.metrics import precision_score
precision_score(y_test, predicted)

0.9458128078817734

In [72]:
from sklearn.metrics import f1_score
f1_score(y_test, predicted)

0.909952606635071

In [73]:
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, predicted)
metrics.auc(fpr, tpr)

0.91324200913242

#### poly

In [74]:
text_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,1),max_features=features_taken)), ('clf', SVC(C=1, degree=2, gamma='scale', kernel='poly'))])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

In [75]:
text_clf

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=3392,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 SVC(C=1, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degre

In [76]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predicted)

array([[205,  14],
       [ 25, 194]])

In [77]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predicted)

0.910958904109589

In [78]:
from sklearn.metrics import recall_score
recall_score(y_test, predicted)

0.8858447488584474

In [79]:
from sklearn.metrics import precision_score
precision_score(y_test, predicted)

0.9326923076923077

In [80]:
from sklearn.metrics import f1_score
f1_score(y_test, predicted)

0.9086651053864168

In [81]:
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, predicted)
metrics.auc(fpr, tpr)

0.910958904109589