# TfidfVectorizer parameter tuning with gridsearchCV

### Import data and packages:

In [2]:
#import data frames from previous notebook

%store -r textdata

%store -r titledata

In [3]:
#import packages

import pandas as pd 

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report



## TF-IDF on text column:

In [4]:
# Corpus Selection
x_text = textdata.text.astype(str)

# Target Selection
y_text = textdata.label

In [5]:
#split text data into training and test sets 
x_text_train, x_text_test, y_text_train, y_text_test = train_test_split(
    x_text, y_text,
    test_size=0.3,
    random_state=42
)

### Naive-Bayes

In [6]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
tfidf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB())])
tfidf_nb_params = {
    'tfidf__min_df': (1, 2, 5, 10),
    'tfidf__max_df': (0.6,0.7, 0.8, 0.9),
    'tfidf__max_features': (None, 10000, 15000, 20000)
}

In [7]:
cv = RepeatedStratifiedKFold(n_splits=3,random_state=42)


clf_nb_text = GridSearchCV(tfidf_nb, tfidf_nb_params, cv=cv, scoring='accuracy', n_jobs = -1)


In [8]:
clf_nb_text.fit(x_text_train, y_text_train)

In [14]:
y_pred_nb_text = clf_nb_text.predict(x_text_test)

In [15]:
print(classification_report(y_text_test, y_pred_nb_text, digits=4))

              precision    recall  f1-score   support

           0     0.8642    0.8721    0.8681      9776
           1     0.8377    0.8280    0.8328      7791

    accuracy                         0.8526     17567
   macro avg     0.8509    0.8501    0.8505     17567
weighted avg     0.8524    0.8526    0.8525     17567



In [16]:
clf_nb_text_params = clf_nb_text.best_params_

In [50]:
clf_nb_text_params

{'tfidf__max_df': 0.9, 'tfidf__max_features': None, 'tfidf__min_df': 5}

In [18]:
%store clf_nb_text_params

Stored 'clf_nb_text_params' (dict)


### LogReg

In [23]:
from sklearn.linear_model import LogisticRegression

tfidf_lr = Pipeline([('tfidf', TfidfVectorizer()),
                     ('lr', LogisticRegression())])
tfidf_lr_params = {
    'tfidf__min_df': (2, 5, 10),
    'tfidf__max_df': (0.6,0.7, 0.8, 0.9),
    'tfidf__max_features': (None, 10000, 15000, 20000)
}

In [25]:
clf_lr_text = GridSearchCV(tfidf_lr, tfidf_lr_params, cv=3, scoring='accuracy', n_jobs = -1)

clf_lr_text.fit(x_text_train, y_text_train)

y_pred_lr_text = clf_lr_text.predict(x_text_test)


              precision    recall  f1-score   support

           0       0.94      0.94      0.94      9776
           1       0.93      0.93      0.93      7791

    accuracy                           0.94     17567
   macro avg       0.94      0.94      0.94     17567
weighted avg       0.94      0.94      0.94     17567



In [26]:
print(classification_report(y_text_test, y_pred_lr_text, digits=4))

              precision    recall  f1-score   support

           0     0.9447    0.9443    0.9445      9776
           1     0.9301    0.9307    0.9304      7791

    accuracy                         0.9382     17567
   macro avg     0.9374    0.9375    0.9374     17567
weighted avg     0.9382    0.9382    0.9382     17567



In [27]:
clf_lr_text_params = clf_lr_text.best_params_

In [28]:
clf_lr_text_params

{'tfidf__max_df': 0.9, 'tfidf__max_features': 15000, 'tfidf__min_df': 2}

In [29]:
%store clf_lr_text_params

Stored 'clf_lr_text_params' (dict)


### SVM

In [52]:
from sklearn.svm import SVC

tfidf_svm = Pipeline([('tfidf', TfidfVectorizer()),
                     ('svm', SVC())])
tfidf_svm_params = {
    'tfidf__min_df': (2, 5, 10),
    'tfidf__max_df': (0.6,0.7, 0.8, 0.9),
    'tfidf__max_features': (None, 10000, 15000, 20000)
}

In [53]:
cv = RepeatedStratifiedKFold(n_splits=3,random_state=42)

clf_svm_text = GridSearchCV(tfidf_svm, tfidf_svm_params, cv=3, scoring='accuracy', n_jobs = -1)

In [54]:
clf_svm_text.fit(x_text_train, y_text_train)

In [None]:
y_pred_svm_text = clf_svm_text.predict(x_text_test)

In [67]:
print(classification_report(y_text_test, y_pred_svm_text, digits=4))

              precision    recall  f1-score   support

           0     0.9521    0.9532    0.9526      9776
           1     0.9411    0.9398    0.9405      7791

    accuracy                         0.9472     17567
   macro avg     0.9466    0.9465    0.9465     17567
weighted avg     0.9472    0.9472    0.9472     17567



In [68]:
clf_svm_text_params = clf_svm_text.best_params_

{'tfidf__max_df': 0.6, 'tfidf__max_features': 15000, 'tfidf__min_df': 10}

In [51]:
clf_svm_text_params

NameError: name 'clf_svm_text_params' is not defined

In [None]:
%store clf_svm_text_params

## TF-IDF on title column:

In [30]:
# Corpus Selection
x_title = titledata.title.astype(str)

# Target Selection
y_title = titledata.label

In [31]:
#split title data into training and test sets 
x_title_train, x_title_test, y_title_train, y_title_test = train_test_split(
    x_title, y_title,
    test_size=0.3,
    random_state=42
)

### Naive-Bayes - title

In [33]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
tfidf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', MultinomialNB())])
tfidf_nb_params = {
    'tfidf__min_df': (2, 5, 10),
    'tfidf__max_df': (0.6, 0.7, 0.8, 0.9),
    'tfidf__max_features': (None, 10000, 15000, 20000)
}

In [34]:
cv = RepeatedStratifiedKFold(n_splits=3,random_state=42)

clf_nb_title = GridSearchCV(tfidf_nb, tfidf_nb_params, cv=cv, scoring='accuracy', n_jobs = -1)

clf_nb_title.fit(x_title_train, y_title_train)

y_pred_nb_title = clf_nb_title.predict(x_title_test)

print(classification_report(y_title_test, y_pred_nb_title, digits=4))

              precision    recall  f1-score   support

           0     0.8621    0.9001    0.8807      9776
           1     0.8673    0.8194    0.8427      7791

    accuracy                         0.8643     17567
   macro avg     0.8647    0.8597    0.8617     17567
weighted avg     0.8644    0.8643    0.8638     17567



In [35]:
clf_nb_title_params = clf_nb_title.best_params_

In [36]:
clf_nb_title_params

{'tfidf__max_df': 0.6, 'tfidf__max_features': None, 'tfidf__min_df': 2}

In [37]:
%store clf_nb_title_params

Stored 'clf_nb_title_params' (dict)


### LogReg - title

In [38]:
from sklearn.linear_model import LogisticRegression

tfidf_lr = Pipeline([('tfidf', TfidfVectorizer()),
                     ('lr', LogisticRegression())])
tfidf_lr_params = {
    'tfidf__min_df': (2, 5, 10),
    'tfidf__max_df': (0.6, 0.7, 0.8, 0.9),
    'tfidf__max_features': (None, 10000, 15000, 20000)
}

In [40]:
cv = RepeatedStratifiedKFold(n_splits=3,random_state=42)

clf_lr_title = GridSearchCV(tfidf_lr, tfidf_lr_params, cv=cv, scoring='accuracy', n_jobs = -1)

clf_lr_title.fit(x_title_train, y_title_train)

y_pred_lr_title = clf_lr_title.predict(x_title_test)

print(classification_report(y_title_test, y_pred_lr_title, digits=4))

              precision    recall  f1-score   support

           0     0.8924    0.9023    0.8973      9776
           1     0.8757    0.8634    0.8695      7791

    accuracy                         0.8851     17567
   macro avg     0.8840    0.8829    0.8834     17567
weighted avg     0.8850    0.8851    0.8850     17567



In [42]:
clf_lr_title_params = clf_lr_title.best_params_

In [43]:
clf_lr_title_params

{'tfidf__max_df': 0.6, 'tfidf__max_features': None, 'tfidf__min_df': 2}

In [44]:
%store clf_lr_title_params

Stored 'clf_lr_title_params' (dict)


### SVM - title

In [45]:
from sklearn.svm import SVC

tfidf_svm = Pipeline([('tfidf', TfidfVectorizer()),
                     ('svm', SVC())])
tfidf_svm_params = {
    'tfidf__min_df': (2, 5, 10),
    'tfidf__max_df': (0.6, 0.7, 0.8, 0.9),
    'tfidf__max_features': (None, 10000, 15000, 20000)
}

In [46]:
cv = RepeatedStratifiedKFold(n_splits=3,random_state=42)

clf_svm_title = GridSearchCV(tfidf_svm, tfidf_svm_params, cv=cv, scoring='accuracy', n_jobs = -1)

clf_svm_title.fit(x_title_train, y_title_train)

y_pred_svm_title = clf_svm_title.predict(x_title_test)

print(classification_report(y_title_test, y_pred_svm_title, digits=4))

              precision    recall  f1-score   support

           0     0.9047    0.9050    0.9048      9776
           1     0.8807    0.8804    0.8805      7791

    accuracy                         0.8941     17567
   macro avg     0.8927    0.8927    0.8927     17567
weighted avg     0.8941    0.8941    0.8941     17567



In [47]:
clf_svm_title_params = clf_svm_title.best_params_

In [48]:
clf_svm_title_params

{'tfidf__max_df': 0.6, 'tfidf__max_features': None, 'tfidf__min_df': 2}

In [49]:
%store clf_svm_title_params

Stored 'clf_svm_title_params' (dict)


### Create and store variables for later use:

In [67]:
x_text_train_tf, x_text_test_tf, y_text_train_tf, y_text_test_tf = x_text_train, x_text_test, y_text_train, y_text_test

In [68]:
%store x_text_train_tf
%store x_text_test_tf
%store y_text_train_tf
%store y_text_test_tf

Stored 'x_text_train_tf' (csr_matrix)
Stored 'x_text_test_tf' (csr_matrix)
Stored 'y_text_train_tf' (Series)
Stored 'y_text_test_tf' (Series)


In [69]:
x_title_train_tf, x_title_test_tf, y_title_train_tf, y_title_test_tf = x_title_train, x_title_test, y_title_train, y_title_test

In [70]:
%store x_title_train_tf
%store x_title_test_tf
%store y_title_train_tf
%store y_title_test_tf

Stored 'x_title_train_tf' (csr_matrix)
Stored 'x_title_test_tf' (csr_matrix)
Stored 'y_title_train_tf' (Series)
Stored 'y_title_test_tf' (Series)


In [71]:
x_title_test_tf

<17567x4249 sparse matrix of type '<class 'numpy.float64'>'
	with 138016 stored elements in Compressed Sparse Row format>