In [12]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score,f1_score

In [2]:
pipeline=Pipeline([
    ('vect',TfidfVectorizer(stop_words='english')),
    ('clf',LogisticRegression())])

In [3]:
parameters = {
   'vect__max_df': (0.25, 0.5, 0.75),
   'vect__stop_words': ('english', None),
   'vect__max_features': (2500, 5000, 10000, None),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'vect__use_idf': (True, False),
     'vect__norm': ('l1', 'l2'),
     'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),
 }

In [4]:
grid_search=GridSearchCV(pipeline,param_grid=parameters,n_jobs=-1,verbose=1,scoring='accuracy',cv=3)

In [5]:
data=pd.read_csv('/home/amal/Current_work/Hyper_parameter_tuning/smsspamcollection/SMSSpamCollection',delimiter='\t',header=None)

In [6]:
x=data.iloc[:,1]
y=data.iloc[:,0]
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42)

In [7]:
print(f' x_train size:{len(x_train)}\n y_train size:{len(y_train)}')

 x_train size:4179
 y_train size:4179


In [8]:
lb=LabelBinarizer()
y_train=np.array([number[0] for number in lb.fit_transform(y_train)])
y_test=np.array([number[0] for number in lb.fit_transform(y_test)])

In [9]:
grid_search.fit(x_train,y_train)

Fitting 3 folds for each of 1536 candidates, totalling 4608 fits


2304 fits failed out of a total of 4608.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2304 fits failed with the following error:
Traceback (most recent call last):
  File "/home/amal/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/amal/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/amal/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/amal/anaconda3/lib/python3.9/site-packages/sklearn/linear

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(stop_words='english')),
                                       ('clf', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'clf__C': (0.01, 0.1, 1, 10),
                         'clf__penalty': ('l1', 'l2'),
                         'vect__max_df': (0.25, 0.5, 0.75),
                         'vect__max_features': (2500, 5000, 10000, None),
                         'vect__ngram_range': ((1, 1), (1, 2)),
                         'vect__norm': ('l1', 'l2'),
                         'vect__stop_words': ('english', None),
                         'vect__use_idf': (True, False)},
             scoring='accuracy', verbose=1)

In [10]:
print ('Best score: %0.3f' % grid_search.best_score_)
print ('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ('\t%s: %r' % (param_name, best_parameters[param_name]))

Best score: 0.984
Best parameters set:
	clf__C: 10
	clf__penalty: 'l2'
	vect__max_df: 0.5
	vect__max_features: 10000
	vect__ngram_range: (1, 2)
	vect__norm: 'l2'
	vect__stop_words: None
	vect__use_idf: True


In [13]:
predictions = grid_search.predict(x_test)
print ('Accuracy:', accuracy_score(y_test, predictions))
print ('Precision:', precision_score(y_test, predictions))
print ('Recall:', recall_score(y_test, predictions))
print ('F1 Score:', f1_score(y_test, predictions))

Accuracy: 0.9899497487437185
Precision: 0.9886363636363636
Recall: 0.9354838709677419
F1 Score: 0.9613259668508287
