In [10]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

stop_words = set(stopwords.words('english'))

In [12]:
from sklearn.model_selection import train_test_split

data_df = pd.read_csv("spam.csv")
data_df.head(3)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [13]:
data_X = data_df[['text']].values
data_y = data_df[['label']].values
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2)

In [14]:
train_X = [x[0].strip() for x in X_train.tolist()]
test_X = [x[0].strip() for x in X_test.tolist()]

In [15]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', LogisticRegression())])
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1)]
}

In [16]:
grid_search_tune = GridSearchCV(
    pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(train_X, y_train)

Fitting 2 folds for each of 3 candidates, totalling 6 fits


  y = column_or_1d(y, warn=True)


In [17]:
print("Best parameters set:")
print(grid_search_tune.best_params_)

Best parameters set:
{'tfidf__max_df': 0.25, 'tfidf__ngram_range': (1, 1)}


In [18]:
from sklearn import metrics

best_clf = grid_search_tune.best_estimator_
predictions = best_clf.predict(test_X)

print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       958
        spam       0.99      0.77      0.87       157

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



## Test Predictions

In [19]:
best_clf.predict(["Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"])

array(['spam'], dtype=object)

In [20]:
best_clf.predict(["Reply to claim your reward of $20,000"])

array(['spam'], dtype=object)

In [21]:
best_clf.predict(["This is to inform you that you have a meeting scheduled today at 4:00 pm"])

array(['ham'], dtype=object)

In [22]:
best_clf.predict(["Congratulations on successfully completing an online course on Machine Learning Application offered by Great Learning Academy."])

array(['ham'], dtype=object)