In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('SPAM text message 20170820 - Data.csv')

In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
for i in range(0, len(df.Message)):
    regex = re.sub(r'[^a-zA-Z]', ' ', df.Message[i])
    regex = regex.lower()
    tokens = regex.split()
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in set(stopwords.words('english'))]
    df.Message[i] = ' '.join(clean_tokens)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df.Message[i] = ' '.join(clean_tokens)


In [8]:
df

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though
...,...,...
5567,spam,nd time tried contact u u pound prize claim ea...
5568,ham,b going esplanade fr home
5569,ham,pity mood suggestion
5570,ham,guy bitching acted like interested buying some...


In [9]:
X = df.Message
y = df.Category
y = y.map({'ham':0, 'spam':1})

In [10]:
# creating bag of words
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# For binary bow set binary to True
bow = CountVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)

In [12]:
X = bow.fit_transform(X).toarray()

In [13]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
bow.vocabulary_

{'go': 1614,
 'point': 3222,
 'crazy': 835,
 'available': 236,
 'bugis': 449,
 'great': 1717,
 'world': 4882,
 'la': 2209,
 'cine': 653,
 'got': 1689,
 'wat': 4696,
 'ok': 2968,
 'lar': 2226,
 'joking': 2115,
 'wif': 4784,
 'oni': 2999,
 'ok lar': 2972,
 'free': 1415,
 'entry': 1211,
 'wkly': 4850,
 'comp': 750,
 'win': 4794,
 'fa': 1275,
 'cup': 851,
 'final': 1341,
 'tkts': 4322,
 'st': 3958,
 'may': 2554,
 'text': 4184,
 'receive': 3446,
 'question': 3365,
 'std': 3983,
 'txt': 4433,
 'rate': 3396,
 'apply': 159,
 'free entry': 1424,
 'entry wkly': 1214,
 'wkly comp': 4851,
 'cup final': 852,
 'text fa': 4191,
 'std txt': 3984,
 'txt rate': 4445,
 'rate apply': 3398,
 'dun': 1140,
 'say': 3614,
 'early': 1154,
 'already': 96,
 'nah': 2804,
 'think': 4242,
 'usf': 4573,
 'life': 2320,
 'around': 181,
 'though': 4263,
 'think go': 4248,
 'freemsg': 1445,
 'hey': 1858,
 'darling': 894,
 'week': 4730,
 'word': 4869,
 'back': 265,
 'like': 2332,
 'fun': 1482,
 'still': 3986,
 'tb': 4128,

In [15]:
bow.transform(['good go great got happy morning come said give sorry ask get said later year find new tell anything yeah work let anything thanks you good morning see please thanks anything']).toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
y.shape

(5572,)

In [17]:
X.shape

(5572, 5000)

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
model = MultinomialNB().fit(X_train, y_train)

In [22]:
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

In [23]:
print('Training data Validation')
print(f'Accuracy: {accuracy_score(y_train, y_pred_train)}')
print(f'Confusion Matrix: {confusion_matrix(y_train,y_pred_train)}')

Training data Validation
Accuracy: 0.9899497487437185
Confusion Matrix: [[3603   15]
 [  27  534]]


In [24]:
print('Testing data Validation')
print(f'Accuracy: {accuracy_score(y_test, y_pred_test)}')
print(f'Confusion Matrix: {confusion_matrix(y_test,y_pred_test)}')

Testing data Validation
Accuracy: 0.9827709978463748
Confusion Matrix: [[1196   11]
 [  13  173]]


In [25]:
# text = 'congratulations you have become a winner rupees lottery win won get got scam request urgent urgent requesting'
# text = 'today is meeting come hurry up'
text = 'congrats you have become a winner'
vect = bow.transform([text]).toarray()
model.predict(vect)

array([1], dtype=int64)

In [26]:
import pickle

In [27]:
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

with open('bow.pkl', 'wb') as file:
    pickle.dump(bow, file)

# Using TF-IDF

In [28]:
import pandas as pd

In [29]:
df = pd.read_csv('SPAM text message 20170820 - Data.csv')

In [30]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [33]:
lemmatizer = WordNetLemmatizer()

In [34]:
for i in range(0, len(df.Message)):
    regex = re.sub(r'[^a-zA-Z]', ' ', df.Message[i])
    regex = regex.lower()
    tokens = regex.split()
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in set(stopwords.words('english'))]
    df.Message[i] = ' '.join(clean_tokens)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df.Message[i] = ' '.join(clean_tokens)


In [32]:
df.head()

Unnamed: 0,Category,Message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


In [42]:
X = df.Message
y = df.Category
y = y.map({'ham':0, 'spam':1})

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2)
)

In [45]:
X = tfidf.fit_transform(X).toarray()

In [46]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [47]:
tfidf.vocabulary_

{'go': 1554,
 'point': 3199,
 'crazy': 809,
 'available': 211,
 'bugis': 424,
 'great': 1708,
 'world': 4887,
 'la': 2225,
 'cine': 628,
 'got': 1679,
 'wat': 4716,
 'ok': 2954,
 'lar': 2243,
 'joking': 2150,
 'wif': 4803,
 'oni': 2985,
 'ok lar': 2958,
 'free': 1386,
 'entry': 1155,
 'wkly': 4858,
 'comp': 725,
 'win': 4813,
 'fa': 1232,
 'cup': 825,
 'final': 1312,
 'tkts': 4342,
 'st': 3981,
 'may': 2568,
 'text': 4209,
 'receive': 3429,
 'question': 3336,
 'std': 4012,
 'txt': 4453,
 'rate': 3375,
 'apply': 148,
 'free entry': 1395,
 'entry wkly': 1158,
 'wkly comp': 4859,
 'cup final': 826,
 'std txt': 4014,
 'txt rate': 4465,
 'rate apply': 3377,
 'dun': 1084,
 'say': 3612,
 'early': 1098,
 'already': 86,
 'nah': 2799,
 'think': 4276,
 'usf': 4593,
 'life': 2336,
 'around': 170,
 'though': 4292,
 'think go': 4279,
 'freemsg': 1416,
 'hey': 1872,
 'darling': 866,
 'week': 4750,
 'word': 4874,
 'back': 240,
 'like': 2348,
 'fun': 1453,
 'still': 4018,
 'tb': 4167,
 'xxx': 4943,
 's

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [50]:
tfmodel = MultinomialNB().fit(X_train, y_train)

In [51]:
y_pred_train = tfmodel.predict(X_train)
y_pred_test = tfmodel.predict(X_test)

In [52]:
print('Training data Validation')
print(f'Accuracy: {accuracy_score(y_train, y_pred_train)}')
print(f'Confusion Matrix: {confusion_matrix(y_train,y_pred_train)}')

Training data Validation
Accuracy: 0.9803780808805934
Confusion Matrix: [[3618    0]
 [  82  479]]


In [53]:
print('Testing data Validation')
print(f'Accuracy: {accuracy_score(y_test, y_pred_test)}')
print(f'Confusion Matrix: {confusion_matrix(y_test,y_pred_test)}')

Testing data Validation
Accuracy: 0.9763101220387652
Confusion Matrix: [[1207    0]
 [  33  153]]
