In [24]:
import pandas as pd

# Read the tab-separated SMS file. Column names are passed explicitly, so
# pandas treats the first line of the file as data rather than as a header.
messages = pd.read_csv("SMSSpamCollection", sep="\t", names=["label", "message"])

# Preview the first rows, then the number of messages per label.
print(messages.head(), messages['label'].value_counts(), sep="\n")


  label                                            message
0   ham                   Hey, are we still meeting today?
1  spam  Congratulations! You won a free iPhone! Click ...
2   ham         Don't forget to bring the notes for class.
3  spam          WINNER! Claim your $1000 gift card today!
4   ham                        I'll call you in 5 minutes.
label
ham     54
spam    54
Name: count, dtype: int64


In [30]:
## Data cleaning and Preprocessing
import re
import nltk
# Fetch every NLTK resource the preprocessing cells need, so the notebook
# runs on a fresh machine:
#   - 'stopwords': the English stop-word list used to filter tokens
#   - 'wordnet':   the corpus WordNetLemmatizer reads when lemmatizing;
#                  without it the first lemmatize() call raises LookupError
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pandeyraj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# English stop words held in a set so the per-token membership test is cheap.
stop_words = {*stopwords.words('english')}

# One lemmatizer instance, reused for every token in the corpus.
wordlemmatizer = WordNetLemmatizer()

In [51]:
# Build the cleaned corpus: one space-joined string of lemmatized,
# non-stop-word tokens per SMS message, in the same order as `messages`.
corpus = []
# Iterate over the column values directly instead of messages['message'][i]:
# that lookup is label-based and fails once the index is not 0..n-1
# (for example after filtering or dropping rows).
for raw_message in messages['message']:
    # Replace every non-letter with a space. The upper-case range must end at
    # 'Z': the former 'A-z' range also covered the ASCII characters that sit
    # between 'Z' and 'a' ([ \ ] ^ _ `), so that punctuation was kept in tokens.
    review = re.sub('[^a-zA-Z]', ' ', raw_message)
    review = review.lower()
    review = review.split()
    # Drop stop words, then reduce each remaining token to its lemma.
    review = [wordlemmatizer.lemmatize(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)
    

In [52]:
# Bare expression: the notebook renders the cleaned corpus list as this cell's output.
corpus

['hey still meeting today',
 'congratulation free iphone click',
 'forget bring note class',
 'winner claim gift card today',
 'call minute',
 'selected free vacation reply yes',
 'pick grocery way home',
 'get cheap med limited time offer',
 'meeting postponed pm',
 'act cash',
 'see game last night',
 'exclusive deal buy',
 'coming party tomorrow',
 'claim free gift voucher hurry',
 'let grab lunch later',
 'account compromised reset password immediately',
 'send homework',
 'lottery call number immediately',
 'thanks help yesterday',
 'free ticket concert reply claim',
 'late meeting',
 'pre approved credit card',
 'meeting today',
 'act fast limited offer luxury watch',
 'call get message',
 'congratulation free cruise',
 'finish report',
 'online sweepstakes',
 'review document',
 'get free sample today',
 'looking forward trip next week',
 'exclusive win brand new car reply',
 'pick pm',
 'lucky winner claim prize',
 'forget submit assignment',
 'act free gift card available',
 '

## Create TF-IDF and N-Grams ##

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [54]:
# Unigram TF-IDF features; max_features=100 caps the vocabulary at 100 terms.
tfidf = TfidfVectorizer(max_features=100)
X = (
    tfidf
    .fit_transform(corpus)  # learn the vocabulary and weight every document
    .toarray()              # convert the result to a dense NumPy array
)

In [None]:
import numpy as np
# Make array reprs easier to scan: show up to 30 items at each edge before
# eliding, allow very long lines so rows are not wrapped, and print floats
# with three significant digits.
np.set_printoptions(
    formatter={"float": lambda value: "%.3g" % value},
    edgeitems=30,
    linewidth=100000,
)

In [55]:
# Bare expression: the notebook renders the dense feature matrix bound to X.
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.51980459, 0.        ,
        0.        ]], shape=(108, 100))

## N-Grams ##

In [56]:
# Bigram-only TF-IDF features: ngram_range=(2, 2) keeps two-word terms only,
# and max_features=100 caps the vocabulary at 100 terms.
# NOTE: this rebinds both `tfidf` and `X`, replacing the values set earlier.
tfidf = TfidfVectorizer(
    ngram_range=(2, 2),
    max_features=100,
)
X = (
    tfidf
    .fit_transform(corpus)  # learn the bigram vocabulary and weight every document
    .toarray()              # convert the result to a dense NumPy array
)

In [57]:
# Show the fitted vocabulary: a mapping from each term to its column index in X.
tfidf.vocabulary_

{'still meeting': np.int64(74),
 'meeting today': np.int64(31),
 'congratulation free': np.int64(7),
 'forget bring': np.int64(10),
 'note class': np.int64(39),
 'winner claim': np.int64(99),
 'gift card': np.int64(16),
 'selected free': np.int64(62),
 'free vacation': np.int64(14),
 'reply yes': np.int64(60),
 'pick grocery': np.int64(49),
 'way home': np.int64(94),
 'med limited': np.int64(25),
 'limited time': np.int64(19),
 'time offer': np.int64(85),
 'meeting postponed': np.int64(29),
 'postponed pm': np.int64(54),
 'exclusive deal': np.int64(8),
 'party tomorrow': np.int64(47),
 'claim free': np.int64(4),
 'free gift': np.int64(12),
 'voucher hurry': np.int64(90),
 'lunch later': np.int64(23),
 'password immediately': np.int64(48),
 'send homework': np.int64(63),
 'lottery call': np.int64(21),
 'number immediately': np.int64(40),
 'thanks help': np.int64(80),
 'ticket concert': np.int64(84),
 'reply claim': np.int64(59),
 'pre approved': np.int64(55),
 'act fast': np.int64(0),
 

In [58]:
# Bare expression: the notebook renders the dense feature matrix currently bound to X.
X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.70710678,
        0.        ]], shape=(108, 100))