In [1]:
import pandas as pd
# Load the tab-separated SMS dataset; the file has no header row, so column
# names are supplied explicitly.
# NOTE(review): read_csv's default quoting treats '"' as a quote character. If
# the raw file contains unbalanced quotes, adjacent rows can be merged - confirm
# len(messages) against the file's line count (or pass quoting=csv.QUOTE_NONE).
messages = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'messages'])

# Seed the preview so the displayed rows are the same on every Restart & Run All.
messages.sample(5, random_state=4)

Unnamed: 0,label,messages
129,ham,HEY GIRL. HOW R U? HOPE U R WELL ME AN DEL R B...
985,ham,Yo guess what I just dropped
4825,ham,Not thought bout it... || Drink in tap & spile...
326,ham,No calls..messages..missed calls
2446,ham,The guy (kadeem) hasn't been selling since the...


In [2]:
# data cleaning and processing 
# Dependencies for text preprocessing: regular expressions for character
# filtering, the NLTK stop-word corpus, and the Porter stemmer.
import re 
import nltk 
# Fetch the 'stopwords' corpus so that nltk.corpus.stopwords can be used below.
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dev_4\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [3]:
# Normalise every message into a space-joined string of stemmed, non-stop-word
# tokens; `corpus` ends up with one cleaned string per row of `messages`.
ps = PorterStemmer()

# Build the stop-word lookup once. Evaluating stopwords.words('english') inside
# the comprehension repeats the same work for every token and then scans a
# list; a set built up front gives the same membership answers in O(1).
english_stopwords = set(stopwords.words('english'))

corpus = []
for message in messages['messages']:
  # Replace everything that is not an ASCII letter (digits, punctuation,
  # symbols) with a space, then lowercase and split on whitespace.
  words = re.sub('[^a-zA-Z]', ' ', message).lower().split()
  stems = [ps.stem(word) for word in words if word not in english_stopwords]
  corpus.append(' '.join(stems))

corpus[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [4]:
# creating bag of words model 
from sklearn.feature_extraction.text import CountVectorizer
# Vocabulary is capped at 2500 terms via max_features.
cv = CountVectorizer(max_features = 2500)
# fit_transform learns the vocabulary from `corpus` and returns the document-term
# counts; .toarray() turns that result into a dense array, which is what the
# later cells consume as `x`.
x = cv.fit_transform(corpus).toarray()

# Wrapped in a DataFrame purely for display; the result is not assigned.
pd.DataFrame(x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Binary target vector: 1 where the label is 'spam', 0 otherwise.
# The indicator column is selected by name instead of by position, and the
# dtype is pinned so the result does not depend on the pandas default
# (get_dummies returns bool columns in pandas >= 2.0).
y = pd.get_dummies(messages['label'], dtype='uint8')['spam'].values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [6]:
# train and test split 
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=4,
)

In [7]:
!pip install lazypredict

Collecting numpy==1.19.1
  Using cached numpy-1.19.1-cp38-cp38-win_amd64.whl (13.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.5
    Uninstalling numpy-1.19.5:
      Successfully uninstalled numpy-1.19.5
  Rolling back uninstall of numpy
  Moving to c:\users\dev_4\appdata\roaming\python\python38\scripts\f2py.exe
   from C:\Users\dev_4\AppData\Local\Temp\pip-uninstall-o20vxmo9\f2py.exe
  Moving to c:\users\dev_4\appdata\roaming\python\python38\site-packages\numpy-1.19.5.dist-info\
   from C:\Users\dev_4\AppData\Roaming\Python\Python38\site-packages\~umpy-1.19.5.dist-info


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\ProgramData\\Anaconda3\\Lib\\site-packages\\numpy\\LICENSE.txt'
Consider using the `--user` option or check the permissions.



  Moving to c:\users\dev_4\appdata\roaming\python\python38\site-packages\numpy\
   from C:\Users\dev_4\AppData\Roaming\Python\Python38\site-packages\~umpy


In [8]:
from lazypredict.Supervised import LazyClassifier
# Fit lazypredict's battery of classifiers on the training split and score each
# one on the held-out split. `models` is the per-model metrics table;
# `predictions` is returned as well because predictions=True is set.
classifier = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    predictions=True,
)
models, predictions = classifier.fit(x_train, x_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [05:55<00:00, 12.27s/it]


In [10]:
# Leave the metrics table as the cell's last expression so Jupyter renders it
# as a rich table instead of print()'s line-wrapped plain text.
models

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
PassiveAggressiveClassifier        0.96               0.95     0.95      0.96   
LinearSVC                          0.95               0.95     0.95      0.95   
LGBMClassifier                     0.98               0.95     0.95      0.98   
RandomForestClassifier             0.98               0.94     0.94      0.98   
Perceptron                         0.96               0.94     0.94      0.96   
LogisticRegression                 0.98               0.94     0.94      0.98   
XGBClassifier                      0.98               0.94     0.94      0.98   
BernoulliNB                        0.98               0.94     0.94      0.98   
ExtraTreesClassifier               0.98               0.94     0.94      0.98   
DecisionTreeClassifier             0.97               0.93     0.93      0.97   
AdaBoostClassifier          