In [298]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import warnings
import re

warnings.filterwarnings('ignore')

In [299]:
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [300]:
df

Unnamed: 0,text,lable,Unnamed: 2,Unnamed: 3
0,"Go until jurong point, crazy.. Available only ...",ham,,
1,Ok lar... Joking wif u oni...,ham,,
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,,
3,U dun say so early hor... U c already then say...,ham,,
4,"Nah I don't think he goes to usf, he lives aro...",ham,,
...,...,...,...,...
5567,This is the 2nd time we have tried 2 contact u...,spam,,
5568,Will Ì_ b going to esplanade fr home?,ham,,
5569,"Pity, * was in mood for that. So...any other s...",ham,,
5570,The guy did some bitching but I acted like i'd...,ham,,


In [301]:
df=df.drop(columns=['Unnamed: 2','Unnamed: 3'])

In [302]:
lables=df['lable']
texts=df['text']

In [303]:
texts

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: text, Length: 5572, dtype: object

In [304]:
lables

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: lable, Length: 5572, dtype: object

In [305]:
len(texts)

5572

In [306]:
def tokenize(sentences):
    sentences=sentences.lower()
    sentences=re.sub(r'[^\w\s]', '', sentences)
    return sentences.split()

In [307]:
tokenized_sentences=[tokenize(t) for t in texts]
tokenized_sentences

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'in',
  'bugis',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'u', 'oni'],
 ['free',
  'entry',
  'in',
  '2',
  'a',
  'wkly',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  'to',
  '87121',
  'to',
  'receive',
  'entry',
  'questionstd',
  'txt',
  'ratetcs',
  'apply',
  '08452810075over18s'],
 ['u', 'dun', 'say', 'so', 'early', 'hor', 'u', 'c', 'already', 'then', 'say'],
 ['nah',
  'i',
  'dont',
  'think',
  'he',
  'goes',
  'to',
  'usf',
  'he',
  'lives',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darling',
  'its',
  'been',
  '3',
  'weeks',
  'now',
  'and',
  'no',
  'word',
  'back',
  'id',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',

In [308]:
def build_vocabulary(tokenized_sentences):
    vocabulary = set()
    for sentence in tokenized_sentences:
        vocabulary.update(sentence)
    return list(vocabulary)


In [309]:
vocabulary = build_vocabulary(tokenized_sentences)
vocabulary[:10]

['role',
 'jb',
 '415',
 'andor',
 'rough',
 'cardiff',
 'itsnot',
 'collection',
 'needy',
 'headache']

In [310]:
len(vocabulary)

9564

In [311]:
def create_bow_vector(sentence, vocabulary): 
    vec = [0]*len(vocabulary)
    for word in sentence:
        if word in vocabulary:
            idx = vocabulary.index(word)
            vec[idx] = 1
    return vec

In [312]:
bow_vectors = [create_bow_vector(sentence, vocabulary) for sentence in tokenized_sentences]
len(bow_vectors)

5572

In [313]:
encoded_vectors = np.array(bow_vectors)
encoded_vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [314]:
cols = [i for i in vocabulary]

In [315]:
df[cols] = encoded_vectors

KeyboardInterrupt: 

In [294]:
df['text']=texts
df

Unnamed: 0,text,lable,role,jb,415,andor,rough,cardiff,itsnot,collection,...,s89,pocked,08714714011,endowed,get,meive,otbox,athletic,530,hiphop
0,"Go until jurong point, crazy.. Available only ...",ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Ok lar... Joking wif u oni...,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,U dun say so early hor... U c already then say...,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Nah I don't think he goes to usf, he lives aro...",ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,This is the 2nd time we have tried 2 contact u...,spam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,Will Ì_ b going to esplanade fr home?,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,"Pity, * was in mood for that. So...any other s...",ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,The guy did some bitching but I acted like i'd...,ham,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [295]:
model=LogisticRegression()

model.fit(encoded_vectors,df['lable'])

model.score(encoded_vectors,df['lable'])

0.9969490308686288

In [296]:
sen = "WINNER!! As a valued network customer you have been selected to receivea �900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."
tokenize(sen)

test_sen = create_bow_vector(sen, vocabulary)
test_sen = np.array(test_sen).reshape(1,-1)
test_sen

model.predict(test_sen)

array(['ham'], dtype=object)

In [297]:
from sklearn.feature_extraction.text import CountVectorizer



vectorizer = CountVectorizer()

bow_matrix = vectorizer.fit_transform(texts)

bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

bow_df['label'] = lables

bow_df

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,û_,û_thanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spam
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spam
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
