Dataset https://www.kaggle.com/uciml/sms-spam-collection-dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the SMS spam collection from the local CSV file, decoding it as
# ISO-8859-1, and preview the first three rows.
df = pd.read_csv(
    'spam.csv',
    encoding='ISO-8859-1',
)
df.head(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [3]:
# Derive two working columns: `sms` holds the message text taken from `v2`,
# and `spam` is a 0/1 flag that is 1 exactly where `v1` equals 'spam'.
df['sms'] = df['v2']
df['spam'] = df['v1'].eq('spam').astype(int)
df.head(n=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4,sms,spam
0,ham,"Go until jurong point, crazy.. Available only ...",,,,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,,,,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,Free entry in 2 a wkly comp to win FA Cup fina...,1


In [4]:
# Keep only the message text and the 0/1 spam flag, in that order.
df = df.loc[:, ['sms', 'spam']]
df.head()

Unnamed: 0,sms,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [25]:
# Total number of SMS messages (rows) in the dataset.
df.shape[0]

5572

In [26]:
# Split the messages by label: spam (flag == 1) and ham, i.e. not spam
# (flag == 0), then print the size of each subset on its own line.
spam_df = df[df['spam'].eq(1)]
ham_df = df[df['spam'].eq(0)]
print(len(spam_df), len(ham_df), sep='\n')


747
4825


In [27]:
# Fit sklearn's TfidfVectorizer on the spam messages only, to surface
# candidate words from which one is chosen for the formula. English stop
# words are dropped and the vocabulary is capped at 30 terms; `vocabulary_`
# maps each kept term to its column index.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_spam = TfidfVectorizer(
    stop_words='english',
    max_features=30,
).fit(spam_df['sms'])
vectorizer_spam.vocabulary_

{'150p': 0,
 '16': 1,
 '18': 2,
 '50': 3,
 'cash': 4,
 'claim': 5,
 'com': 6,
 'contact': 7,
 'free': 8,
 'guaranteed': 9,
 'just': 10,
 'mobile': 11,
 'msg': 12,
 'new': 13,
 'nokia': 14,
 'prize': 15,
 'reply': 16,
 'send': 17,
 'service': 18,
 'stop': 19,
 'text': 20,
 'tone': 21,
 'txt': 22,
 'uk': 23,
 'ur': 24,
 'urgent': 25,
 'week': 26,
 'win': 27,
 'won': 28,
 'www': 29}

In [69]:
# Pick one candidate word for the formula. 'www' is the word in use here;
# 'win' and 'cash' were the other candidates tried.
#
# This cell estimates P(w|s): the fraction of spam messages containing `word`.
# NOTE: the match is a case-sensitive substring test, so it also counts
# messages where `word` appears inside a longer token, and it misses
# occurrences written with different capitalisation.

word = 'www'
spam_count = len(spam_df)
# Vectorized literal (non-regex) substring test instead of an iterrows() loop.
# Missing texts count as "does not contain the word". int() keeps the count a
# plain Python int so the derived probabilities stay plain Python floats.
spam_with_word_count = int(
    spam_df['sms'].str.contains(word, regex=False, na=False).sum()
)
probability_of_word_given_spam = spam_with_word_count / spam_count
print(probability_of_word_given_spam)
print(spam_count)
print(spam_with_word_count)

0.1285140562248996
747
96


In [70]:
# P(s): prior probability of spam, i.e. the share of all messages that are
# in the spam subset.

probability_of_spam = spam_df.shape[0] / df.shape[0]
print(probability_of_spam)

0.13406317300789664


In [71]:
# P(w): fraction of all messages (spam and ham) that contain `word`.
# The match is a case-sensitive literal substring test, equivalent to
# Python's `in` operator for string values.

sms_count = len(df)
# Vectorized count instead of an iterrows() loop. Missing texts count as
# "does not contain the word". int() keeps the count a plain Python int.
word_in_sms_count = int(
    df['sms'].str.contains(word, regex=False, na=False).sum()
)
probability_of_word = word_in_sms_count / sms_count
print(probability_of_word)

0.01776740847092606


In [72]:
# Result, by Bayes' theorem: P(s|w) = P(w|s) * P(s) / P(w)

probability_of_spam_given_word = (
    probability_of_word_given_spam * probability_of_spam / probability_of_word
)
probability_of_spam_given_word

0.9696969696969696

Lo que esto nos dice es que si un SMS contiene la cadena "www", hay aproximadamente un 97% de probabilidad de que el mensaje sea spam.