In [173]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [174]:
# Load the raw CSV into a DataFrame and list the column labels it came with.
data = pd.read_csv(filepath_or_buffer='spam.csv')
print(data.columns)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [175]:
# Preview the first four rows, then the column labels, each starting on its own line.
print(data.head(4), data.columns, sep='\n')

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [176]:
# Summarize the frame: index range, per-column non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [177]:
# Count missing values in each column (isna is the canonical spelling of isnull).
data.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [178]:
# Count rows that repeat an earlier row; all columns are compared because no subset is given.
data.duplicated().sum()

403

In [179]:
# Drop repeated rows, retaining the first occurrence of each (the default policy),
# then show the resulting (rows, columns) size.
df = data.drop_duplicates()
df.shape

(5169, 5)

In [180]:
# Keep only the first two columns by position and preview the result.
df = df.iloc[:, :2]
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [181]:
# Give the two retained columns descriptive names: v1 -> etiqueta, v2 -> texto.
new_col_name = {'v1': 'etiqueta', 'v2': 'texto'}

# Rebind `df` to the frame returned by rename() instead of using inplace=True:
# `df` was obtained by slicing another frame, and mutating it in place makes the
# cell's effect depend on hidden object state rather than on its visible inputs.
df = df.rename(columns=new_col_name)


In [182]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['texto'],
    df['etiqueta'],
    test_size=0.2,
    random_state=42,
)

# Build the vocabulary from the training texts only, then encode the test
# texts with that same vocabulary.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [183]:
# Fit a multinomial Naive Bayes classifier on the training token counts.
clf = MultinomialNB(alpha=0.1)
clf.fit(X_train_vectorized, y_train)

# Predict labels for the held-out texts and score them against the true labels.
predictions = clf.predict(X_test_vectorized)

accuracy = accuracy_score(y_test, predictions)
print(f'Precisión del modelo: {accuracy}')
# sep='' keeps print() from inserting a space right after the newline; that
# space used to push the report's header row one column out of alignment
# with the rows beneath it.
print('Informe de clasificación:\n', classification_report(y_test, predictions), sep='')

# clf is constructed as MultinomialNB a few lines above, so no type guard is
# needed before reading the per-class feature log-probabilities.
print("Probabilidades logarítmicas de características dada cada clase:")
print(clf.feature_log_prob_)

Precisión del modelo: 0.9825918762088974
Informe de clasificación:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       889
        spam       0.95      0.92      0.94       145

    accuracy                           0.98      1034
   macro avg       0.97      0.96      0.96      1034
weighted avg       0.98      0.98      0.98      1034

Probabilidades logarítmicas de características dada cada clase:
[[-13.07265829 -13.07265829 -10.67476302 ... -10.67476302 -10.67476302
  -13.07265829]
 [ -7.477421    -6.48782745 -11.74010088 ... -11.74010088 -11.74010088
   -9.3422056 ]]


In [193]:
# Two sample messages; only `nuevo_texto` is classified in this cell.
nuevo_texto = ["Special offer only for today. Win a 1000 dollars now!"]
nuevo_texto1 = ["i love u mom"]

# Encode the message with the already-fitted vectorizer and predict its label.
nuevo_texto_vectorizado = vectorizer.transform(nuevo_texto)
prediccion = clf.predict(nuevo_texto_vectorizado)

# Report spam if any predicted label equals 'spam'.
print("El texto es spam." if 'spam' in prediccion else "El texto no es spam.")

El texto es spam.
