In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the labelled CSV and discard every row that contains a missing value.
# The read and the dropna stay in one chain so no reference to the unfiltered
# frame is kept around.
data = (
    pd.read_csv('labelled_nostr_events_20230225000.csv')
    .dropna()
)
data.head()

Unnamed: 0,label,text
0,ham,"gosta mais de ""o anti"" é?\n"
1,ham,Yo! What are y’all working on?
2,ham,We are Damus volunteers. You can choose to fol...
3,ham,I assume it was regulatory issues. This is the...
4,ham,proof of #nostrchain\n\nhttps://nostr.build/i/...


In [3]:
# Encode the label column as integers: 1 for spam, 0 for everything else.
# The column is overwritten in place, so the match accepts both the raw string
# 'spam' and the already-encoded value 1. That keeps the cell idempotent:
# running it a second time leaves the labels unchanged instead of turning
# every row into 0.
is_spam = data['label'].isin(['spam', 1])
data['label'] = np.where(is_spam, 1, 0)

In [4]:
# Preview the first ten rows of the frame.
data.head(n=10)

Unnamed: 0,label,text
0,0,"gosta mais de ""o anti"" é?\n"
1,0,Yo! What are y’all working on?
2,0,We are Damus volunteers. You can choose to fol...
3,0,I assume it was regulatory issues. This is the...
4,0,proof of #nostrchain\n\nhttps://nostr.build/i/...
5,0,ItsATrap.gif
6,0,Sunset\n\nhttps://nostr.build/i/3386.jpeg
7,0,"Right now you can still get 471,974 sats for $..."
8,0,Invoice here...\n\nlnbc10u1p3m6m5npp5n5n0ywzw8...
9,0,Followed. #Plebchain


In [5]:
# Split texts and labels into train and test partitions. No split size is
# passed, so train_test_split uses its default; the fixed random_state makes
# the partition repeatable from run to run.
texts, labels = data['text'], data['label']
X_train, X_test, Y_train, Y_test = train_test_split(texts, labels, random_state=0)

In [6]:
# extract features
# Bag-of-words counts over unigrams and bigrams. The vocabulary is learned
# from the training texts only, so nothing from the test split leaks in.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_vectorized = vectorizer.fit_transform(X_train)
# Read the shape straight from the sparse matrix. Calling .toarray() first
# would materialise a dense (n_documents x n_features) array just to inspect
# its dimensions, which can exhaust memory at this vocabulary size.
X_train_vectorized.shape

(69485, 387957)

In [7]:
# Fit a multinomial naive Bayes classifier on the n-gram count matrix.
SMOOTHING_ALPHA = 0.1  # value passed as MultinomialNB's `alpha` parameter
model = MultinomialNB(alpha=SMOOTHING_ALPHA)
model.fit(X=X_train_vectorized, y=Y_train)

In [8]:
# Score the held-out texts, reusing the vectorizer fitted on the training
# split so the feature columns line up with what the model was trained on.
X_test_vectorized = vectorizer.transform(X_test)
predictions = model.predict(X_test_vectorized)
# Count matches with the vectorised Series.sum() rather than the built-in
# sum(), which would iterate the Series element by element in Python.
n_correct = (predictions == Y_test).sum()
print("Accuracy:", 100 * n_correct / len(predictions), '%')

Accuracy: 98.2643985838874 %


In [9]:
# Export model
# The classifier and its fitted vectorizer are pickled together as one tuple,
# so a consumer always gets the vocabulary that matches the model.
# NOTE: open() does not create missing directories; 'models/' must exist.
filename = 'models/MultinomialNB.sav'
# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of leaving the handle to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump((model, vectorizer), model_file)

In [10]:
# Reload the exported (model, vectorizer) pair and score a few sample texts.
# SECURITY: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
with open(filename, 'rb') as model_file:
    # Unpack into distinct names so the in-memory `vectorizer` fitted earlier
    # is not silently rebound; the check below uses only the persisted objects.
    loaded_model, loaded_vectorizer = pickle.load(model_file)

sample_texts = [
    "如果无法私信 可以联系 https://t.me/Jpaibot",
    "582898.com 无码高清全站免费的极品黄站",
    "Hello",
    "PV",
    "What is spam?"
]
# One row per sample text; columns follow loaded_model.classes_ (labels 0 and 1
# after the integer encoding applied earlier in the notebook).
loaded_model.predict_proba(loaded_vectorizer.transform(sample_texts))

array([[4.39378688e-22, 1.00000000e+00],
       [1.17017618e-12, 1.00000000e+00],
       [9.82697167e-01, 1.73028333e-02],
       [9.76251055e-01, 2.37489450e-02],
       [9.99999999e-01, 1.29857296e-09]])