In [1]:
import pandas as pd

In [2]:
from google.colab import files

In [3]:
uploaded = files.upload()

Saving archive (7).zip to archive (7).zip


In [5]:
!unzip /content/archive.zip -d /content/extracted

Archive:  /content/archive.zip
  inflating: /content/extracted/combined_data.csv  


In [6]:
df = pd.read_csv('extracted/combined_data.csv')

df

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...
...,...,...
83443,0,hi given a date how do i get the last date of ...
83444,1,now you can order software on cd or download i...
83445,1,dear valued member canadianpharmacy provides a...
83446,0,subscribe change profile contact us long term ...


In [7]:
import nltk, re, string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [8]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [9]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [11]:
def preprocess_nltk(text, reg=r'[^a-zA-Z\s]'):
    text = re.sub(reg, '', text.lower())
    tokens = nltk.word_tokenize(text)
    return [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

In [12]:
def preprocess_nltk_without_lem(text, reg=r'[^a-zA-Z\s]'):
    text = re.sub(reg, '', text.lower())
    tokens = nltk.word_tokenize(text)
    return [w for w in tokens if w not in stop_words]

Пример: "Buy now!!!" → ['buy', 'now'].

In [13]:
df['nltk_tokens'] = df['text'].apply(preprocess_nltk)

In [14]:
df['tokens_punk'] = df['text'].apply(lambda x: preprocess_nltk(x, reg=r'[^a-zA-Z0-9\s\.,;:!?()""\'\-\–—]'))

In [15]:
df['tokens_without_lem'] = df['text'].apply(lambda x: preprocess_nltk_without_lem(x, reg=r'[^a-zA-Z0-9\s\.,;:!?()""\'\-\–—]'))

In [16]:
df

Unnamed: 0,label,text,nltk_tokens,tokens_punk,tokens_without_lem
0,1,ounce feather bowl hummingbird opec moment ala...,"[ounce, feather, bowl, hummingbird, opec, mome...","[ounce, feather, bowl, hummingbird, opec, mome...","[ounce, feather, bowl, hummingbird, opec, mome..."
1,1,wulvob get your medircations online qnb ikud v...,"[wulvob, get, medircations, online, qnb, ikud,...","[wulvob, get, medircations, online, qnb, ikud,...","[wulvob, get, medircations, online, qnb, ikud,..."
2,0,computer connection from cnn com wednesday es...,"[computer, connection, cnn, com, wednesday, es...","[computer, connection, cnn, com, wednesday, es...","[computer, connection, cnn, com, wednesday, es..."
3,1,university degree obtain a prosperous future m...,"[university, degree, obtain, prosperous, futur...","[university, degree, obtain, prosperous, futur...","[university, degree, obtain, prosperous, futur..."
4,0,thanks for all your answers guys i know i shou...,"[thanks, answer, guy, know, checked, rsync, ma...","[thanks, answer, guy, know, checked, rsync, ma...","[thanks, answers, guys, know, checked, rsync, ..."
...,...,...,...,...,...
83443,0,hi given a date how do i get the last date of ...,"[hi, given, date, get, last, date, month, data...","[hi, given, date, get, last, date, month, data...","[hi, given, date, get, last, date, month, data..."
83444,1,now you can order software on cd or download i...,"[order, software, cd, download, site, immediat...","[order, software, cd, download, site, immediat...","[order, software, cd, download, site, immediat..."
83445,1,dear valued member canadianpharmacy provides a...,"[dear, valued, member, canadianpharmacy, provi...","[dear, valued, member, canadianpharmacy, provi...","[dear, valued, member, canadianpharmacy, provi..."
83446,0,subscribe change profile contact us long term ...,"[subscribe, change, profile, contact, u, long,...","[subscribe, change, profile, contact, u, long,...","[subscribe, change, profile, contact, us, long..."


In [19]:
!pip install gensim

Collecting gensim
  Using cached gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Using cached gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
Installing collected packages: gensim
Successfully installed gensim-4.4.0


In [20]:
from gensim.models import Word2Vec
import numpy as np

In [21]:
model = Word2Vec(df['nltk_tokens'].values, vector_size=100, window=5, min_count=1, workers=4)
def get_sentence_vector(text_tokens, model):
    vectors = [model.wv[w] for w in text_tokens if w in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

In [22]:
X = np.array([get_sentence_vector(t, model) for t in df['nltk_tokens']])
y = df['label']

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [25]:
clf = LogisticRegression()

In [26]:
clf.fit(X_train, y_train)

In [27]:
y_preds = clf.predict(X_test)

In [28]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [29]:
acc = accuracy_score(y_test, y_preds)
f1 = f1_score(y_test, y_preds)
prec = precision_score(y_test, y_preds)
recall = recall_score(y_test, y_preds)

print(f'acc = {acc}\nf1 = {f1}\nprecision_score = {prec}\nrecall_score = {recall}')

acc = 0.9641701617735171
f1 = 0.9659220423979941
precision_score = 0.9615384615384616
recall_score = 0.9703457751316693


In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42)

forest.fit(X_train, y_train)

preds_forest = forest.predict(X_test)

In [32]:
acc = accuracy_score(y_test, preds_forest)
f1 = f1_score(y_test, preds_forest)
prec = precision_score(y_test, preds_forest)
recall = recall_score(y_test, preds_forest)

print(f'acc = {acc}\nf1 = {f1}\nprecision_score = {prec}\nrecall_score = {recall}')

acc = 0.97309766327142
f1 = 0.9745825077837532
precision_score = 0.9638338371962826
recall_score = 0.9855736203343256


In [33]:
X_p = np.array([get_sentence_vector(t, model) for t in df['tokens_punk']])

X_train_p, X_test_p, y_train, y_test = train_test_split(X_p, y, test_size=0.2)

In [34]:
clf_p = LogisticRegression()

clf_p.fit(X_train_p, y_train)

y_predic_p = clf_p.predict(X_test_p)

In [35]:
acc = accuracy_score(y_test, y_predic_p)
f1 = f1_score(y_test, y_predic_p)
prec = precision_score(y_test, y_predic_p)
recall = recall_score(y_test, y_predic_p)

print(f'acc = {acc}\nf1 = {f1}\nprecision_score = {prec}\nrecall_score = {recall}')

acc = 0.9665068903535051
f1 = 0.9683196372910173
precision_score = 0.9655289330922242
recall_score = 0.9711265204046834


In [36]:
X_wl = np.array([get_sentence_vector(t, model) for t in df['tokens_without_lem']])

X_train_wl, X_test_wl, y_train, y_test = train_test_split(X_p, y, test_size=0.2)

clf_wl = LogisticRegression()

clf_wl.fit(X_train_wl, y_train)

y_preds_wl = clf.predict(X_test_wl)

In [37]:
acc = accuracy_score(y_test, y_preds_wl)
f1 = f1_score(y_test, y_preds_wl)
prec = precision_score(y_test, y_preds_wl)
recall = recall_score(y_test, y_preds_wl)

print(f'acc = {acc}\nf1 = {f1}\nprecision_score = {prec}\nrecall_score = {recall}')

acc = 0.9662073097663272
f1 = 0.9682217714672076
precision_score = 0.9638770473412609
recall_score = 0.9726058410685986


In [38]:
from nltk.stem import PorterStemmer

In [39]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

stemmer = PorterStemmer()

def preprocess_spacy(text):
    doc = nlp(re.sub(r'[^a-zA-Z\s]', '', text.lower()))
    return [stemmer.stem(token.text) for token in doc
            if not token.is_stop and not token.is_punct and token.text]

In [40]:
df['spacy_tokens'] = df['text'].apply(preprocess_spacy)

In [41]:
df

Unnamed: 0,label,text,nltk_tokens,tokens_punk,tokens_without_lem,spacy_tokens
0,1,ounce feather bowl hummingbird opec moment ala...,"[ounce, feather, bowl, hummingbird, opec, mome...","[ounce, feather, bowl, hummingbird, opec, mome...","[ounce, feather, bowl, hummingbird, opec, mome...","[ounc, feather, bowl, hummingbird, opec, momen..."
1,1,wulvob get your medircations online qnb ikud v...,"[wulvob, get, medircations, online, qnb, ikud,...","[wulvob, get, medircations, online, qnb, ikud,...","[wulvob, get, medircations, online, qnb, ikud,...","[wulvob, medirc, onlin, qnb, ikud, viagra, esc..."
2,0,computer connection from cnn com wednesday es...,"[computer, connection, cnn, com, wednesday, es...","[computer, connection, cnn, com, wednesday, es...","[computer, connection, cnn, com, wednesday, es...","[ , comput, connect, cnn, com, wednesday, esca..."
3,1,university degree obtain a prosperous future m...,"[university, degree, obtain, prosperous, futur...","[university, degree, obtain, prosperous, futur...","[university, degree, obtain, prosperous, futur...","[univers, degre, obtain, prosper, futur, money..."
4,0,thanks for all your answers guys i know i shou...,"[thanks, answer, guy, know, checked, rsync, ma...","[thanks, answer, guy, know, checked, rsync, ma...","[thanks, answers, guys, know, checked, rsync, ...","[thank, answer, guy, know, check, rsync, manua..."
...,...,...,...,...,...,...
83443,0,hi given a date how do i get the last date of ...,"[hi, given, date, get, last, date, month, data...","[hi, given, date, get, last, date, month, data...","[hi, given, date, get, last, date, month, data...","[hi, given, date, date, month, data, form, yyy..."
83444,1,now you can order software on cd or download i...,"[order, software, cd, download, site, immediat...","[order, software, cd, download, site, immediat...","[order, software, cd, download, site, immediat...","[order, softwar, cd, download, site, immedi, \..."
83445,1,dear valued member canadianpharmacy provides a...,"[dear, valued, member, canadianpharmacy, provi...","[dear, valued, member, canadianpharmacy, provi...","[dear, valued, member, canadianpharmacy, provi...","[dear, valu, member, canadianpharmaci, provid,..."
83446,0,subscribe change profile contact us long term ...,"[subscribe, change, profile, contact, u, long,...","[subscribe, change, profile, contact, u, long,...","[subscribe, change, profile, contact, us, long...","[subscrib, chang, profil, contact, long, term,..."


In [43]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [44]:
nlp_md = spacy.load("en_core_web_md")

def get_token_vectors(text):
    doc = nlp(text)
    vectors = [token.vector for token in doc if token.has_vector]
    return np.mean(vectors, axis=0) if vectors else np.zeros(300)

df['mean_token_vector'] = df['text'].apply(get_token_vectors)
X_spacy = np.stack(df['mean_token_vector'].values)

KeyboardInterrupt: 

In [None]:
X_spacy[0]

https://github.com/Leta1603/ML04/blob/main/Homework%2030/Homework.ipynb - nlp theory 1
https://github.com/MariaEgorenko/courses_ml/blob/l30/lesson30/tools.ipynb - nlp theory 2