In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec

# Fetch the tokenizer data that word_tokenize relies on. Older NLTK releases
# load the 'punkt' package, while NLTK 3.9+ loads 'punkt_tab' instead, so both
# are requested to keep this notebook runnable on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Read spam.csv, drop the three 'Unnamed' columns, rename v1/v2 to
# label/text, and encode the string labels as integers (ham -> 1, spam -> 0).
data = (
    pd.read_csv("spam.csv", encoding='ISO-8859-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={"v1": "label", "v2": "text"})
    .assign(label=lambda frame: frame["label"].map({"ham": 1, "spam": 0}))
)

def tokenize_text(text):
    """Lower-case ``text`` and split it into tokens with ``word_tokenize``.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list
        The tokens ``word_tokenize`` yields for the lower-cased text.
    """
    lowered = text.lower()
    return word_tokenize(lowered)

# Tokenize every message and keep the tokens next to the raw text.
data['tokens'] = data['text'].map(tokenize_text)

# Train Word2Vec embeddings on the tokenized messages
# (100-dimensional vectors, context window of 5, 4 worker threads).
sentences = list(data['tokens'])
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def get_sentence_vector(sentence, model):
    """Average the embeddings of the words in ``sentence`` that ``model`` knows.

    Parameters
    ----------
    sentence : iterable
        Tokens to look up in ``model.wv``.
    model : object
        Must expose ``vector_size`` and a ``wv`` mapping that supports
        ``in`` and ``[]`` lookups.

    Returns
    -------
    numpy.ndarray
        Array of length ``model.vector_size`` holding the mean of the found
        embeddings, or all zeros when no token of ``sentence`` is in ``model.wv``.
    """
    embeddings = model.wv
    # Words missing from the vocabulary are skipped entirely.
    found = [embeddings[token] for token in sentence if token in embeddings]

    total = np.zeros(model.vector_size)
    for embedding in found:
        total += embedding

    # Guard against dividing by zero when nothing was found.
    if found:
        total /= len(found)
    return total

def apply_sentence_vectorization(dataframe, model):
    """Build one sentence vector for every entry of ``dataframe['tokens']``.

    Parameters
    ----------
    dataframe : mapping-like
        Must provide a ``'tokens'`` entry that iterates over token sequences.
    model : object
        Passed unchanged to ``get_sentence_vector``.

    Returns
    -------
    list
        The ``get_sentence_vector`` result for each entry, in iteration order.
    """
    return [
        get_sentence_vector(row_tokens, model)
        for row_tokens in dataframe['tokens']
    ]

# Attach one averaged embedding to every row.
data['sentence_vector'] = apply_sentence_vectorization(data, w2v_model)

# Stack the per-row vectors into a 2-D feature matrix; labels stay a Series.
X = np.vstack(data['sentence_vector'].tolist())
y = data['label']

# Hold out 20% of the rows for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression model on the training split.
classifier = LogisticRegression().fit(X_train, y_train)

# Score it on the held-out rows: overall accuracy first, then the
# per-class precision / recall / F1 table.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.92
              precision    recall  f1-score   support

           0       0.92      0.45      0.60       150
           1       0.92      0.99      0.96       965

    accuracy                           0.92      1115
   macro avg       0.92      0.72      0.78      1115
weighted avg       0.92      0.92      0.91      1115



In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Random forest with 100 trees. random_state is pinned (same seed as the
# train/test split) so re-running this cell rebuilds the same forest and
# reproduces the same score; without it every run trains different trees.
random_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_model.fit(X_train, y_train)

In [4]:
# Predict labels for the held-out rows with the fitted random forest.
rand_pred = random_model.predict(X_test)

In [5]:
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the value is unchanged, but the correct order keeps this call
# consistent with the logistic-regression evaluation and safe to swap for
# asymmetric metrics such as precision or recall.
accuracy_score(y_test, rand_pred)

0.9614349775784753

In [6]:
# Scratch check: the integers 1 through 19 (the step defaults to 1).
np.arange(1, 20)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])

In [7]:
from sklearn.model_selection import GridSearchCV

In [8]:
# Hyper-parameter search space for the random forest.
# Size: 2 * 2 * 9 * 9 * 9 * 1 = 2916 candidate combinations.
param_dist = dict(
    n_estimators=[50, 100],
    criterion=['gini', 'entropy'],
    max_depth=np.arange(10, 100, 10),        # 10, 20, ..., 90
    min_samples_split=np.arange(2, 20, 2),   # 2, 4, ..., 18
    min_samples_leaf=np.arange(1, 10, 1),    # 1, 2, ..., 9
    max_features=['sqrt'],
)

In [9]:
# Fresh, unfitted forest used as the base estimator for the grid search.
# Note that this rebinds random_model, replacing the forest fitted earlier.
# random_state is pinned so every candidate is trained with the same seed and
# the cross-validation scores are repeatable between runs.
random_model = RandomForestClassifier(random_state=42)

In [10]:
# Exhaustive 5-fold cross-validated search over every combination in
# param_dist. n_jobs=-1 spreads the candidate fits across all available CPU
# cores; it changes only how the work is scheduled, not which candidates are
# evaluated.
rand_grid = GridSearchCV(
    estimator=random_model,
    param_grid=param_dist,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split.
rand_grid.fit(X_train, y_train)