#Import

In [2]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [3]:
# بارگذاری داده‌ها
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv"
data = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

# نمایش چند سطر از داده‌ها برای بررسی
print(data.head())

# نمایش اطلاعات کلی از داده‌ها
print(data.info())


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


In [4]:
import numpy as np

# بررسی وجود مقادیر NaN
print(data.isna().sum())

# حذف سطرهایی که برچسب یا پیام آنها NaN است
data.dropna(subset=['label', 'message'], inplace=True)

# بررسی تعداد داده‌ها پس از حذف NaN
print(f"Number of samples after removing NaN values: {len(data)}")


label      0
message    0
dtype: int64
Number of samples after removing NaN values: 5572


In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# بارگذاری داده‌ها
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv"
data = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

# تبدیل برچسب‌ها به اعداد (0 برای ham و 1 برای spam)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# حذف سطرهایی که برچسب یا پیام آنها NaN است
data.dropna(subset=['label', 'message'], inplace=True)

# بررسی تعداد داده‌ها پس از حذف NaN
print(f"Number of samples after removing NaN values: {len(data)}")

# تقسیم داده‌ها به ویژگی‌ها و برچسب‌ها
X = data['message'].values
y = data['label'].values

# تبدیل متن‌ها به توالی‌های اعداد
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

# پدینگ کردن توالی‌ها به طول ثابت
maxlen = 100
X = pad_sequences(X, maxlen=maxlen)

# تقسیم داده‌ها به مجموعه آموزش و تست
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# بررسی وجود NaN در y_test
if np.any(np.isnan(y_test)):
    print("NaN values found in y_test.")
else:
    print("No NaN values found in y_test.")

def create_model(hidden_layers, hidden_size, activation='relu'):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen))
    model.add(LSTM(64, return_sequences=False))  # return_sequences=False برای آخرین لایه LSTM
    for _ in range(hidden_layers):
        model.add(Dense(hidden_size, activation=activation))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# ارزیابی تاثیر تعداد لایه‌های مخفی
hidden_layers_options = [0, 1, 2, 3, 4, 5]
results = []

for hidden_layers in hidden_layers_options:
    model = create_model(hidden_layers, hidden_size=64)
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=0)
    y_pred = (model.predict(X_test) > 0.5).astype("int32")
    accuracy = accuracy_score(y_test, y_pred)
    results.append((hidden_layers, accuracy))
    print(f'Hidden Layers: {hidden_layers}, Accuracy: {accuracy:.4f}')

# نمایش نتایج
for hidden_layers, accuracy in results:
    print(f'Hidden Layers: {hidden_layers}, Accuracy: {accuracy:.4f}')


Number of samples after removing NaN values: 5572
No NaN values found in y_test.
Hidden Layers: 0, Accuracy: 0.9883
Hidden Layers: 1, Accuracy: 0.9892
Hidden Layers: 2, Accuracy: 0.9883
Hidden Layers: 3, Accuracy: 0.9892
Hidden Layers: 4, Accuracy: 0.9892
Hidden Layers: 5, Accuracy: 0.9892
Hidden Layers: 0, Accuracy: 0.9883
Hidden Layers: 1, Accuracy: 0.9892
Hidden Layers: 2, Accuracy: 0.9883
Hidden Layers: 3, Accuracy: 0.9892
Hidden Layers: 4, Accuracy: 0.9892
Hidden Layers: 5, Accuracy: 0.9892


In [5]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# بارگذاری داده‌ها
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv"
data = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

# تبدیل برچسب‌ها به اعداد (0 برای ham و 1 برای spam)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# حذف سطرهایی که برچسب یا پیام آنها NaN است
data.dropna(subset=['label', 'message'], inplace=True)

# بررسی تعداد داده‌ها پس از حذف NaN
print(f"Number of samples after removing NaN values: {len(data)}")

# تقسیم داده‌ها به ویژگی‌ها و برچسب‌ها
X = data['message'].values
y = data['label'].values

# تبدیل متن‌ها به توالی‌های اعداد
vocab_sizes = [8000, 5000, 2000]
results = []

for vocab_size in vocab_sizes:
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(X)
    X_seq = tokenizer.texts_to_sequences(X)

    # پدینگ کردن توالی‌ها به طول ثابت
    maxlen = 100
    X_pad = pad_sequences(X_seq, maxlen=maxlen)

    # تقسیم داده‌ها به مجموعه آموزش و تست
    X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

    def create_model(hidden_layers, hidden_size, activation='relu'):
        model = Sequential()
        model.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen))
        model.add(LSTM(64, return_sequences=False))  # return_sequences=False برای آخرین لایه LSTM
        for _ in range(hidden_layers):
            model.add(Dense(hidden_size, activation=activation))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model

    # ارزیابی تاثیر تعداد لایه‌های مخفی
    hidden_layers = 2  # ثابت نگه داشتن تعداد لایه‌های مخفی
    hidden_size = 64  # ثابت نگه داشتن اندازه لایه‌های مخفی
    model = create_model(hidden_layers, hidden_size)
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=0)
    y_pred = (model.predict(X_test) > 0.5).astype("int32")
    accuracy = accuracy_score(y_test, y_pred)
    results.append((vocab_size, accuracy))
    print(f'Vocab Size: {vocab_size}, Accuracy: {accuracy:.4f}')

# نمایش نتایج
for vocab_size, accuracy in results:
    print(f'Vocab Size: {vocab_size}, Accuracy: {accuracy:.4f}')


Number of samples after removing NaN values: 5572
Vocab Size: 8000, Accuracy: 0.9892
Vocab Size: 5000, Accuracy: 0.9874
Vocab Size: 2000, Accuracy: 0.9910
Vocab Size: 8000, Accuracy: 0.9892
Vocab Size: 5000, Accuracy: 0.9874
Vocab Size: 2000, Accuracy: 0.9910


In [6]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# بارگذاری داده‌ها
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv"
data = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

# تبدیل برچسب‌ها به اعداد (0 برای ham و 1 برای spam)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# حذف سطرهایی که برچسب یا پیام آنها NaN است
data.dropna(subset=['label', 'message'], inplace=True)

# بررسی تعداد داده‌ها پس از حذف NaN
print(f"Number of samples after removing NaN values: {len(data)}")

# تقسیم داده‌ها به ویژگی‌ها و برچسب‌ها
X = data['message'].values
y = data['label'].values

# تبدیل متن‌ها به توالی‌های اعداد
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

# پدینگ کردن توالی‌ها به طول ثابت
maxlen = 100
X_pad = pad_sequences(X_seq, maxlen=maxlen)

# تقسیم داده‌ها به مجموعه آموزش و تست
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

def create_model(hidden_layers, hidden_size, activation='relu'):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen))
    model.add(LSTM(64, return_sequences=False))  # return_sequences=False برای آخرین لایه LSTM
    for _ in range(hidden_layers):
        model.add(Dense(hidden_size, activation=activation))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# ارزیابی تاثیر توابع فعال‌سازی مختلف
activation_functions = ['relu', 'sigmoid', 'linear']
results = []

for activation in activation_functions:
    hidden_layers = 2  # ثابت نگه داشتن تعداد لایه‌های مخفی
    hidden_size = 64  # ثابت نگه داشتن اندازه لایه‌های مخفی
    model = create_model(hidden_layers, hidden_size, activation=activation)
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2, verbose=0)
    y_pred = (model.predict(X_test) > 0.5).astype("int32")
    accuracy = accuracy_score(y_test, y_pred)
    results.append((activation, accuracy))
    print(f'Activation Function: {activation}, Accuracy: {accuracy:.4f}')

# نمایش نتایج
for activation, accuracy in results:
    print(f'Activation Function: {activation}, Accuracy: {accuracy:.4f}')


Number of samples after removing NaN values: 5572
Activation Function: relu, Accuracy: 0.9910
Activation Function: sigmoid, Accuracy: 0.9901
Activation Function: linear, Accuracy: 0.9883
Activation Function: relu, Accuracy: 0.9910
Activation Function: sigmoid, Accuracy: 0.9901
Activation Function: linear, Accuracy: 0.9883
