In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import re
from keras.regularizers import l2


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Load the dataset
file_path = '/content/drive/MyDrive/Projects/Arabic_Sentiment_Tweets/Tweets.txt'
df = pd.read_csv(file_path, delimiter='\t', header=None, names=['tweet', 'class'])

In [6]:
df = df[~df['class'].isin(['NEUTRAL', 'OBJ'])]

In [7]:
df['class'].value_counts()

NEG    1642
POS     777
Name: class, dtype: int64

In [8]:
le = LabelEncoder()
df['class_encoded'] = le.fit_transform(df['class'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['class_encoded'] = le.fit_transform(df['class'])


In [9]:
def remove_links(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    cleaned_text = re.sub(url_pattern, '', text)

    return cleaned_text

df['tweet'] = df['tweet'].apply(remove_links)


In [10]:
stop_words = set(stopwords.words('arabic'))

def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

df['tweet'] = df['tweet'].apply(remove_stopwords)

In [11]:
df['tweet'] = df['tweet'].str.replace('#', '')
df['tweet'] = df['tweet'].str.replace('_', '')
df['tweet'] = df['tweet'].str.replace('-', '')
df['tweet'] = df['tweet'].str.replace(';', '')
df['tweet'] = df['tweet'].str.replace(')', '')
df['tweet'] = df['tweet'].str.replace('(', '')
df['tweet'] = df['tweet'].str.replace('؟', '')

  df['tweet'] = df['tweet'].str.replace(')', '')
  df['tweet'] = df['tweet'].str.replace('(', '')


In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['tweet'])
X = tokenizer.texts_to_sequences(df['tweet'])
X = pad_sequences(X, maxlen = 400)

In [14]:
len(tokenizer.word_index)

15921

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, df['class_encoded'], test_size=0.2, random_state=42)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1935, 400)
(1935,)
(484, 400)
(484,)


In [None]:
# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim = len(tokenizer.word_index) + 1, output_dim = 100, input_length = 400))
model.add(LSTM(20))
model.add(Dense(10, kernel_regularizer = l2(0.01)))
model.add(Dropout(0.65))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Train the model
model.fit(X_train, y_train, epochs = 10, batch_size = 64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d9d263835b0>

In [None]:
model_loaded = load_model("/content/drive/MyDrive/Projects/Arabic_Sentiment_Tweets/model2.h5")

In [None]:
pre = model_loaded.predict(X_test)
pre = (pre > 0.5).astype(np.int32)
print(f"Accuracy: {round(accuracy_score(y_test, pre) * 100, 3)}")
print(f"Precision: {round(precision_score(y_test, pre) * 100, 3)}")
print(f"Recall: {round(recall_score(y_test, pre) * 100, 3)}")
print(f"F1_score: {round(f1_score(y_test, pre) * 100, 3)}")

Accuracy: 75.0
Precision: 67.48
Recall: 50.61
F1_score: 57.84
