<h1>Import Libraries</h1>

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
from gensim.models import Word2Vec

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Abdal
[nltk_data]     Maged\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Abdal
[nltk_data]     Maged\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Abdal
[nltk_data]     Maged\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

<h1>Read Data</h1>

In [None]:
df = pd.read_csv(r"data.csv")
df

Unnamed: 0,label,content
0,1,مثير للدهشة حتى بالنسبة لغير اللاعب: كان هذا ا...
1,1,أفضل مقطع صوتي على الإطلاق لأي شيء: أقرأ الكثي...
2,1,مذهل!: هذه الموسيقى التصويرية هي موسيقاي المفض...
3,1,الموسيقى التصويرية الممتازة: أحب هذا الموسيقى ...
4,1,تذكر ، اسحب فكك عن الأرض بعد سماعها: إذا كنت ق...
...,...,...
114995,0,DOA: فتح العلامة التجارية الجديدة من Box.تم تث...
114996,0,شركة صعبة التعامل معها: المنتج كان على ما يرام...
114997,0,SDK Sansa Leather Case: فقير للغاية.لم يتم الإ...
114998,0,حسنًا ، لكن ليس رائعًا: حسنًا ، لقد اشتريت هذا...


<h1>Preprocessing Function</h1>

In [None]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Removing Stopwords
    stop_words = set(stopwords.words('arabic'))
    tokens = [word for word in tokens if word not in stop_words]

    # Removing Punctuation and Special Characters
    tokens = [word for word in tokens if word.isalnum()]

    # Stemming or Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Filter only Arabic words using regex
    arabic_words = [word for word in lemmatized_tokens if re.fullmatch('[\u0600-\u06FF]+', word)]
    
    return arabic_words

<h1>Applying Preprocessing

In [None]:
df['preprossing'] = df['content'].apply(preprocess_text)

In [None]:
print(df['preprossing'].iloc[0])
print(len(df['preprossing'].iloc[0]))

print(df['content'].iloc[0])
print(len(df['content'].iloc[0]))

['مثير', 'للدهشة', 'بالنسبة', 'لغير', 'اللاعب', 'المسار', 'الصوتي', 'يرسم', 'أعضاء', 'مجلس', 'الشيوخ', 'عقلك', 'وأوصي', 'للأشخاص', 'يكرهون', 'اللعبة', 'لقد', 'لعبت', 'لعبة', 'الألعاب', 'لعبتها', 'الإطلاق', 'لديها', 'أفضل', 'موسيقى', 'يعود', 'لوحة', 'المفاتيح', 'الخام', 'ويأخذ', 'خطوة', 'طازجة', 'القيثارات', 'الصار', 'والأوركسترا', 'شأنه', 'يثير', 'إعجاب', 'شخص', 'يهتم', 'بالاستماع']
41
مثير للدهشة حتى بالنسبة لغير اللاعب: كان هذا المسار الصوتي جميلًا!إنه يرسم أعضاء مجلس الشيوخ في عقلك جيدًا ، وأوصي حتى للأشخاص الذين يكرهون فيد.موسيقى اللعبة!لقد لعبت لعبة Chrono Cross ولكن من بين جميع الألعاب التي لعبتها على الإطلاق لديها أفضل موسيقى!إنه يعود بعيدًا عن لوحة المفاتيح الخام ويأخذ خطوة طازجة مع القيثارات الصار والأوركسترا العاطفية.من شأنه أن يثير إعجاب أي شخص يهتم بالاستماع!^_^
395


<h1>Word Embedding</h1>

In [None]:
def embbeding(sentences):
    # Filter out empty sentences
    sentences = [sentence for sentence in sentences if sentence]

    # Word2Vec model training
    w2v_model = Word2Vec(sentences, min_count=1, vector_size=100)  # Adjust vector_size as needed
    return w2v_model

# Example usage
sentences = list(df['preprossing'])
w2v_model = embbeding(sentences)

<h1>Data Spliting</h1>

In [None]:
X = np.array(df['preprossing'])
y = np.array(df['label'])
X
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
from gensim.models import Word2Vec
# Define the Word2Vec model
model = Word2Vec(sentences=X, vector_size=100, window=5, min_count=1, workers=4)

# Training the Word2Vec model
model.train(X, total_examples=len(X), epochs=10)

# Save the trained model to a file
model.save("word2vec_model.model")

In [None]:
print('Train_data_shape: ', X_train.shape)
print('Test_data_shape: ', X_test.shape)

Train_data_shape:  (103500,)
Test_data_shape:  (11500,)


In [None]:
# Load the pre-trained Word2Vec model
model = Word2Vec.load("word2vec_model.model")

# Function to convert a list of words into a fixed-size vector
def sentence_to_vector(sentence, model):
    word_vectors = [model.wv[word] for word in sentence if word in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

# Convert X_train and X_test to vectors
X_train_vectors = [sentence_to_vector(sentence, model) for sentence in X_train]
X_test_vectors = [sentence_to_vector(sentence, model) for sentence in X_test]

# Convert the lists to NumPy arrays
X_train_vectors = np.array(X_train_vectors)
X_test_vectors = np.array(X_test_vectors)

In [None]:
from keras.models import Sequential
from keras.layers import Dense

word_size = 100
# Create a simple feedforward neural network
model = Sequential()
model.add(Dense(128, input_shape=(word_size,), activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model using your data
model.fit(X_train_vectors, y_train, epochs=40, batch_size=32, validation_data=(X_test_vectors, y_test))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x22200301e10>

In [None]:
# Predict labels for the test set
y_pred = model.predict(X_test_vectors)

# Convert predicted probabilities to binary labels (0 or 1)
y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

# Evaluate the model performance
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy = accuracy_score(y_test, y_pred_binary)
conf_matrix = confusion_matrix(y_test, y_pred_binary)
classification_report_str = classification_report(y_test, y_pred_binary)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report_str)

Accuracy: 0.808
Confusion Matrix:
[[4483 1138]
 [1070 4809]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.80      0.80      5621
           1       0.81      0.82      0.81      5879

    accuracy                           0.81     11500
   macro avg       0.81      0.81      0.81     11500
weighted avg       0.81      0.81      0.81     11500



In [None]:
import numpy as np
from gensim.models import Word2Vec

# Load the pre-trained Word2Vec model
word2vec_model = Word2Vec.load("word2vec_model.model")

# Load the pre-trained neural network model
# nn_model = load_model(r"I:\University\$$$$Forth_Year$$$$\semester_1\NLP\Project\your_model_name.h5")  # Replace with the actual path to your saved model

def sentence_to_vector(sentence, model):
    word_vectors = [model.wv[word] for word in sentence.split() if word in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

def predict_sentiment(user_input, word2vec_model, nn_model):
    # Convert user input to Word2Vec vectors
    user_input_vector = sentence_to_vector(user_input, word2vec_model)

    # Reshape the vector to match the input shape of the neural network
    user_input_vector = user_input_vector.reshape(1, -1)

    # Make prediction
    prediction = nn_model.predict(user_input_vector)

    # Convert predicted probability to a binary label
    predicted_label = 1 if prediction > 0.5 else 0

    return predicted_label, prediction

# Example usage
user_input = input("Enter a sentence: ")
predicted_sentiment,prediction = predict_sentiment(user_input, word2vec_model, model)

print(f"Predicted Sentiment: {predicted_sentiment}")
print(f"Predicted Sentiment: {prediction}")


Predicted Sentiment: 1
Predicted Sentiment: [[0.9999141]]


<h1>---------------------------------------------

<h1>Save Model

In [None]:
# # Save the trained model
# model.save(r'I:\University\$$$$Forth_Year$$$$\semester_1\NLP\Project\your_model_name.h5')

<h1>Load Model

In [None]:
# Load the saved model
# model = load_model(r'I:\University\$$$$Forth_Year$$$$\semester_1\NLP\Project\your_model_name.h5')