In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the URLs of the webpages to scrape
urls = [
    'https://jadalbookstore.com/archives/7037',
    "https://www.arageek.com/bio/haruki-murakami",
    "https://diffah.alaraby.co.uk/diffah/secondbank/2022/12/10/%D8%A7%D9%84%D9%83%D8%AA%D8%A7%D8%A8%D8%A9-%D9%83%D9%85%D9%87%D9%86%D8%A9-%D9%87%D8%A7%D8%B1%D9%88%D9%83%D9%8A-%D9%85%D9%88%D8%B1%D8%A7%D9%83%D8%A7%D9%85%D9%8A-%D9%85%D9%86-%D8%B5%D8%A7%D8%AD%D8%A8-%D9%85%D9%82%D9%87%D9%89-%D8%A5%D9%84%D9%89-%D8%B1%D9%88%D8%A7%D8%A6%D9%8A",
    "https://nippontimes.net/%D9%87%D8%A7%D8%B1%D9%88%D9%83%D9%8A-%D9%85%D9%88%D8%B1%D8%A7%D9%83%D8%A7%D9%85%D9%8A/"# Add more URLs as needed
]

# Define headers to mimic a regular browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

# Initialize an empty list to store paragraphs and scores from all pages
all_data = []

# Loop through each URL
for url in urls:
    # Fetch the HTML content of the webpage
    response = requests.get(url, headers=headers)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all <p> tags
        p_tags = soup.find_all('p')
        
        # Extract text from each <p> tag and store in a list
        paragraphs = [p.get_text().strip() for p in p_tags]

        # Cleaning steps
        cleaned_paragraphs = []
        for para in paragraphs:
            # Remove URLs
            para = re.sub(r'http\S+', '', para)
            para = re.sub(r'www\S+', '', para)
            
            # Remove unwanted special characters but keep Arabic characters and punctuation
            para = re.sub(r'[^ء-ي\s.,?!\'"()،؛-]', '', para)
            
            # Remove extra whitespace
            para = ' '.join(para.split())
            
            # Append to the cleaned list if not empty
            if para:
                cleaned_paragraphs.append(para)
        
        # Assign a score of 0 to each paragraph
        scores = [0] * len(cleaned_paragraphs)
        
        # Combine paragraphs and scores into a list of tuples
        data = list(zip(cleaned_paragraphs, scores))
        
        # Extend the list of paragraphs and scores from this page to the overall list
        all_data.extend(data)
    else:
        print(f"Failed to retrieve the webpage {url}. Status code: {response.status_code}")

# Create a Pandas DataFrame from the list of all paragraphs and scores
df = pd.DataFrame(all_data, columns=['Paragraph Text', 'Score'])

# Print the DataFrame
print(df)


                                        Paragraph Text  Score
0    يعد هاروكي موراكامي أكثر الكتاب اليابانيين شهر...      0
1    ولد موراكامي عام في مدينة كيوتو، عاصمة اليابان...      0
2    شحذ موراكامي ذوقه الروائي ما بعد الحداثي وهو ط...      0
3    صار موراكامي منذ ذلك أكثر الكتاب المشهورين في ...      0
4    يقع مكتب موراكامي في طوكيو قبالة الطريق الرئيس...      0
..                                                 ...    ...
567                                    المقالة التالية      0
568  قائمة أفضل الأغاني وأكثرها رواجا في اليابان لل...      0
569  سجل بريدك الإلكتروني هنا للحصول على أحدث المقا...      0
570                                  البريد الإلكتروني      0
571                                             اشتراك      0

[572 rows x 2 columns]


In [2]:
# Assuming df is your DataFrame containing paragraphs and scores

# Remove duplicate rows based on the 'Paragraph Text' column
df = df.drop_duplicates(subset='Paragraph Text')

# Save the DataFrame to a CSV file, overwriting the existing file
df.to_csv('paragraphs.csv', index=False)


In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Read the CSV file containing paragraphs and scores
df = pd.read_csv('paragraphs_with_scores.csv')

# Tokenization
df['Tokenized_Text'] = df['Paragraph Text'].apply(word_tokenize)

# Remove stop words
stop_words = set(stopwords.words('english'))  # Assuming English stop words, you can change it to Arabic
df['Filtered_Text'] = df['Tokenized_Text'].apply(lambda tokens: [word for word in tokens if word.lower() not in stop_words])

# Stemming
porter = PorterStemmer()
df['Stemmed_Text'] = df['Filtered_Text'].apply(lambda tokens: [porter.stem(word) for word in tokens])

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['Lemmatized_Text'] = df['Filtered_Text'].apply(lambda tokens: [lemmatizer.lemmatize(word) for word in tokens])

# Discretization (Bag of Words representation)
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(df['Stemmed_Text'].apply(lambda x: ' '.join(x)))
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Print the processed DataFrame
print(df.head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                      Paragraph Text  Score  \
0  يعد هاروكي موراكامي أكثر الكتاب اليابانيين شهر...     10   
1  ولد موراكامي عام في مدينة كيوتو، عاصمة اليابان...     10   
2  شحذ موراكامي ذوقه الروائي ما بعد الحداثي وهو ط...     10   
3  صار موراكامي منذ ذلك أكثر الكتاب المشهورين في ...     10   
4  يقع مكتب موراكامي في طوكيو قبالة الطريق الرئيس...     10   

                                      Tokenized_Text  \
0  [يعد, هاروكي, موراكامي, أكثر, الكتاب, الياباني...   
1  [ولد, موراكامي, عام, في, مدينة, كيوتو،, عاصمة,...   
2  [شحذ, موراكامي, ذوقه, الروائي, ما, بعد, الحداث...   
3  [صار, موراكامي, منذ, ذلك, أكثر, الكتاب, المشهو...   
4  [يقع, مكتب, موراكامي, في, طوكيو, قبالة, الطريق...   

                                       Filtered_Text  \
0  [يعد, هاروكي, موراكامي, أكثر, الكتاب, الياباني...   
1  [ولد, موراكامي, عام, في, مدينة, كيوتو،, عاصمة,...   
2  [شحذ, موراكامي, ذوقه, الروائي, ما, بعد, الحداث...   
3  [صار, موراكامي, منذ, ذلك, أكثر, الكتاب, المشهو...   
4  [

In [8]:
pip install --upgrade tensorflow

  You can safely remove it manually.
  You can safely remove it manually.



Collecting tensorflow
  Downloading tensorflow-2.16.1-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Collecting tensorflow-intel==2.16.1 (from tensorflow)
  Downloading tensorflow_intel-2.16.1-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting ml-dtypes~=0.3.1 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading ml_dtypes-0.3.2-cp311-cp311-win_amd64.whl.metadata (20 kB)
Collecting tensorboard<2.17,>=2.16 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading tensorboard-2.16.2-py3-none-any.whl.metadata (1.6 kB)
Collecting keras>=3.0.0 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading keras-3.3.3-py3-none-any.whl.metadata (5.7 kB)
Collecting namex (from keras>=3.0.0->tensorflow-intel==2.16.1->tensorflow)
  Downloading namex-0.0.8-py3-none-any.whl.metadata (246 bytes)
Collecting optree (from keras>=3.0.0->tensorflow-intel==2.16.1->tensorflow)
  Downloading optree-0.11.0-cp311-cp311-win_amd64.whl.metadata (46 kB)
     ---------------------------------------- 0.0/46.2 kB 

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Bidirectional, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

# Assuming df is your DataFrame containing preprocessed text data and scores

# Split the data into features (X) and target (y)
X = df['Stemmed_Text']  # Change this to the appropriate column if needed
y = df['Score']  # Change this to the appropriate column if needed

# Convert text to sequences (assuming you already have tokenized and padded data)
# You can use tokenizer to convert text to sequences if you haven't already done it

# Split data into train, validation, and test sets
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)
all_tokens = [token for sublist in X_train for token in sublist]

# Calculate the vocabulary size
vocab_size = len(set(all_tokens))

# Define a function to create the RNN model
def create_rnn_model(units=100, embedding_dim=100):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
        LSTM(units),
        Dense(1, activation='sigmoid')
    ])
    return model

# Wrap Keras model for use with scikit-learn GridSearchCV
model = KerasClassifier(build_fn=create_rnn_model, verbose=0)

# Define hyperparameters to tune
param_grid = {
    'units': [50, 100, 150],  # Number of LSTM units
    'embedding_dim': [50, 100, 150],  # Dimensionality of word embeddings
}

# Perform grid search with cross-validation
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_result = grid.fit(X_train, y_train)

# Summarize results
print(f"Best Score: {grid_result.best_score_} using {grid_result.best_params_}")

# Evaluate the best model on test data
best_model = grid_result.best_estimator_
test_loss, test_acc = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc}")


ModuleNotFoundError: No module named 'tensorflow.keras.wrappers'

In [None]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

# Assuming you have reference and candidate (generated) texts for each language model
references = [[ref_text_1, ref_text_2, ...],  # List of reference texts for model 1
              [ref_text_1, ref_text_2, ...],  # List of reference texts for model 2
              ...]  # Repeat for each model

candidates = [gen_text_1, gen_text_2, ...]  # List of generated texts for all models

# Calculate BLEU score for each model
bleu_scores = []
for refs in references:
    bleu_score = corpus_bleu([refs] * len(candidates), candidates)
    bleu_scores.append(bleu_score)

# Print BLEU scores for each model
for i, bleu_score in enumerate(bleu_scores):
    print(f"Model {i+1} BLEU Score: {bleu_score}")
