<a href="https://colab.research.google.com/github/Zakaria1298727/LAB3/blob/main/Part1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import requests
from bs4 import BeautifulSoup
import re
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Étape 1 : Scraping
def scrape_arabic_text_from_page(page_number, timeout=10):
    """Download one kooora.com page and return the Arabic text it contains.

    Args:
        page_number: Value substituted into the ``pg`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A single space-joined string made of every run of characters in the
        U+0600-U+06FF range found in the page's text nodes (may be empty),
        or ``None`` when the request fails or the status code is not 200.
    """
    url = f"https://www.kooora.com/?n=0&o=n&pg={page_number}"
    try:
        # Without a timeout, one stalled connection would hang the whole crawl.
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Report and skip the page instead of aborting the caller's loop.
        print(f"Failed to retrieve page {page_number}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve page {page_number}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # Matches one or more consecutive characters of the Arabic Unicode block.
    arabic_text_pattern = re.compile(r"[\u0600-\u06FF]+")

    fragments = []
    # `string=True` selects every text node; the older `text=True` spelling
    # is deprecated in Beautiful Soup and emits a warning.
    for node in soup.find_all(string=True):
        runs = arabic_text_pattern.findall(node)
        if runs:
            fragments.append(" ".join(runs))

    return " ".join(fragments)

def save_text_to_file(text, filename):
    """Write ``text`` to ``filename`` as UTF-8 with a BOM and report the outcome.

    When ``text`` is empty (or otherwise falsy) nothing is written and a
    notice is printed instead.
    """
    if text:
        with open(filename, mode="w", encoding="utf-8-sig") as out:
            out.write(text)
        print(f"Saved text to {filename}")
    else:
        print(f"No text found to save in {filename}.")

# Étape 2 : Charger et analyser les fichiers
def load_text_files(folder_path):
    """Read every ``.txt`` file located directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).

    Returns:
        dict mapping each file name (without the directory) to its decoded
        text, inserted in sorted file-name order.
    """
    file_contents = {}
    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting dataset order reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        # save_text_to_file writes with "utf-8-sig" (BOM-prefixed). Decoding
        # with "utf-8-sig" strips that BOM and still reads BOM-less UTF-8,
        # whereas plain "utf-8" would leave U+FEFF at the start of the text.
        with open(os.path.join(folder_path, filename), "r", encoding="utf-8-sig") as file:
            file_contents[filename] = file.read()
    return file_contents

def calculate_similarity(file_contents):
    """Score each document by its average TF-IDF cosine similarity to the corpus.

    Args:
        file_contents: dict mapping file name -> document text.

    Returns:
        dict mapping file name -> score (plain ``float`` rounded to two
        decimals). All pairwise cosine similarities are min-max rescaled to
        the 0-10 range across the whole matrix and then averaged per row;
        each row includes the document's similarity with itself.
        An empty input yields an empty dict.
    """
    if not file_contents:
        # TfidfVectorizer cannot be fitted on an empty corpus.
        return {}

    filenames = list(file_contents)
    texts = list(file_contents.values())

    # TF-IDF vectors -> dense matrix of pairwise cosine similarities
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    lowest, highest = similarity_matrix.min(), similarity_matrix.max()
    if highest > lowest:
        normalized_scores = np.interp(similarity_matrix, (lowest, highest), (0, 10))
    else:
        # Zero-width range (e.g. a single document): min-max scaling is
        # undefined, so pin every entry to the top of the scale explicitly.
        normalized_scores = np.full_like(similarity_matrix, 10.0)

    # float(...) turns the NumPy scalar into a plain Python float.
    return {
        name: float(round(np.mean(row), 2))
        for name, row in zip(filenames, normalized_scores)
    }

# Étape 3 : Créer le dataset CSV
def create_dataset_csv(file_contents, scores, output_file="dataset.csv"):
    """Write one CSV row per document: file name, full text and score.

    The file is encoded as UTF-8 with a BOM and begins with the header row
    ``Filename,Content,Score``. Documents absent from ``scores`` get 0.
    """
    header = ["Filename", "Content", "Score"]
    records = (
        [name, body, scores.get(name, 0)]
        for name, body in file_contents.items()
    )
    with open(output_file, "w", encoding="utf-8-sig", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(records)
    print(f"Dataset saved to {output_file}")

def main(folder_path="scraped_pages", first_page=2, last_page=499, output_file="dataset.csv"):
    """Run the whole pipeline: scrape pages, score them, write the dataset CSV.

    Args:
        folder_path: Directory where each page's text is stored as
            ``page_<n>.txt``; created if missing.
        first_page: First page number to scrape (inclusive).
        last_page: Last page number to scrape (inclusive).
        output_file: Path of the CSV file to produce.

    The defaults reproduce the original hard-coded run (pages 2 through 499,
    ``scraped_pages`` folder, ``dataset.csv`` output).
    """
    # Scraping and saving data
    os.makedirs(folder_path, exist_ok=True)

    for page in range(first_page, last_page + 1):
        print(f"Scraping page {page}...")
        text = scrape_arabic_text_from_page(page)
        # Pages that failed to download (None) or held no Arabic text are skipped.
        if text:
            save_text_to_file(text, os.path.join(folder_path, f"page_{page}.txt"))

    # Analyzing similarities
    print("Analyzing similarities...")
    file_contents = load_text_files(folder_path)
    if not file_contents:
        print("No files to analyze.")
        return

    scores = calculate_similarity(file_contents)

    # Creating the dataset
    print("Creating dataset...")
    create_dataset_csv(file_contents, scores, output_file)

if __name__ == "__main__":
    main()


Scraping page 2...


  all_text = soup.find_all(text=True)


Saved text to scraped_pages/page_2.txt
Scraping page 3...
Saved text to scraped_pages/page_3.txt
Scraping page 4...
Saved text to scraped_pages/page_4.txt
Scraping page 5...
Saved text to scraped_pages/page_5.txt
Scraping page 6...
Saved text to scraped_pages/page_6.txt
Scraping page 7...
Saved text to scraped_pages/page_7.txt
Scraping page 8...
Saved text to scraped_pages/page_8.txt
Scraping page 9...
Saved text to scraped_pages/page_9.txt
Scraping page 10...
Saved text to scraped_pages/page_10.txt
Scraping page 11...
Saved text to scraped_pages/page_11.txt
Scraping page 12...
Saved text to scraped_pages/page_12.txt
Scraping page 13...
Saved text to scraped_pages/page_13.txt
Scraping page 14...
Saved text to scraped_pages/page_14.txt
Scraping page 15...
Saved text to scraped_pages/page_15.txt
Scraping page 16...
Saved text to scraped_pages/page_16.txt
Scraping page 17...
Saved text to scraped_pages/page_17.txt
Scraping page 18...
Saved text to scraped_pages/page_18.txt
Scraping page 1

In [None]:
import pandas as pd

# Read the CSV file back, stating the encoding explicitly, and preview the first rows
df = pd.read_csv(
    "dataset.csv",
    encoding="utf-8",
)
print(df.head(n=5))


       Filename                                            Content  Score
0  page_123.txt  ﻿كووورة الموقع العربي الرياضي الأول أخبار جمال...   0.81
1   page_90.txt  ﻿كووورة الموقع العربي الرياضي الأول أخبار شباب...   0.95
2  page_459.txt  ﻿كووورة الموقع العربي الرياضي الأول أخبار لاعب...   0.57
3  page_194.txt  ﻿كووورة الموقع العربي الرياضي الأول أخبار محطا...   0.88
4   page_11.txt  ﻿كووورة الموقع العربي الرياضي الأول أخبار عودة...   0.59


In [None]:
import nltk

# Download the tokenizer data required by word_tokenize.
# 'punkt' covers older NLTK releases; recent releases load 'punkt_tab' instead,
# so fetch both to keep the notebook working on either.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import nltk
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Make sure the required NLTK resources are available.
# Recent NLTK releases load 'punkt_tab' (rather than 'punkt') in word_tokenize,
# so both are requested to stay compatible with either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the CSV file
df = pd.read_csv('dataset.csv')

# Text preprocessing
_ARABIC_STOP_WORDS = None  # filled on first use so the NLTK corpus is read only once


def _get_arabic_stop_words():
    """Return NLTK's Arabic stop-word list as a cached frozenset."""
    global _ARABIC_STOP_WORDS
    if _ARABIC_STOP_WORDS is None:
        _ARABIC_STOP_WORDS = frozenset(stopwords.words('arabic'))
    return _ARABIC_STOP_WORDS


def preprocess_text(text):
    """Clean, tokenise, remove stop words from, and stem one document.

    Args:
        text: Raw document text. Non-string values (for example the NaN that
            pandas produces for an empty CSV field) are treated as empty text.

    Returns:
        The processed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    # Local import: the notebook's import lines only bring in PorterStemmer.
    from nltk.stem.isri import ISRIStemmer

    # Lower-casing only affects Latin letters.
    text = text.lower()

    # Keep ASCII letters, the U+0600-U+06FF Arabic block and whitespace;
    # everything else (ASCII digits, punctuation, symbols) is removed.
    text = re.sub(r'[^a-zA-Z\u0600-\u06FF\s]', '', text)

    # Tokenisation
    tokens = word_tokenize(text)

    # Drop Arabic stop words
    stop_words = _get_arabic_stop_words()
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming: ISRIStemmer is NLTK's Arabic stemmer. The Porter stemmer used
    # previously implements English suffix rules and is not meant for Arabic.
    stemmer = ISRIStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Join the processed tokens back into a single string
    return ' '.join(tokens)

# Run the preprocessing over every entry of the 'Content' column
df['Processed_Content'] = df['Content'].map(preprocess_text)

# Write the enriched dataset out to a new CSV file (row index omitted)
df.to_csv(path_or_buf='processed_dataset.csv', index=False)

print("Prétraitement terminé et dataset sauvegardé.")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import torch

# Vectorise the processed texts with TF-IDF, capped at 5000 features,
# and materialise the result as a dense array
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(
    df['Processed_Content']
).toarray()

# Convert the dense matrix into a float32 PyTorch tensor
X_tensor = torch.as_tensor(X, dtype=torch.float32)


In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_tensor, df['Score'].values, test_size=0.2, random_state=42
)

# Convert to float32 PyTorch tensors.
# torch.as_tensor accepts either a tensor or a NumPy array, and avoids the
# copy-construct UserWarning that torch.tensor() raises when handed a tensor
# (X_train / X_test are presumably already tensors, being slices of X_tensor).
X_train_tensor = torch.as_tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.as_tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.as_tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.as_tensor(y_test, dtype=torch.float32)


In [None]:
import torch.nn as nn

class LSTMModel(nn.Module):
    """LSTM followed by a linear head that emits one raw vector per sample.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        output_size: Number of values produced per sample.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        # batch_first=True -> batched inputs are (batch, seq_len, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        # Linear head mapping the final hidden state to the output values
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the network and return raw (un-activated) outputs.

        Args:
            x: Tensor of shape (batch, seq_len, input_size), or
                (batch, input_size), which is treated as a batch of
                length-1 sequences.

        Returns:
            Tensor of shape (batch, output_size).
        """
        if x.dim() == 2:
            # nn.LSTM reads a 2-D tensor as ONE unbatched sequence, which would
            # chain all samples together and produce a single output for the
            # whole batch. Insert a seq_len axis of 1 so each row is a sample.
            x = x.unsqueeze(1)

        _, (hn, _) = self.lstm(x)
        # hn is (num_layers, batch, hidden_size): use the last layer's final state.
        return self.fc(hn[-1])

In [None]:
# Fix the seed so weight initialisation (and therefore the run) is reproducible
torch.manual_seed(42)

# Build the model
input_size = X_train.shape[1]  # number of features per sample (e.g. 5000)
hidden_size = 64  # number of hidden units
output_size = 1  # one predicted score per sample

model = LSTMModel(input_size, hidden_size, output_size)

# Loss and optimiser.
# 'Score' is a continuous value (built on a 0-10 scale), not a 0/1 label, so
# this is a regression task: BCEWithLogitsLoss requires targets in [0, 1].
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training (full batch)
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass over the whole training set
    outputs = model(X_train_tensor)

    # reshape(-1) flattens (N, 1) -> (N,) and keeps the batch axis even when
    # N == 1, which a bare squeeze() would collapse to a scalar.
    predictions = outputs.reshape(-1)
    # MSELoss would silently broadcast mismatched shapes; fail loudly instead.
    assert predictions.shape == y_train_tensor.shape, (
        f"model returned {tuple(predictions.shape)} predictions for "
        f"{tuple(y_train_tensor.shape)} targets"
    )
    loss = criterion(predictions, y_train_tensor)

    # Backward pass and weight update
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


In [None]:
# Model evaluation.
# The targets are continuous scores, so exact-match "accuracy" against rounded
# 0/1 predictions is meaningless; report regression errors instead.
model.eval()
with torch.no_grad():
    raw_outputs = model(X_test_tensor).reshape(-1)
    # The network emits raw values: they are logits only when the model was
    # trained with BCEWithLogitsLoss, otherwise they are direct score estimates.
    if isinstance(criterion, nn.BCEWithLogitsLoss):
        predicted = torch.sigmoid(raw_outputs)
    else:
        predicted = raw_outputs
    # Guard against silent broadcasting between predictions and targets.
    assert predicted.shape == y_test_tensor.shape, (
        f"model returned {tuple(predicted.shape)} predictions for "
        f"{tuple(y_test_tensor.shape)} targets"
    )
    errors = predicted - y_test_tensor
    mae = errors.abs().mean().item()
    rmse = errors.pow(2).mean().sqrt().item()
    print(f"Test MAE: {mae:.4f}, Test RMSE: {rmse:.4f}")

In [None]:
# Predict on new data
sample_text = "بدأ فريق ليستر سيتي مشواره تحت قيادة المدير الفني الجديد رود فان نيستلروي بالفوز على وست هام يونايتد"
# The vectorizer was fitted on preprocessed text ('Processed_Content'), so new
# text must go through the same preprocessing before being vectorised.
new_data = vectorizer.transform([preprocess_text(sample_text)]).toarray()
new_data_tensor = torch.tensor(new_data, dtype=torch.float32)

# Prediction
model.eval()
with torch.no_grad():
    output = model(new_data_tensor).reshape(-1)
    # The target is a continuous score, so the value is reported as-is rather
    # than rounded to 0/1. Outputs are logits only when the model was trained
    # with BCEWithLogitsLoss; otherwise they already are score estimates.
    if isinstance(criterion, nn.BCEWithLogitsLoss):
        prediction = torch.sigmoid(output)
    else:
        prediction = output
    print(f"Prediction: {prediction.item()}")
