In [None]:
# One-time environment setup: run this cell once at the start of every new session.
# Refresh the apt package index before installing system packages.
!apt update
# Install the system "enchant" package via apt.
!apt install enchant --fix-missing
# NOTE(review): second apt install of the same "enchant" package (quiet mode);
# it looks redundant with the line above — confirm before removing.
!apt install -qq enchant
# Python package imported later in the notebook as `enchant`.
!pip install pyenchant
# NOTE(review): presumably supplies the French dictionary behind enchant.Dict("fr") — confirm.
!apt-get install myspell-fr
# Provides AutoTokenizer / AutoModel used in the embedding cell.
!pip install transformers
# Provides the `translate` helper used in the translation cells.
!pip install mtranslate


In [None]:
import requests
from bs4 import BeautifulSoup
from collections import Counter
import numpy as np
import enchant
import requests
import json
import concurrent.futures
import math
import nltk

# --- Tunable settings --------------------------------------------------------
# Number of random articles to request (more than 2000 is suggested for decent results).
n_articles = 50

# Ceiling on the amount of page text processed before the crawl stops.
# 9 GiB, chosen to stay under the 10-12 GB limit of a Colab session.
max_size = 9 * 1024 ** 3

# --- Shared state filled in by the cells below -------------------------------
urls = []                        # article URLs gathered by the fetch step
word_freqs = Counter()           # occurrences of every dictionary-approved token
current_size = 0                 # running total of len(text) over processed pages
dictionary = enchant.Dict("fr")  # French spell-check dictionary used as word filter
def fetch_random_article(timeout=10):
    """Return the desktop URL of one random French Wikipedia article.

    Queries the Wikipedia REST endpoint ``page/random/summary`` and extracts
    ``content_urls.desktop.page`` from the JSON payload.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block a worker thread indefinitely.

    Returns:
        The article URL as a string.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status (raised by ``raise_for_status``).
        KeyError: If the payload lacks the expected ``content_urls`` structure.
    """
    response = requests.get(
        "https://fr.wikipedia.org/api/rest_v1/page/random/summary",
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error body.
    response.raise_for_status()
    data = response.json()
    return data["content_urls"]["desktop"]["page"]

# Request n_articles random article URLs concurrently and append each one to
# `urls` as soon as its request finishes (completion order, not submission order).
with concurrent.futures.ThreadPoolExecutor() as pool:
    pending = [pool.submit(fetch_random_article) for _ in range(n_articles)]
    for finished in concurrent.futures.as_completed(pending):
        # .result() re-raises any exception from the worker thread.
        urls.append(finished.result())

# Download every collected article and count its dictionary-approved tokens.
for url in urls:
    try:
        # A timeout keeps one stalled download from hanging the whole crawl.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Skip unreachable pages rather than aborting, and never count the
        # words of an HTTP error page as article text.
        print(f"Skipping {url}: {exc}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    text = soup.get_text()
    # Drop every digit character so numbers do not end up inside tokens.
    text = ''.join(ch for ch in text if not ch.isdigit())

    # Whitespace tokenisation; only tokens accepted by the French dictionary are counted.
    # NOTE(review): punctuation stays attached to tokens (e.g. "mot,"), which may
    # cause the dictionary to reject otherwise valid words — confirm this is intended.
    words = text.split()
    word_freqs.update(word for word in words if dictionary.check(word))

    # current_size counts characters of extracted text, compared against max_size.
    current_size += len(text)
    if current_size >= max_size:
        break

# Zipf's law: fit log(frequency) = a * log(rank) + b by least squares.
# `a` is the slope and `b` the intercept; both are None when there are too few words.
if len(word_freqs) > 1:
    # Rank 1 must be the most frequent word, so the counts have to be sorted in
    # descending order before being paired with ranks. Using the Counter's
    # insertion order would pair ranks with arbitrary frequencies.
    freqs = sorted(word_freqs.values(), reverse=True)
    ranks = range(1, len(freqs) + 1)
    log_ranks = np.log(ranks)
    log_freqs = np.log(freqs)
    a, b = np.polyfit(log_ranks, log_freqs, 1)
else:
    a, b = None, None

# Identify simple words: a word is "simple" when it occurs strictly more often
# than threshold_frequency across the crawled pages.
threshold_frequency = 7
word_freqs_list = word_freqs.most_common()  # (word, count) pairs, most frequent first
simple_words = {
    entry for entry, occurrences in word_freqs_list
    if occurrences > threshold_frequency
}

In [None]:
import math
import nltk
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans

# Fetch the NLTK "punkt" resource used by nltk.sent_tokenize.
nltk.download('punkt')

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
_CHECKPOINT = "dbmdz/bert-base-french-europeana-cased"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModel.from_pretrained(_CHECKPOINT)
def extract_phrases(text, language="french"):
    """Return the sentences of ``text`` whose every token passes the dictionary.

    The text is split into sentences with NLTK's Punkt tokenizer, then each
    sentence is split on whitespace and kept only if ``dictionary.check``
    accepts every resulting token.

    Args:
        text: Raw text to split into sentences.
        language: Name of the Punkt model to use. Defaults to ``"french"``
            because the crawled pages are French; NLTK's own default is English,
            whose abbreviation list does not fit French text.

    Returns:
        A list of accepted sentences, in their original order.
    """
    return [
        phrase
        for phrase in nltk.sent_tokenize(text, language=language)
        # all() stops at the first rejected token, like the original early break.
        if all(dictionary.check(word) for word in phrase.split())
    ]

def encode_phrase(phrase):
    """Encode ``phrase`` into a single embedding vector.

    The phrase is tokenized with the module-level ``tokenizer`` and passed
    through the module-level ``model`` with gradients disabled. The hidden
    state of the first token of the first (only) sequence is returned.

    Args:
        phrase: The sentence to embed.

    Returns:
        A 1-D NumPy array holding the first token's final hidden state.
    """
    # Truncate to the model's position limit; without this, a phrase that
    # tokenizes to more positions than the model supports makes the forward
    # pass fail. 512 is the fallback when the config does not expose the limit.
    max_len = getattr(model.config, "max_position_embeddings", 512)
    input_ids = tokenizer.encode(
        phrase,
        return_tensors='pt',
        truncation=True,
        max_length=max_len,
    )
    with torch.no_grad():
        # [0] selects the only sequence in the batch, the second [0] its first token.
        encoded_phrase = model(input_ids).last_hidden_state[0][0].numpy()
    return encoded_phrase

# Download each article again and keep the sentences accepted by extract_phrases.
simple_phrases = []
for url in urls:
    try:
        # A timeout keeps one stalled download from hanging the whole loop.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Skip unreachable pages rather than aborting, and never harvest
        # sentences from an HTTP error page.
        print(f"Skipping {url}: {exc}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    text = soup.get_text()
    # Drop every digit character, matching the preprocessing of the frequency pass.
    text = ''.join(ch for ch in text if not ch.isdigit())
    simple_phrases.extend(extract_phrases(text))

# Embed every retained phrase and stack the vectors into one NumPy array
# (row i corresponds to simple_phrases[i]).
encoded_phrases = np.array([encode_phrase(sentence) for sentence in simple_phrases])

# Cluster the phrase embeddings with k-means.
# KMeans cannot build more clusters than there are samples, so the requested
# 30 clusters are capped at the number of phrases actually collected.
n_clusters = min(30, len(encoded_phrases))
if n_clusters == 0:
    raise ValueError("No simple phrases were extracted, so there is nothing to cluster.")

# A fixed random_state makes the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(encoded_phrases)

# cluster_labels[i] is the cluster index assigned to encoded_phrases[i].
cluster_labels = kmeans.labels_


# Write every retained phrase to the output file, one per line, grouped cluster by cluster.
# NOTE(review): assumes Google Drive is already mounted at /content/drive; no
# mount call is visible in this notebook — confirm.
with open('/content/drive/MyDrive/Colab Notebooks/output.txt', 'w') as f:
    for cluster_label in set(cluster_labels):
        # Positions of the phrases that were assigned to this cluster.
        member_positions = np.where(cluster_labels == cluster_label)[0]
        f.writelines(f'{simple_phrases[pos]}\n' for pos in member_positions)

In [None]:
# Keep only sentences whose word count lies in [min_words, max_words].
min_words = 3
max_words = 8

# Read the file, stripping surrounding whitespace from every line.
with open("/content/drive/MyDrive/Colab Notebooks/output.txt", "r") as file:
    stripped_lines = [line.strip() for line in file]

# Remove duplicates and limit the number of words.
# dict.fromkeys de-duplicates while preserving first-seen order, so the
# cluster grouping written by the previous step survives and the result is
# identical from run to run (a set() would shuffle the lines arbitrarily).
lines = list(dict.fromkeys(
    sentence for sentence in stripped_lines
    if min_words <= len(sentence.split()) <= max_words
))

# Write the cleaned sentences back to the file, one per line.
with open("/content/drive/MyDrive/Colab Notebooks/output.txt", "w") as file:
    file.write("\n".join(lines))


In [None]:
from mtranslate import translate
# Translate every cleaned French sentence to English, one output line per input line.
with open('/content/drive/MyDrive/Colab Notebooks/output.txt', 'r') as f, \
        open('/content/drive/MyDrive/Colab Notebooks/output2.txt', 'w') as out:
    for line in f:
        # mtranslate expects short language codes: translate(text, to_language, from_language).
        # "en" replaces the invalid "english", and the source language is stated
        # explicitly, mirroring the ("fr", "en") call used for the way back.
        translated_line = translate(line.strip(), "en", "fr")
        out.write(translated_line + '\n')

In [None]:
import spacy

# Load the spaCy 'en_core_web_sm' pipeline once; `nlp` is the module-level
# parser used by check_grammar and by the sentence-filtering loop in this cell.
nlp = spacy.load('en_core_web_sm')

def check_grammar(sentence):
    """Return a description of the first grammar problem found, or None.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    run through a fixed sequence of heuristic checks. The first failing check
    determines the returned message.

    Args:
        sentence: Text to analyse.

    Returns:
        A diagnostic string (for example ``"Missing subject"``), or ``None``
        when every check passes.
    """
    doc = nlp(sentence)

    # Auxiliaries and copulas may be tagged AUX rather than VERB depending on
    # the pipeline version, so both coarse tags count as verbal.
    verbal_pos = ('VERB', 'AUX')

    # Check for incomplete or fragmented sentences
    if len(doc) < 2:
        return "Incomplete sentence"
    if any(len(sent) < 2 for sent in doc.sents):
        return "Fragmented sentence"

    # Check for missing subjects
    if not any(token.dep_ == 'nsubj' and not token.is_punct for token in doc):
        return "Missing subject"

    # Check for missing verbs (a copula such as "is" counts as a verb)
    if not any(token.pos_ in verbal_pos and not token.is_punct for token in doc):
        return "Missing verb"

    # Check for other grammatical issues; punctuation never triggers them.
    for token in doc:
        if token.is_punct:
            continue
        if token.pos_ in verbal_pos and token.dep_ == 'aux':
            return "Auxiliary verb should not be used"
        if token.tag_ in ('VBD', 'VBN'):
            return "Use present tense instead of past participle or past tense"
        if token.dep_ == 'nsubjpass':
            return "Passive voice should not be used"
        # Negation is a dependency label ('neg'), not a fine-grained tag, so
        # comparing tag_ against 'NEG' could never match.
        if token.dep_ == 'neg':
            return "Negative sentence"

    # If no grammatical issues found, return None
    return None

# Copy to output3.txt only the sentences for which check_grammar reports no issue.
with open('/content/drive/MyDrive/Colab Notebooks/output2.txt', 'r') as input_file, open('/content/drive/MyDrive/Colab Notebooks/output3.txt', 'w') as output_file:
    # Iterate through the lines in the input file
    for line in input_file:
        # Drop the trailing newline so it is not parsed as part of the last
        # sentence and written back as an extra blank line; skip empty lines.
        line = line.strip()
        if not line:
            continue
        # Parse the line into sentences using spaCy
        doc = nlp(line)
        for sent in doc.sents:
            sentence = sent.text.strip()
            if not sentence:
                continue
            # Check the grammar of the sentence
            result = check_grammar(sentence)
            # If the sentence is correct, save it in the output file
            if not result:
                output_file.write(sentence + '\n')


In [None]:
from mtranslate import translate
# Translate each accepted English sentence back to French ("fr" target, "en" source),
# writing one translated line per input line.
with open('/content/drive/MyDrive/Colab Notebooks/output3.txt', 'r') as source, \
        open('/content/drive/MyDrive/Colab Notebooks/output4.txt', 'w') as target:
    target.writelines(
        translate(sentence.strip(), "fr", "en") + '\n' for sentence in source
    )
# Keep only sentences whose word count lies in [min_words, max_words].
min_words = 3
max_words = 6

# Read the file, stripping surrounding whitespace from every line.
with open("/content/drive/MyDrive/Colab Notebooks/output4.txt", "r") as file:
    stripped_lines = [line.strip() for line in file]

# Remove duplicates and limit the number of words.
# dict.fromkeys de-duplicates while preserving first-seen order, so the file
# keeps its original ordering and the result is identical from run to run
# (a set() would shuffle the lines arbitrarily).
lines = list(dict.fromkeys(
    sentence for sentence in stripped_lines
    if min_words <= len(sentence.split()) <= max_words
))

# Write the cleaned sentences back to the file, one per line.
with open("/content/drive/MyDrive/Colab Notebooks/output4.txt", "w") as file:
    file.write("\n".join(lines))