In [1]:
import language_tool_python
# Notebook-level grammar checker configured for 'en-US'; the check cell below calls tool.check().
tool = language_tool_python.LanguageTool('en-US')

In [2]:
# Probe sentence that is handed to the grammar checker in the next cell.
sentence = "I'm afraid for snakes."

In [3]:
# Run the checker on the probe sentence; the bare name on the last line displays the result.
matches = tool.check(sentence)
matches

[]

In [4]:
# Number of issues the checker reported for the probe sentence.
len(matches)

0

In [19]:
from language_tool_python import LanguageTool

def check_grammar(text, tool=None):
    """
    Return the grammar-checker matches for ``text`` whose message mentions a preposition.

    Args:
        text (str): The text to check.
        tool: Optional, already-created checker exposing ``check(text)``
            (for example a ``LanguageTool`` instance). When omitted, a
            temporary ``LanguageTool('en-US')`` is created for this call and
            closed again before returning, so repeated calls do not pile up
            checker instances. A checker passed in by the caller is never
            closed here.

    Returns:
        list[dict]: One dict per matching issue with the keys ``'error'``
        (the match message), ``'suggestion'`` (the proposed replacements)
        and ``'context'`` (the match context).
    """
    owns_tool = tool is None
    if owns_tool:
        tool = LanguageTool('en-US')

    try:
        matches = tool.check(text)
    finally:
        # Only release what this call created.
        if owns_tool:
            tool.close()

    # The filter is a case-insensitive substring test on the human-readable message.
    return [
        {
            'error': match.message,
            'suggestion': match.replacements,
            'context': match.context
        }
        for match in matches
        if 'preposition' in match.message.lower()
    ]

# Example usage: print every preposition-related issue reported for one sentence.
errors = check_grammar("I’m afraid from snakes.")
for error in errors:
    print(error)

In [18]:
def replace_sampled_prepositions(text, sample_size=5, nlp_model=None):
    """
    Sample prepositions from ``text`` and propose a different, random preposition for each.

    The text itself is not modified; the function only returns a description
    of the substitutions to make.

    Args:
        text (str): The input text.
        sample_size (int): Maximum number of prepositions to replace. Values
            larger than the number of prepositions found are reduced to that
            number; values below zero are treated as zero.
        nlp_model: Optional callable that turns ``text`` into a document whose
            ``sents`` yield tokens exposing ``text``, ``i`` and ``pos_``.
            Defaults to the notebook-level ``nlp`` pipeline.

    Returns:
        list[dict]: One dict per sampled preposition with the keys
        ``'original'`` (the token text), ``'error'`` (the replacement
        preposition) and ``'sentence'`` (1-based sentence number).
    """
    # Pool of prepositions to draw replacements from
    prepositions_list = ['about', 'across', 'after', 'against', 'along', 'among', 'around', 'as', 'at', 'before', 'behind', 'beside', 'between', 'beyond', 'by', 'despite', 'down', 'during', 'except', 'for', 'in', 'inside', 'into', 'near', 'of', 'on', 'onto', 'outside', 'over', 'past', 'since', 'through', 'to', 'toward', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', 'without']

    # Fall back to the notebook-level pipeline only when none was supplied
    parser = nlp if nlp_model is None else nlp_model
    doc = parser(text)

    # (token text, sentence index) for every token tagged as an adposition
    prepositions = [
        (token.text, sent_idx)
        for sent_idx, sent in enumerate(doc.sents)
        for token in sent
        if token.pos_ == 'ADP'
    ]

    # Clamp to [0, number of prepositions] so random.sample never raises
    sample_size = max(0, min(sample_size, len(prepositions)))
    indices_to_replace = random.sample(range(len(prepositions)), sample_size)

    replacements = []
    for idx in indices_to_replace:
        original_preposition, sentence_num = prepositions[idx]

        # Compare case-insensitively so a sentence-initial "On" is never "replaced" by "on"
        candidates = [p for p in prepositions_list if p != original_preposition.lower()]
        new_preposition = random.choice(candidates)

        replacements.append({
            "original": original_preposition,
            "error": new_preposition,
            "sentence": sentence_num + 1  # Adjusting to 1-based index for sentences
        })

    return replacements

In [7]:
import spacy
import random

# Notebook-level spaCy pipeline; the preposition helpers look up this `nlp` name when they are called.
nlp = spacy.load("en_core_web_sm")

In [8]:
# Two-sentence sample text used to demonstrate the error injection.
text = "The cat jumped on the couch and slept by the window. She walked through the park and stopped at the bench near the fountain."

# Sampling is random (no seed is set here), so the displayed replacements vary between runs.
replacements = replace_sampled_prepositions(text)
replacements

[{'original': 'through', 'error': 'around', 'sentence': 2},
 {'original': 'at', 'error': 'of', 'sentence': 2},
 {'original': 'near', 'error': 'toward', 'sentence': 2},
 {'original': 'by', 'error': 'to', 'sentence': 1},
 {'original': 'on', 'error': 'under', 'sentence': 1}]

In [9]:
def _swap_pending_words(sentence, pending):
    """Replace words of ``sentence`` left to right, using each queued replacement in ``pending`` once."""
    if not pending:
        return sentence

    # One alternation over all originals, escaped so they are matched literally.
    alternatives = "|".join(
        re.escape(word) for word in sorted(pending, key=len, reverse=True)
    )
    pattern = re.compile(rf"\b(?:{alternatives})\b")

    def swap(match):
        queue = pending[match.group(0)]
        # Once a word's replacements are used up, later occurrences stay untouched.
        return queue.pop(0) if queue else match.group(0)

    # A function replacement is inserted verbatim (no backslash/group expansion),
    # and re.sub never rescans text it has already substituted.
    return pattern.sub(swap, sentence)


def apply_preposition_errors(text, replacements, nlp_model=None):
    """
    Replace specified correct prepositions in the text with incorrect ones, based on sentence index.

    All replacements for a sentence are applied in a single left-to-right
    pass, so an injected error is never overwritten by another replacement
    (e.g. ``on -> by`` together with ``by -> to`` in the same sentence).
    Each replacement consumes the first not-yet-replaced whole-word
    occurrence of its ``'original'`` in the sentence; the replacement dicts
    carry no token position, so repeated words cannot be told apart.

    Args:
        text (str): Original input text.
        replacements (list): List of dicts with keys: 'original', 'error', 'sentence'
            ('sentence' is 1-based). The list is not modified.
        nlp_model: Optional callable that turns ``text`` into a document whose
            ``sents`` yield spans exposing ``text``. Defaults to the
            notebook-level ``nlp`` pipeline.

    Returns:
        str: Modified text with injected preposition errors; sentences are
        joined with a single space.
    """
    parser = nlp if nlp_model is None else nlp_model
    doc = parser(text)
    modified_sentences = []

    for sentence_num, sent in enumerate(doc.sents, start=1):
        # original word -> queue of error words to inject, in the order given
        pending = {}
        for rep in replacements:
            if rep['sentence'] == sentence_num:
                pending.setdefault(rep["original"], []).append(rep["error"])

        modified_sentences.append(_swap_pending_words(sent.text, pending))

    return " ".join(modified_sentences)

In [11]:
import re

In [12]:
# Inject the sampled replacements into the sample text and show the result.
corrupted_text = apply_preposition_errors(text, replacements)
print(corrupted_text)

The cat jumped under the couch and slept to the window. She walked around the park and stopped of the bench toward the fountain.


In [13]:
import spacy
from transformers import pipeline

# Re-binds the notebook-level `nlp` name (the same spaCy model is also loaded in an earlier cell).
nlp = spacy.load("en_core_web_sm")
# Fill-mask pipeline on bert-base-uncased; the detector below reads its mask token and predictions.
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

def detect_preposition_errors_bert(text, top_k=5, verbose=False):
    """
    Detect potentially incorrect prepositions using BERT masked language modeling.

    Every alphabetic token that the notebook-level spaCy pipeline ``nlp`` tags
    as ``ADP`` is replaced, one at a time, by the mask token of the
    notebook-level ``fill_mask`` pipeline. The preposition is reported when it
    does not appear (case-insensitively) among the ``top_k`` predictions for
    the masked position.

    Args:
        text (str): The input text.
        top_k (int): Number of predictions requested from ``fill_mask`` and
            compared against the original preposition.
        verbose (bool): If True, print debug info.

    Returns:
        list[dict]: One entry per flagged preposition with the keys
        ``'sentence'``, ``'original'``, ``'suggestions'``,
        ``'masked_version'`` and ``'note'``.
    """
    results = []
    doc = nlp(text)
    mask_token = fill_mask.tokenizer.mask_token

    for sent in doc.sents:
        sent_tokens = [t.text for t in sent]
        for rel_index, token in enumerate(sent):
            if not (token.pos_ == "ADP" and token.is_alpha):
                continue

            original_prep = token.text
            masked_tokens = sent_tokens.copy()
            masked_tokens[rel_index] = mask_token
            masked_sentence = " ".join(masked_tokens)

            # Ask the pipeline for exactly top_k candidates; slicing its default-sized
            # result would silently cap top_k at the pipeline's own default.
            predictions = fill_mask(masked_sentence, top_k=top_k)
            top_preds = [p["token_str"].strip().lower() for p in predictions[:top_k]]

            if verbose:
                print(f"[DEBUG] '{original_prep}' vs {top_preds} in: {sent.text.strip()}")

            if original_prep.lower() not in top_preds:
                results.append({
                    "sentence": sent.text.strip(),
                    "original": original_prep,
                    "suggestions": top_preds,
                    "masked_version": masked_sentence,
                    "note": f"'{original_prep}' may be incorrect. Suggestions: {top_preds}"
                })

    return results


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
import spacy

# Re-binds the notebook-level `nlp` name, then parses one example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("She is good in math.")
# Print each token with its part-of-speech tag and dependency label.
for token in doc:
    print(token.text, token.pos_, token.dep_)


She PRON nsubj
is AUX ROOT
good ADJ acomp
in ADP prep
math NOUN pobj
. PUNCT punct


In [15]:
import spacy
from transformers import pipeline

# Load models: the spaCy pipeline (tagging / sentence splitting) and the BERT fill-mask
# pipeline that detect_preposition_errors below reads as notebook-level names.
nlp = spacy.load("en_core_web_sm")
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

def detect_preposition_errors(text, top_k=1):
    """
    Flag prepositions that the fill-mask model does not rank among its ``top_k`` guesses.

    Each token that the notebook-level spaCy pipeline ``nlp`` tags as ``ADP``
    is masked in turn and the masked sentence is passed to the notebook-level
    ``fill_mask`` pipeline. A failure while predicting for one masked sentence
    is printed and that preposition is skipped; the scan continues.

    Args:
        text (str): The input text.
        top_k (int): Number of predictions requested from ``fill_mask`` and
            compared (case-insensitively) against the original preposition.

    Returns:
        list[dict]: One entry per flagged preposition with the keys
        ``'sentence'``, ``'original'``, ``'suggestions'`` and ``'masked'``.
    """
    results = []
    doc = nlp(text)
    mask_token = fill_mask.tokenizer.mask_token

    for sent in doc.sents:
        sent_tokens = [token.text for token in sent]
        for i, token in enumerate(sent):
            if token.pos_ != "ADP":
                continue

            original = token.text
            masked_tokens = sent_tokens.copy()
            masked_tokens[i] = mask_token
            masked_text = " ".join(masked_tokens)

            try:
                # Request exactly top_k candidates; slicing the pipeline's default-sized
                # result would silently cap top_k at the pipeline's own default.
                preds = fill_mask(masked_text, top_k=top_k)
                top_preds = [p["token_str"].strip().lower() for p in preds[:top_k]]
                if original.lower() not in top_preds:
                    results.append({
                        "sentence": sent.text,
                        "original": original,
                        "suggestions": top_preds,
                        "masked": masked_text
                    })
            except Exception as e:
                # Best effort: report the failing sentence and keep scanning the rest.
                print(f"Error with sentence: {masked_text}")
                print(str(e))
    return results

# Run the detector on a one-sentence sample and print each flagged preposition
# together with the model's suggestions.
text = """
I’m afraid from snakes.
"""

errors = detect_preposition_errors(text)

for e in errors:
    print(f"- ❌ '{e['original']}' in: \"{e['sentence'].strip()}\"")
    print(f"  👉 Suggested: {e['suggestions']}")
    print()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


- ❌ 'from' in: "I’m afraid from snakes."
  👉 Suggested: ['of']



In [16]:
import spacy
# Re-binds the notebook-level `nlp` name, then parses one example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("He arrived to the station late.")

# Print each token (padded to 10 characters) with its part-of-speech tag and dependency label.
for token in doc:
    print(f"{token.text:10} POS: {token.pos_}  DEP: {token.dep_}")


He         POS: PRON  DEP: nsubj
arrived    POS: VERB  DEP: ROOT
to         POS: ADP  DEP: prep
the        POS: DET  DEP: det
station    POS: NOUN  DEP: pobj
late       POS: ADV  DEP: advmod
.          POS: PUNCT  DEP: punct
