In [1]:
import spacy
import json
from collections import defaultdict, Counter

# Load the SpaCy model for English
try:
    nlp = spacy.load("en_core_web_sm")
except OSError as err:
    # spaCy signals a missing model package with OSError; re-raise the same
    # exception type with the command that fixes it so the failure is actionable.
    raise OSError(
        "spaCy model 'en_core_web_sm' is not installed; "
        "run `python -m spacy download en_core_web_sm` and re-run this cell."
    ) from err

# Function to process each sentence
def process_sentence(sentence, word_repository, nlp_pipeline=None):
    """Parse one sentence and collect per-token and per-dependency records.

    Args:
        sentence: The raw sentence text to analyse.
        word_repository: Nested mapping ``POS tag -> word -> count`` that is
            updated in place. It must create missing entries on access
            (the notebook passes a ``defaultdict(Counter)``).
        nlp_pipeline: Optional callable that turns text into an iterable of
            tokens exposing ``text``, ``pos_``, ``dep_`` and ``head``. When
            omitted, the notebook-level ``nlp`` pipeline is used, so existing
            two-argument calls behave exactly as before.

    Returns:
        A dict with three keys: ``"sentence"`` (the input text), ``"tokens"``
        (one dict per token with text, POS tag, dependency label and head
        text) and ``"dependencies"`` (one dict per token with dependent, head
        and relation).
    """
    # Resolve the pipeline lazily so the global is only needed on the default path.
    pipeline = nlp if nlp_pipeline is None else nlp_pipeline
    doc = pipeline(sentence)

    tokens_data = []
    dependencies = []

    for token in doc:
        head_text = token.head.text

        # Count this surface form under its part-of-speech tag.
        word_repository[token.pos_][token.text] += 1

        # Token-centric view of the parse.
        tokens_data.append({
            "text": token.text,
            "pos": token.pos_,
            "dep": token.dep_,
            "head": head_text
        })

        # Edge-centric view of the same parse (dependent -> head).
        dependencies.append({
            "dependent": token.text,
            "head": head_text,
            "relation": token.dep_
        })

    return {
        "sentence": sentence,
        "tokens": tokens_data,
        "dependencies": dependencies
    }

# Read sentences from a .txt file.
# The encoding is explicit so decoding does not depend on the platform locale.
file_path = 'input.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Split the text into lines; each non-blank line is treated as one sentence
sentences = text.split('\n')

# Initialize the word repository (POS tag -> word -> count) and results list
word_repository = defaultdict(Counter)
results = []

# Process each sentence
for sentence in sentences:
    if sentence.strip():  # Skip blank / whitespace-only lines
        sentence_data = process_sentence(sentence, word_repository)
        results.append(sentence_data)

# Sort words in each POS tag by their counts (highest first).
# Counter.most_common() yields (word, count) pairs in descending count order,
# keeping first-seen order among ties.
sorted_word_repository = {pos: words.most_common()
                          for pos, words in word_repository.items()}

# Output the results to a JSON file
output_path = 'output.json'
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(results, json_file, indent=2)

# Output the sorted word repository to a JSON file
word_repository_path = 'word_repository.json'
with open(word_repository_path, 'w', encoding='utf-8') as json_file:
    json.dump(sorted_word_repository, json_file, indent=2)

print(f"Processing complete. Results saved to {output_path} and {word_repository_path}")


ModuleNotFoundError: No module named 'spacy'

In [2]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.7.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
     |████████████████████████████████| 6.5 MB 4.3 MB/s eta 0:00:01
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (46 kB)
     |████████████████████████████████| 46 kB 70.3 MB/s eta 0:00:01
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.10-py3-none-any.whl (17 kB)
Collecting weasel<0.5.0,>=0.1.0
  Downloading weasel-0.4.1-py3-none-any.whl (50 kB)
     |████████████████████████████████| 50 kB 100.6 MB/s eta 0:00:01
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
     |████████████████████████████████| 181 kB 97.5 MB/s eta 0:00:01
Collecting pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4
  Downloading pydantic-2.5.3-py3-none-any.whl (381 kB)
     |████████████████████████████████| 381 kB 52.6 MB/s eta 0:00:01
Collecting spacy-legacy

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import spacy

# Load the English SpaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Smoke test: parse a sample sentence and print, for every token,
# its text, POS tag, dependency label and the text of its head.
doc = nlp("This is a test sentence.")
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_} {token.head.text}")