In [13]:
import spacy
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from textblob import TextBlob

nlp = spacy.load('en_core_web_sm')

# read the file
with open('text_file.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# sentence splitting
doc = nlp(text)
sentences = list(doc.sents)
num_sentences = len(sentences)

# word tokenization
words = word_tokenize(text.lower())
num_words = len(words)

# stopwords removal
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.casefold() not in stop_words]
num_filtered_words = len(filtered_words)

# stemming
stemmer = PorterStemmer()
stems = [stemmer.stem(word) for word in filtered_words]

# unique words
unique_words = list(set(filtered_words))
num_unique_words = len(unique_words)

# type-token ratio
ttr = num_unique_words / num_filtered_words

# word length distribution
word_lengths = [len(word) for word in filtered_words]
word_lengths_count = Counter(word_lengths)

# sentence length distribution
sentence_lengths = [len(sent) for sent in sentences]
sentence_lengths_count = Counter(sentence_lengths)

# parts of speech distribution
pos_tags = [token.pos_ for token in doc]
pos_tags_count = Counter(pos_tags)

# named entity recognition
ner_tags = [(ent.label_, ent.text) for ent in doc.ents]
ner_tags_count = Counter(ner_tags)

# average word length
avg_word_length = sum(word_lengths) / num_filtered_words

# average syllables per word
syllable_counts = [TextBlob(word).syllables for word in filtered_words]
avg_syllables_per_word = sum(syllable_counts) / num_filtered_words

# punctuation distribution
punctuation = [char for char in text if char in '.,?!;:-()"\'']
punctuation_count = Counter(punctuation)

# capitalization distribution
capitalization = [char.isupper() for char in text if char.isalpha()]
capitalization_count = Counter(capitalization)

# sentiment analysis
blob = TextBlob(text)
polarity = blob.sentiment.polarity
subjectivity = blob.sentiment.subjectivity

# print the results
print(f"Number of sentences: {num_sentences}")
print(f"Number of words: {num_words}")
print(f"Number of filtered words: {num_filtered_words}")
print(f"Number of unique words: {num_unique_words}")
print(f"Type-token ratio: {ttr:.2f}")
print(f"Word length distribution: {word_lengths_count}")
print(f"Sentence length distribution: {sentence_lengths_count}")
print(f"Parts of speech distribution: {pos_tags_count}")
print(f"Named entity recognition: {ner_tags_count}")
print(f"Average word length: {avg_word_length:.2f}")
print(f"Average syllables per word: {avg_syllables_per_word:.2f}")
print(f"Punctuation distribution: {punctuation_count}")
print(f"Capitalization distribution: {capitalization_count}")
print(f"Sentiment polarity: {polarity:.2f}")
print(f"Sentiment subjectivity: {subjectivity:.2f}")


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [5]:
pip install spacy

Collecting spacyNote: you may need to restart the kernel to use updated packages.

  Downloading spacy-3.5.1-cp39-cp39-win_amd64.whl (12.2 MB)
Collecting wasabi<1.2.0,>=0.9.1
  Using cached wasabi-1.1.1-py3-none-any.whl (27 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.9-cp39-cp39-win_amd64.whl (18 kB)
Collecting typer<0.8.0,>=0.3.0
  Using cached typer-0.7.0-py3-none-any.whl (38 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.8-cp39-cp39-win_amd64.whl (96 kB)
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.6-cp39-cp39-win_amd64.whl (482 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4
  Downloading pydantic-1.10.6-cp39-cp39-win_amd64.whl (2.2 MB)
Collecting langcodes<4.0.0,>=3.2.0
  Using cached langcodes-3.3.0-py3-none-any.whl (181 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting thinc<8.2.0,>=8.1.8
  Downloading thinc-8.1.9-cp39-cp39-win_amd64.whl (1.5 MB)
Collectin

In [7]:
pip install textblob

Collecting textblob
  Using cached textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Installing collected packages: textblob
Successfully installed textblob-0.17.1
Note: you may need to restart the kernel to use updated packages.


In [10]:
python -m spacy download en_core_web_sm


SyntaxError: invalid syntax (Temp/ipykernel_13676/1553972540.py, line 1)

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
from collections import Counter
from nltk import pos_tag, ne_chunk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tree import Tree
from textblob import TextBlob
import spacy

nlp = spacy.load('en_core_web_sm')

def read_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

def get_word_count(text):
    words = word_tokenize(text)
    return len(words)

def get_sentence_count(text):
    sentences = sent_tokenize(text)
    return len(sentences)

def get_syllable_count(word):
    d = cmudict.dict()
    try:
        return [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]][0]
    except KeyError:
        return 0

def get_total_syllable_count(text):
    words = word_tokenize(text)
    syllable_count = 0
    for word in words:
        syllable_count += get_syllable_count(word)
    return syllable_count

def get_avg_sentence_length(text):
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    return len(words) / len(sentences)

def get_avg_word_length(text):
    words = word_tokenize(text)
    syllable_count = 0
    for word in words:
        syllable_count += get_syllable_count(word)
    return syllable_count / len(words)

def get_ttr(text):
    words = word_tokenize(text)
    return len(set(words)) / len(words)

def get_flesch_kincaid_grade_level(text):
    words = word_tokenize(text)
    sentences = sent_tokenize(text)
    syllable_count = get_total_syllable_count(text)
    return 0.39 * (len(words) / len(sentences)) + 11.8 * (syllable_count / len(words)) - 15.59

def get_word_length_distribution(text):
    words = word_tokenize(text)
    word_lengths = [len(word) for word in words]
    return FreqDist(word_lengths)

def get_sentence_length_distribution(text):
    sentences = sent_tokenize(text)
    sentence_lengths = [len(word_tokenize(sentence)) for sentence in sentences]
    return FreqDist(sentence_lengths)

def get_pos_distribution(text):
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    pos_counts = Counter(tag for word, tag in pos_tags)
    return pos_counts

def get_named_entity_recognition(text):
    named_entities = []
    for sentence in sent_tokenize(text):
        for chunk in ne_chunk(pos_tag(word_tokenize(sentence))):
            if hasattr(chunk, 'label'):
                named_entities.append(' '.join(c[0] for c in chunk))
    return named_entities

def get_punctuation_distribution(text):
    words = word_tokenize(text)
    punctuation_counts = Counter(word for word in words if word in string.punctuation)
    return punctuation_counts

def get_capitalization_distribution(text):
    words = word_tokenize(text)
    capitalization_counts = Counter(word for word in words if word[0].isupper())
    return capitalization_counts

def get_unique_words(text):
    words = word_tokenize(text)
    return len(set(words))

def get_word_stems(text):
    words = word_tokenize(text)
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in words]
    return stems

def get_word_frequencies(text):
    words = word_tokenize(text)
    return FreqDist(words)

def get_collocation_frequencies(text):
    words = word_tokenize(text)
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    return finder.nbest(bigram_measures.raw_freq, 10)

def get_ngram_frequencies(text, n):
    words = word_tokenize(text)
    ngrams = nltk.ngrams(words, n)
    return FreqDist(ngrams)

def get_function_word_frequencies(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    function_words = [word for word in words if word in stop_words]
    return FreqDist(function_words)

def get_stop_word_frequencies(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    stop_words_count = 0
    for word in words:
        if word in stop_words:
            stop_words_count += 1
    return stop_words_count / len(words)

def get_grammar_and_syntax_analysis(text):
    doc = nlp(text)
    return doc

def get_semantic_analysis(text):
    blob = TextBlob(text)
    return blob.sentiment

def get_sentiment_analysis(text):
    sid = SentimentIntensityAnalyzer()
    return sid.polarity_scores(text)


text = read_file('text_file.txt')
print('Total number of words:', get_word_count(text))
print('Total number of sentences:', get_sentence_count(text))
print('Total number of syllables:', get_total_syllable_count(text))
print('Average sentence length (in words):', get_avg_sentence_length(text))
print('Average word length (in syllables):', get_avg_word_length(text))
print('Type-token ratio (TTR):', get_ttr(text))
print('Flesch-Kincaid Grade Level:', get_flesch_kincaid_grade_level(text))
print('Word length distribution:', get_word_length_distribution(text))
print('Sentence length distribution:', get_sentence_length_distribution(text))
print('Parts of speech distribution:', get_pos_distribution(text))
print('Named entity recognition:', get_named_entity_recognition(text))
print('Punctuation distribution:', get_punctuation_distribution(text))
print('Capitalization distribution:', get_capitalization_distribution(text))
print('Number of unique words in the text:', get_unique_words(text))
print('Stem of each word in the text:', get_word_stems(text))
print('Average word frequency:', get_word_frequencies(text))
print('Collocation frequency:', get_collocation_frequencies(text))
print('N-gram frequency:', get_ngram_frequencies(text, 2))
print('Function word frequency:', get_function_word_frequencies(text))
print('Stop word frequency:', get_stop_word_frequencies(text))
print('Grammar and syntax analysis:', get_grammar_and_syntax_analysis(text))
print('Semantic analysis:', get_semantic_analysis(text))
print('Sentiment analysis:', get_sentiment_analysis(text))

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.