In [13]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from nltk import pos_tag, ne_chunk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tree import Tree
from textblob import TextBlob
import spacy
import string
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
import string
from sklearn.feature_extraction.text import CountVectorizer

# Load the spaCy 'en_core_web_sm' pipeline once at module import time;
# get_named_entities() reuses this shared `nlp` object on every call.
nlp = spacy.load('en_core_web_sm')

def read_file(file_path):
    """
    Opens the file at ``file_path`` as UTF-8 text and returns its entire
    contents as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def get_word_count(text):
    """
    Returns how many tokens ``word_tokenize`` produces for the text.

    The tokenizer emits punctuation marks as separate tokens, so they are
    included in the total.
    """
    return len(word_tokenize(text))

def get_sentence_count(text):
    """
    Returns how many sentences ``sent_tokenize`` finds in the text.
    """
    return len(sent_tokenize(text))

def get_avg_sentence_length(text):
    """
    Returns the average sentence length (in tokens) of the text.

    The value is the number of ``word_tokenize`` tokens (punctuation tokens
    included) divided by the number of ``sent_tokenize`` sentences.
    Returns 0.0 when the text contains no sentences (e.g. an empty string)
    instead of raising ZeroDivisionError.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Nothing to average over; avoid dividing by zero.
        return 0.0
    return len(word_tokenize(text)) / len(sentences)

def get_word_frequency(text):
    """
    Builds a ``FreqDist`` of the lower-cased, purely alphabetic tokens in
    the text.

    Tokens that fail ``str.isalpha()`` (punctuation, numbers, mixed tokens)
    are dropped before counting.
    """
    lowered_alpha = []
    for token in word_tokenize(text):
        if token.isalpha():
            lowered_alpha.append(token.lower())
    return FreqDist(lowered_alpha)

def get_top_n_words(text, n):
    """
    Returns the n most frequent words in the text.

    Counting is delegated to ``get_word_frequency`` so both functions share
    one definition of a "word" (lower-cased, alphabetic-only tokens).
    The result is a list of ``(word, count)`` pairs as produced by
    ``FreqDist.most_common``.
    """
    return get_word_frequency(text).most_common(n)

def get_pos_tags(text):
    """
    Tokenizes the text with ``word_tokenize`` and returns the result of
    NLTK's ``pos_tag`` on those tokens (a list of ``(token, tag)`` pairs).
    """
    return pos_tag(word_tokenize(text))

def get_named_entities(text):
    """
    Runs the module-level spaCy pipeline ``nlp`` over the text and returns
    a list of ``(entity_text, entity_label)`` tuples, one per entry in
    ``doc.ents``.
    """
    return [(entity.text, entity.label_) for entity in nlp(text).ents]

def get_sentiment(text):
    """
    Scores the whole text with a freshly created VADER
    ``SentimentIntensityAnalyzer`` and returns its ``polarity_scores``
    result unchanged.
    """
    analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(text)

def get_summary(text, sentences_count=3):
    """
    Returns a summary of the text using Sumy's LexRank summarizer.

    ``sentences_count`` is the number of sentences requested from the
    summarizer (default 3, the value that used to be hard-coded).
    The selected sentences are joined with a single space so they no
    longer run together (previously "One.Two." instead of "One. Two.").
    """
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, sentences_count=sentences_count)
    return " ".join(str(sentence) for sentence in summary)


def get_stemmed_words(text):
    """
    Tokenizes the text with ``word_tokenize`` and returns the Porter stem
    of every token, in order (punctuation tokens are passed through the
    stemmer as well).
    """
    stem = PorterStemmer().stem
    return [stem(token) for token in word_tokenize(text)]

def get_collocations(text):
    """
    Returns the 10 best bigrams of the text ranked by pointwise mutual
    information (PMI).

    Only lower-cased, purely alphabetic tokens take part in the bigrams.
    """
    tokens = [token.lower() for token in word_tokenize(text) if token.isalpha()]
    measures = BigramAssocMeasures()
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    return bigram_finder.nbest(measures.pmi, 10)

def get_filtered_text(text):
    """
    Returns the text with English stopwords removed.

    The text is split on whitespace only, so punctuation stays attached to
    its word (and such a chunk is compared to the stopword list as-is).
    Every kept chunk is followed by one space, so a non-empty result ends
    with a trailing space.
    """
    english_stopwords = set(stopwords.words('english'))
    kept_chunks = [
        chunk for chunk in text.split()
        if chunk.lower() not in english_stopwords
    ]
    # Append a space after each chunk (not just between) to keep the
    # trailing-space output format.
    return "".join(chunk + " " for chunk in kept_chunks)

def get_keywords(text, top_n=10, damping_factor=0.85, iterations=100):
    """
    Returns the most important keywords in the text using a TextRank-style
    ranking over a word adjacency graph.

    Parameters:
        text: the input string.
        top_n: how many keywords to return (default 10).
        damping_factor: damping term of the score update (default 0.85).
        iterations: number of full sweeps over the graph (default 100).

    Returns:
        A list of at most ``top_n`` words, highest score first.

    Words are the ``\\w+`` matches of the text with English stopwords
    removed (the stopword test is case-insensitive, but graph nodes keep
    their original casing, so "Word" and "word" are distinct nodes).
    Two words are linked when they are adjacent in the filtered sequence.
    """
    # tokenize the text and drop stop words
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set(stopwords.words('english'))
    words = [
        word for word in tokenizer.tokenize(text)
        if word.lower() not in stop_words
    ]

    # Build the adjacency graph. Each neighbour collection is a dict used
    # as an insertion-ordered set: O(1) membership instead of scanning a
    # list, while keeping the same neighbour order as a list would.
    graph = {}
    last_index = len(words) - 1
    for i, word in enumerate(words):
        neighbours = graph.setdefault(word, {})
        if i > 0:
            neighbours.setdefault(words[i - 1], None)
        if i < last_index:
            neighbours.setdefault(words[i + 1], None)

    # Run the ranking. Scores are updated in place, so later words in a
    # sweep already see the new scores of earlier words in the same sweep.
    scores = dict.fromkeys(graph, 1.0)
    for _ in range(iterations):
        for word, neighbours in graph.items():
            score = 1 - damping_factor
            for other_word in neighbours:
                count = len(graph[other_word])
                if count > 0:
                    score += damping_factor * (scores[other_word] / count)
            scores[word] = score

    # Highest score first; ties keep first-seen order (sorted is stable).
    return sorted(scores, key=scores.get, reverse=True)[:top_n]

def get_alliteration_count(text):
    """
    Returns the number of alliterations in the text, approximated as
    adjacent token pairs that start with the same letter (case-insensitive).

    This compares initial letters, not sounds. A pair is only counted when
    both tokens start with an alphabetic character, so neighbouring
    punctuation or number tokens (e.g. two quote marks) are not reported
    as alliteration.
    """
    words = word_tokenize(text)
    return sum(
        1
        for current, following in zip(words, words[1:])
        if current[0].isalpha()
        and following[0].isalpha()
        and current[0].lower() == following[0].lower()
    )

def get_rhyme_count(text):
    """
    Returns a rough rhyme count for the text.

    A pair of adjacent tokens is counted when the lower-cased first token
    ends with the last two characters of the lower-cased second token
    (or with the whole second token when it is shorter than two
    characters). This is a spelling heuristic, not a phonetic comparison.
    """
    lowered = [token.lower() for token in word_tokenize(text)]
    total = 0
    for left, right in zip(lowered, lowered[1:]):
        if left.endswith(right[-2:]):
            total += 1
    return total

def get_repetition_count(text):
    """
    Returns how many times a token is immediately followed by an identical
    token (exact, case-sensitive match) in the tokenized text.
    """
    tokens = word_tokenize(text)
    return sum(1 for first, second in zip(tokens, tokens[1:]) if first == second)

def get_lsa(text):
    """
    Returns the latent semantic analysis (truncated SVD) projection of the
    text's bag-of-words vector, or None when the text has no usable terms.

    The text is vectorized as a single document with English stop words
    removed, then reduced with ``TruncatedSVD`` to at most 2 components.
    Because only one document is fitted, the result has a single row.
    """
    # TruncatedSVD was never imported at file level; import it here so the
    # function no longer fails with NameError.
    from sklearn.decomposition import TruncatedSVD

    vectorizer = CountVectorizer(stop_words='english')
    try:
        X = vectorizer.fit_transform([text])
    except ValueError:
        # CountVectorizer raises ValueError ("empty vocabulary") when the
        # text is empty or contains only stop words.
        return None
    if X.shape[1] == 0:
        return None
    # TruncatedSVD cannot extract more components than there are features.
    n_components = min(2, X.shape[1])
    svd = TruncatedSVD(n_components=n_components)
    return svd.fit_transform(X)

def get_type_token_ratio(text):
    """
    Returns the type-token ratio of the text: the number of distinct
    tokens divided by the total number of tokens.

    Tokens come from ``word_tokenize`` and are compared case-sensitively.
    Returns 0.0 for text that yields no tokens instead of raising
    ZeroDivisionError.
    """
    words = word_tokenize(text)
    if not words:
        # No tokens: the ratio is undefined, report 0.0 rather than crash.
        return 0.0
    return len(set(words)) / len(words)


def print_features(file_path):
    """
    Loads the text file at ``file_path`` and prints one labelled line per
    extracted feature, in a fixed order.

    Each feature is computed immediately before its line is printed, so if
    one extractor raises, the lines for the earlier features have already
    been written.
    """
    text = read_file(file_path)

    # (output template, extractor) pairs, in print order.
    report = (
        ("Word count: {}", get_word_count),
        ("Sentence count: {}", get_sentence_count),
        ("Average sentence length: {} words", get_avg_sentence_length),
        ("Top 10 words: {}", lambda source: get_top_n_words(source, 10)),
        ("Part-of-speech tags: {}", get_pos_tags),
        ("Named entities: {}", get_named_entities),
        ("Sentiment: {}", get_sentiment),
        ("Summary: {}", get_summary),
        ("Stemmed words: {}", get_stemmed_words),
        ("Collocations: {}", get_collocations),
        ("Filtered text: {}", get_filtered_text),
        ("Keywords: {}", get_keywords),
        ("Alliteration count: {}", get_alliteration_count),
        ("Rhyme count: {}", get_rhyme_count),
        ("Repetition count: {}", get_repetition_count),
        ("LSA vectors: {}", get_lsa),
        ("Type Token Ratio: {}", get_type_token_ratio),
    )
    for template, extract in report:
        print(template.format(extract(text)))


# Print the full feature report for 'sample1.txt'; the path is relative,
# so the file is looked up in the current working directory.
print_features('sample1.txt')

Word count: 754
Sentence count: 61
Average sentence length: 12.360655737704919 words
Top 10 words: [('the', 22), ('and', 17), ('hermione', 16), ('a', 16), ('her', 16), ('was', 15), ('she', 15), ('to', 13), ('you', 13), ('with', 11)]
Part-of-speech tags: [('The', 'DT'), ('sun', 'NN'), ('was', 'VBD'), ('setting', 'VBG'), ('behind', 'IN'), ('Hogwarts', 'NNP'), ('castle', 'NN'), (',', ','), ('casting', 'VBG'), ('long', 'JJ'), ('shadows', 'NNS'), ('across', 'IN'), ('the', 'DT'), ('grounds', 'NNS'), ('.', '.'), ('Hermione', 'NNP'), ('Granger', 'NNP'), (',', ','), ('Harry', 'NNP'), ('Potter', 'NNP'), (',', ','), ('and', 'CC'), ('Ron', 'NNP'), ('Weasley', 'NNP'), ('were', 'VBD'), ('walking', 'VBG'), ('back', 'RB'), ('from', 'IN'), ('a', 'DT'), ('grueling', 'NN'), ('Quidditch', 'NNP'), ('practice', 'NN'), ('session', 'NN'), (',', ','), ('exhausted', 'VBD'), ('but', 'CC'), ('exhilarated', 'VBD'), ('.', '.'), ('As', 'IN'), ('they', 'PRP'), ('neared', 'VBD'), ('the', 'DT'), ('entrance', 'NN'), ('t