In [3]:
# practical3_step1.py
# Load the libraries used by this practical, grouped by origin.
print("STEP 1: Importing libraries...")

# Standard library
import json
import string

# Third-party packages
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

print("✓ Libraries imported successfully!")

STEP 1: Importing libraries...
✓ Libraries imported successfully!


In [4]:
# practical3_step2.py
# Fetch the tokenizer data that the later steps pass to word_tokenize().
print("\n" + "="*60)
print("STEP 2: Downloading NLTK datasets...")

import nltk

# 'punkt_tab' is requested in addition to 'punkt' for newer NLTK versions.
required_packages = ('punkt', 'punkt_tab')

# nltk.download() normally signals a failed download by returning False
# instead of raising, so the return value is checked for every package.
# Unexpected errors are caught as Exception (not a bare except) so that
# Ctrl-C / SystemExit still propagate.
failed_packages = []
for package in required_packages:
    try:
        downloaded = nltk.download(package)
    except Exception as error:
        print(f"⚠ Could not download '{package}': {error}")
        downloaded = False
    if not downloaded:
        failed_packages.append(package)

if failed_packages:
    # Tell the user exactly which downloads to retry by hand.
    print("⚠ If download fails, run manually:")
    print("   import nltk")
    for package in failed_packages:
        print(f"   nltk.download('{package}')")
else:
    print("✓ NLTK datasets downloaded successfully!")


STEP 2: Downloading NLTK datasets...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


✓ NLTK datasets downloaded successfully!


In [5]:
# practical3_step3.py
# Define the sample corpus for the practical and show a short preview of it.
banner = "=" * 60
print("\n" + banner)
print("STEP 3: Loading text corpus...")

# Sample text for analysis (the literal starts and ends with a newline)
text_corpus = """
Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
It helps computers understand human language. Many applications use NLP technology.
Chatbots, translators, and voice assistants all use NLP. The field continues to grow
rapidly with new advancements. Researchers develop new models every year.
These models get better at understanding language context. Language is complex but
computers are learning to process it effectively. NLP makes technology more accessible
to everyone through natural language interfaces.
"""

character_count = len(text_corpus)
preview = text_corpus[:200]

print("Text corpus loaded successfully!")
print(f"Total characters: {character_count}")
print("\nFirst 200 characters:")
print(f"{preview}...")


STEP 3: Loading text corpus...
Text corpus loaded successfully!
Total characters: 554

First 200 characters:

Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence. 
It helps computers understand human language. Many applications use NLP technology. 
Chatbots, translators, and v...


In [6]:
# practical3_step4.py
# Split the corpus into tokens, then keep only lower-cased alphabetic words.
print("\n" + "="*60)
print("STEP 4: Tokenizing text into words...")

# Same sample text as Step 3
text_corpus = """
Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
It helps computers understand human language. Many applications use NLP technology.
Chatbots, translators, and voice assistants all use NLP. The field continues to grow
rapidly with new advancements. Researchers develop new models every year.
These models get better at understanding language context. Language is complex but
computers are learning to process it effectively. NLP makes technology more accessible
to everyone through natural language interfaces.
"""

# Raw tokens still contain punctuation marks as separate entries
tokens = word_tokenize(text_corpus)
token_total = len(tokens)
print(f"✓ Total tokens (including punctuation): {token_total}")
print(f"First 20 tokens: {tokens[:20]}")

# str.isalpha() filters out punctuation tokens; survivors are lower-cased
# so that e.g. "Language" and "language" count as the same word.
words = [token.lower() for token in tokens if token.isalpha()]

word_total = len(words)
print(f"✓ Cleaned words (alphabetic only): {word_total}")
print(f"First 15 cleaned words: {words[:15]}")


STEP 4: Tokenizing text into words...
✓ Total tokens (including punctuation): 88
First 20 tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence', '.', 'It', 'helps', 'computers', 'understand', 'human', 'language']
✓ Cleaned words (alphabetic only): 75
First 15 cleaned words: ['natural', 'language', 'processing', 'nlp', 'is', 'a', 'fascinating', 'field', 'of', 'artificial', 'intelligence', 'it', 'helps', 'computers', 'understand']


In [7]:
# practical3_step5.py
# Build an NLTK frequency distribution over the cleaned words and preview it.
print("\n" + "="*60)
print("STEP 5: Calculating word frequencies with FreqDist()...")

# Imports repeated here so the cell does not depend on Step 1 having run
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

text_corpus = """
Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
It helps computers understand human language. Many applications use NLP technology.
Chatbots, translators, and voice assistants all use NLP. The field continues to grow
rapidly with new advancements. Researchers develop new models every year.
These models get better at understanding language context. Language is complex but
computers are learning to process it effectively. NLP makes technology more accessible
to everyone through natural language interfaces.
"""

# Tokenize, then keep lower-cased alphabetic tokens only
tokens = word_tokenize(text_corpus)
words = [token.lower() for token in tokens if token.isalpha()]

# Count how often each distinct word occurs
frequency_dist = FreqDist(words)

print("✓ Frequency distribution calculated!")
print(f"Total words analyzed: {len(words)}")
print(f"Number of frequency entries: {len(frequency_dist)}")

# Preview at most the first 10 (word, count) pairs, in the order items() yields them
print("\nSample frequencies:")
sample_pairs = list(frequency_dist.items())[:10]
for word, freq in sample_pairs:
    print(f"  '{word}': {freq}")


STEP 5: Calculating word frequencies with FreqDist()...
✓ Frequency distribution calculated!
Total words analyzed: 75
Number of frequency entries: 57

Sample frequencies:
  'natural': 2
  'language': 5
  'processing': 1
  'nlp': 4
  'is': 2
  'a': 1
  'fascinating': 1
  'field': 2
  'of': 1
  'artificial': 1


In [8]:
# practical3_step6.py
# Rank the ten most frequent words and report each one's share of the text.
print("\n" + "="*60)
print("STEP 6: Finding the most common words...")

# Imports repeated here so the cell does not depend on earlier cells
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

text_corpus = """
Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
It helps computers understand human language. Many applications use NLP technology.
Chatbots, translators, and voice assistants all use NLP. The field continues to grow
rapidly with new advancements. Researchers develop new models every year.
These models get better at understanding language context. Language is complex but
computers are learning to process it effectively. NLP makes technology more accessible
to everyone through natural language interfaces.
"""

# Tokenize, then keep lower-cased alphabetic tokens only
tokens = word_tokenize(text_corpus)
words = [token.lower() for token in tokens if token.isalpha()]

# Word -> occurrence count
frequency_dist = FreqDist(words)

# (word, count) pairs, highest count first
most_common_words = frequency_dist.most_common(10)

print("Top 10 Most Common Words:")
print("-" * 40)
word_total = len(words)
for rank, (word, frequency) in enumerate(most_common_words, start=1):
    # Share of all cleaned words, expressed as a percentage
    share = frequency / word_total * 100
    print(f"{rank:2}. {word:15} : {frequency:3} times")
    print(f"    Percentage: {share:.2f}%")


STEP 6: Finding the most common words...
Top 10 Most Common Words:
----------------------------------------
 1. language        :   5 times
    Percentage: 6.67%
 2. nlp             :   4 times
    Percentage: 5.33%
 3. to              :   3 times
    Percentage: 4.00%
 4. natural         :   2 times
    Percentage: 2.67%
 5. is              :   2 times
    Percentage: 2.67%
 6. field           :   2 times
    Percentage: 2.67%
 7. it              :   2 times
    Percentage: 2.67%
 8. computers       :   2 times
    Percentage: 2.67%
 9. use             :   2 times
    Percentage: 2.67%
10. technology      :   2 times
    Percentage: 2.67%


In [9]:
# practical3_step7.py
# Report vocabulary size, type-token ratio and average word length.
print("\n" + "="*60)
print("STEP 7: Counting total vocabulary size...")

# Imports repeated here so the cell does not depend on earlier cells
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

text_corpus = """
Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
It helps computers understand human language. Many applications use NLP technology.
Chatbots, translators, and voice assistants all use NLP. The field continues to grow
rapidly with new advancements. Researchers develop new models every year.
These models get better at understanding language context. Language is complex but
computers are learning to process it effectively. NLP makes technology more accessible
to everyone through natural language interfaces.
"""

# Tokenize, then keep lower-cased alphabetic tokens only
tokens = word_tokenize(text_corpus)
words = [token.lower() for token in tokens if token.isalpha()]

# Word -> occurrence count
frequency_dist = FreqDist(words)

# Tokens = every cleaned word; types = distinct cleaned words
total_words = len(words)
vocabulary_size = len(frequency_dist)
type_token_ratio = vocabulary_size / total_words
average_frequency = total_words / vocabulary_size

print("VOCABULARY ANALYSIS")
print("-" * 40)
print(f"Total words in text: {total_words}")
print(f"Vocabulary size (unique words): {vocabulary_size}")
print(f"Type-Token Ratio: {type_token_ratio:.3f}")
print(f"Average word frequency: {average_frequency:.2f}")

# Mean length over all cleaned words (repeated words count each time)
word_lengths = list(map(len, words))
average_length = sum(word_lengths) / len(word_lengths)
print(f"\nAverage word length: {average_length:.2f} characters")


STEP 7: Counting total vocabulary size...
VOCABULARY ANALYSIS
----------------------------------------
Total words in text: 75
Vocabulary size (unique words): 57
Type-Token Ratio: 0.760
Average word frequency: 1.32

Average word length: 6.12 characters


In [10]:
# practical3_step8.py
# Print the complete frequency table as a pandas DataFrame plus a short summary.
print("\n" + "="*60)
print("STEP 8: Displaying frequency table...")

# Imports repeated here so the cell does not depend on earlier cells
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

text_corpus = """
Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
It helps computers understand human language. Many applications use NLP technology.
Chatbots, translators, and voice assistants all use NLP. The field continues to grow
rapidly with new advancements. Researchers develop new models every year.
These models get better at understanding language context. Language is complex but
computers are learning to process it effectively. NLP makes technology more accessible
to everyone through natural language interfaces.
"""

# Tokenize, then keep lower-cased alphabetic tokens only
tokens = word_tokenize(text_corpus)
words = [token.lower() for token in tokens if token.isalpha()]

# Word -> occurrence count
frequency_dist = FreqDist(words)

print("FREQUENCY DISTRIBUTION TABLE")
print("=" * 50)

# One row per distinct word: [word, count, share of all words as "x.xx%"]
freq_data = [
    [word, freq, f"{(freq / len(words)) * 100:.2f}%"]
    for word, freq in frequency_dist.most_common()
]

df = pd.DataFrame(freq_data, columns=['Word', 'Frequency', 'Percentage'])
table_text = df.to_string(index=True)
print(table_text)

# Summary statistics
top_word, _ = frequency_dist.most_common(1)[0]
single_occurrence_words = [
    word for word, freq in frequency_dist.items() if freq == 1
]

print("\n" + "-" * 50)
print("SUMMARY STATISTICS:")
print(f"Total entries in table: {len(frequency_dist)}")
print(f"Most frequent word: '{top_word}'")
print(f"Least frequent words: {single_occurrence_words}")


STEP 8: Displaying frequency table...
FREQUENCY DISTRIBUTION TABLE
             Word  Frequency Percentage
0        language          5      6.67%
1             nlp          4      5.33%
2              to          3      4.00%
3         natural          2      2.67%
4              is          2      2.67%
5           field          2      2.67%
6              it          2      2.67%
7       computers          2      2.67%
8             use          2      2.67%
9      technology          2      2.67%
10            new          2      2.67%
11         models          2      2.67%
12     processing          1      1.33%
13              a          1      1.33%
14    fascinating          1      1.33%
15             of          1      1.33%
16     artificial          1      1.33%
17   intelligence          1      1.33%
18          helps          1      1.33%
19     understand          1      1.33%
20          human          1      1.33%
21           many          1      1.33%
22   applica

In [11]:
# practical3_step9.py
# Break the vocabulary into frequency bands and compare content vs function words.
print("\n" + "="*60)
print("STEP 9: Analyzing frequently appearing words...")

# Imports repeated here so the cell does not depend on earlier cells
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

text_corpus = """
Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
It helps computers understand human language. Many applications use NLP technology.
Chatbots, translators, and voice assistants all use NLP. The field continues to grow
rapidly with new advancements. Researchers develop new models every year.
These models get better at understanding language context. Language is complex but
computers are learning to process it effectively. NLP makes technology more accessible
to everyone through natural language interfaces.
"""

# Tokenize, then keep lower-cased alphabetic tokens only
tokens = word_tokenize(text_corpus)
words = [token.lower() for token in tokens if token.isalpha()]

# Word -> occurrence count
frequency_dist = FreqDist(words)
total_words = len(words)

section_rule = "-" * 40

print("ANALYSIS OF FREQUENT WORDS")
print("=" * 60)

# 1. Words seen three or more times, listed from most to least frequent
print("\n1. High-frequency words (≥ 3 occurrences):")
print(section_rule)
high_freq_words = [
    (word, freq) for word, freq in frequency_dist.items() if freq >= 3
]
by_count_desc = sorted(high_freq_words, key=lambda pair: pair[1], reverse=True)
for word, freq in by_count_desc:
    share = freq / total_words * 100
    print(f"'{word}': {freq} times ({share:.1f}%)")

# 2. Words seen exactly twice
print("\n2. Medium-frequency words (2 occurrences):")
print(section_rule)
med_freq_words = [word for word, freq in frequency_dist.items() if freq == 2]
if med_freq_words:
    print(", ".join(med_freq_words))
else:
    print("None")

# 3. Words seen exactly once (hapax legomena); list is capped at 20 entries
print("\n3. Low-frequency words (1 occurrence - hapax legomena):")
print(section_rule)
low_freq_words = [word for word, freq in frequency_dist.items() if freq == 1]
print(f"Total: {len(low_freq_words)} words")
alphabetical = sorted(low_freq_words)
if len(alphabetical) > 20:
    print("First 20: " + ", ".join(alphabetical[:20]))
else:
    print(", ".join(alphabetical))

# 4. How much of the text the five most frequent words account for
print("\n4. Cumulative frequency analysis:")
print(section_rule)
top_5_words = frequency_dist.most_common(5)
top_5_count = sum(count for _, count in top_5_words)
coverage = top_5_count / total_words * 100
print(f"Top 5 words cover {top_5_count} out of {total_words} words")
print(f"That's {coverage:.1f}% of the text")

# 5. Split every word occurrence into function words (fixed list) vs the rest
print("\n5. Content vs Function words analysis:")
print(section_rule)
function_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'it', 'they', 'he', 'she', 'i', 'you', 'we'}
content_words = [word for word in words if word not in function_words]
# Counted once and reused for both the count line and the ratio
function_word_count = sum(1 for word in words if word in function_words)
print(f"Function words: {function_word_count}")
print(f"Content words: {len(content_words)}")
print(f"Content/Function ratio: {len(content_words)/function_word_count:.2f}")


STEP 9: Analyzing frequently appearing words...
ANALYSIS OF FREQUENT WORDS

1. High-frequency words (≥ 3 occurrences):
----------------------------------------
'language': 5 times (6.7%)
'nlp': 4 times (5.3%)
'to': 3 times (4.0%)

2. Medium-frequency words (2 occurrences):
----------------------------------------
natural, is, field, it, computers, use, technology, new, models

3. Low-frequency words (1 occurrence - hapax legomena):
----------------------------------------
Total: 45 words
First 20: a, accessible, advancements, all, and, applications, are, artificial, assistants, at, better, but, chatbots, complex, context, continues, develop, effectively, every, everyone

4. Cumulative frequency analysis:
----------------------------------------
Top 5 words cover 16 out of 75 words
That's 21.3% of the text

5. Content vs Function words analysis:
----------------------------------------
Function words: 15
Content words: 60
Content/Function ratio: 4.00


In [12]:
# practical3_step10.py
# Persist the analysis in three formats: JSON, CSV and a plain-text summary.
print("\n" + "="*60)
print("STEP 10: Saving output for further NLP processing...")

# Imports repeated here so the cell does not depend on earlier cells
import nltk
import json
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

text_corpus = """
Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
It helps computers understand human language. Many applications use NLP technology.
Chatbots, translators, and voice assistants all use NLP. The field continues to grow
rapidly with new advancements. Researchers develop new models every year.
These models get better at understanding language context. Language is complex but
computers are learning to process it effectively. NLP makes technology more accessible
to everyone through natural language interfaces.
"""

# Tokenize, then keep lower-cased alphabetic tokens only
tokens = word_tokenize(text_corpus)
words = [token.lower() for token in tokens if token.isalpha()]

# Word -> occurrence count
frequency_dist = FreqDist(words)

# Values shared by all three output files
word_total = len(words)
vocab_total = len(frequency_dist)
ranked_pairs = frequency_dist.most_common()

# --- JSON: statistics, frequency mappings, the raw word list and a timestamp ---
text_statistics = {
    "total_characters": len(text_corpus),
    "total_tokens": len(tokens),
    "total_words": word_total,
    "vocabulary_size": vocab_total,
    "type_token_ratio": vocab_total / word_total,
}
output_data = {
    "text_statistics": text_statistics,
    "most_common_words": dict(frequency_dist.most_common(20)),
    "full_frequency": dict(ranked_pairs),
    "word_list": words,
    "analysis_date": pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S"),
}

with open('word_frequency_analysis.json', 'w', encoding='utf-8') as json_file:
    json.dump(output_data, json_file, ensure_ascii=False, indent=2)
print("✓ Analysis saved as 'word_frequency_analysis.json'")

# --- CSV: one row per word with its count and percentage of all words ---
df = pd.DataFrame(ranked_pairs, columns=['Word', 'Frequency'])
percentages = df['Frequency'] / word_total * 100
df['Percentage'] = percentages.round(2)
df.to_csv('word_frequency_table.csv', index=False)
print("✓ Frequency table saved as 'word_frequency_table.csv'")

# --- Text summary: headline numbers followed by the ten most frequent words ---
summary_lines = [
    "WORD FREQUENCY ANALYSIS SUMMARY\n",
    "=" * 50 + "\n\n",
    f"Total words analyzed: {word_total}\n",
    f"Vocabulary size: {vocab_total}\n",
    f"Type-Token Ratio: {vocab_total/word_total:.3f}\n\n",
    "TOP 10 MOST FREQUENT WORDS:\n",
    "-" * 30 + "\n",
]
summary_lines.extend(
    f"{word:15} : {freq:3} ({(freq/word_total*100):.1f}%)\n"
    for word, freq in frequency_dist.most_common(10)
)
summary_lines.append(f"\nAnalysis completed on: {pd.Timestamp.now()}")

with open('analysis_summary.txt', 'w', encoding='utf-8') as report_file:
    report_file.writelines(summary_lines)

print("✓ Summary report saved as 'analysis_summary.txt'")
print("\n" + "="*60)
print("ALL OUTPUT FILES SAVED SUCCESSFULLY!")
saved_files = (
    'word_frequency_analysis.json',
    'word_frequency_table.csv',
    'analysis_summary.txt',
)
for number, filename in enumerate(saved_files, start=1):
    print(f"{number}. {filename}")
print("="*60)


STEP 10: Saving output for further NLP processing...
✓ Analysis saved as 'word_frequency_analysis.json'
✓ Frequency table saved as 'word_frequency_table.csv'
✓ Summary report saved as 'analysis_summary.txt'

ALL OUTPUT FILES SAVED SUCCESSFULLY!
1. word_frequency_analysis.json
2. word_frequency_table.csv
3. analysis_summary.txt


In [13]:
# practical3_complete.py
"""
Practical 3 – Word Frequency and Vocabulary Analysis
Complete implementation of all 10 steps
"""

# Step 1: load the libraries used below
print("STEP 1: Importing libraries...")
import nltk
import matplotlib.pyplot as plt
import pandas as pd
import json
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
print("✓ Libraries imported")

# Step 2: fetch the tokenizer data packages
print("\nSTEP 2: Downloading NLTK datasets...")
for package in ('punkt', 'punkt_tab'):
    nltk.download(package)
print("✓ Datasets downloaded")

# Step 3: a one-sentence corpus keeps this end-to-end run short
print("\nSTEP 3: Loading text corpus...")
text = "Natural Language Processing helps computers understand human language."
print(f"✓ Text loaded ({len(text)} chars)")

# Step 4: tokenize, then keep lower-cased alphabetic tokens only
print("\nSTEP 4: Tokenizing text...")
tokens = word_tokenize(text)
words = [token.lower() for token in tokens if token.isalpha()]
print(f"✓ {len(words)} words tokenized")

# Step 5: word -> occurrence count
print("\nSTEP 5: Calculating frequencies...")
freq_dist = FreqDist(words)
print("✓ Frequency distribution created")

# Step 6: show the five most frequent words
print("\nSTEP 6: Most common words...")
common = freq_dist.most_common(5)
for entry, count in common:
    print(f"  {entry}: {count}")

# Step 7: number of distinct words
vocabulary_count = len(freq_dist)
print(f"\nSTEP 7: Vocabulary size: {vocabulary_count}")

# Step 8: full frequency table, highest count first
print("\nSTEP 8: Frequency table created")
ranked_pairs = freq_dist.most_common()
df = pd.DataFrame(ranked_pairs, columns=['Word', 'Frequency'])
print(df)

# Step 9: only a status line is printed here; no further analysis is run
print("\nSTEP 9: Analysis completed")

# Step 10: write the table from Step 8 to disk
print("\nSTEP 10: Saving files...")
df.to_csv('output.csv', index=False)
print("✓ Output saved to 'output.csv'")

closing_rule = "=" * 50
print("\n" + closing_rule)
print("PRACTICAL 3 COMPLETED SUCCESSFULLY!")
print(closing_rule)

STEP 1: Importing libraries...
✓ Libraries imported

STEP 2: Downloading NLTK datasets...
✓ Datasets downloaded

STEP 3: Loading text corpus...
✓ Text loaded (70 chars)

STEP 4: Tokenizing text...
✓ 8 words tokenized

STEP 5: Calculating frequencies...
✓ Frequency distribution created

STEP 6: Most common words...
  language: 2
  natural: 1
  processing: 1
  helps: 1
  computers: 1

STEP 7: Vocabulary size: 7

STEP 8: Frequency table created
         Word  Frequency
0    language          2
1     natural          1
2  processing          1
3       helps          1
4   computers          1
5  understand          1
6       human          1

STEP 9: Analysis completed

STEP 10: Saving files...
✓ Output saved to 'output.csv'

PRACTICAL 3 COMPLETED SUCCESSFULLY!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
