<a href="https://colab.research.google.com/github/ar19z/TextAnalysis/blob/main/Intro_to_Text_Analysis_with_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Welcome to Intro to Text Analysis with Python!**
Today, we're going to act as literary detectives. We have a collection of 26 classic British novels, and we'll use a Python "recipe" to uncover some interesting patterns within them.

# Step 1: Setup Our Kitchen
First, we need to get our kitchen ready. This code block does three things:

1. Imports the specialized tools (libraries) we need.
2. Downloads the raw ingredients (our 26 novels) from the web.
3. Unzips the file so we can access the novels.

Click the "play" button on the left to run this cell.

In [None]:
# Import our "kitchen tools" (libraries)
import requests      # For downloading files from the internet
import zipfile       # For unzipping files
import io            # For handling the downloaded data
import os            # For navigating files and folders
import shutil        # For copying, moving, deleting, and archiving files or folders
import re            # For removing punctuation using regular expressions
import nltk          # The main tool for natural language processing
import matplotlib.pyplot as plt # For making charts and graphs

# This command downloads the specific tools we need from NLTK
nltk.download('punkt_tab')   # tokenizer data used by word_tokenize
nltk.download('stopwords')   # lists of common "filler" words

# Where the novels live on GitHub. BRANCH holds a commit hash rather than a
# branch name, so the URL below always points at the same snapshot of the repo.
USER   = "ar19z"
REPO   = "TextAnalysis"
BRANCH = "b9fcd9ddb42004cca0804868ae2d4650cee3c4d4"  # commit hash
SUBDIR = "A_Small_Collection_of_British_Fiction-master/corpus"

# Download the repo at that commit
url = f"https://github.com/{USER}/{REPO}/archive/{BRANCH}.zip"
# Without a timeout, a stalled connection would leave this cell hanging forever
r = requests.get(url, timeout=60)
r.raise_for_status()  # stop here with a clear error if the download failed

# Clean old target so files from a previous run can't mix with the new ones
target = "/content/corpus"
if os.path.exists(target):
    shutil.rmtree(target)
os.makedirs(target, exist_ok=True)

# Extract only SUBDIR
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    # Every path in the archive starts with this folder prefix
    prefix = f"{REPO}-{BRANCH}/{SUBDIR}/"
    names = [n for n in z.namelist() if n.startswith(prefix)]
    if not names:
        raise SystemExit(f"Folder '{SUBDIR}' not found. Check the path.")

    target_root = os.path.realpath(target)
    for n in names:
        if n.endswith("/"):
            continue                      # directory entry, nothing to write
        rel = n[len(prefix):]             # strip the prefix
        out_path = os.path.join(target, rel)

        # Safety check: never write outside the target folder, even if the
        # archive contains a path with ".." in it
        if os.path.commonpath([target_root, os.path.realpath(out_path)]) != target_root:
            raise SystemExit(f"Refusing to extract '{n}' outside of '{target}'.")

        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        with z.open(n) as src, open(out_path, "wb") as dst:
            shutil.copyfileobj(src, dst)  # stream the file instead of loading it all at once

extracted = sorted(os.listdir(target))
print("Extracted to:", target)
print("Count:", len([f for f in extracted if f.endswith('.txt')]))
print("Sample:", extracted[:5])

# Only announce success once the novels are actually on disk
print("Setup complete! All tools and ingredients are ready.")

# Step 2: Point to Our Ingredients
Now we need to tell our recipe where to find the ingredients. The setup step placed the novels in a folder called `/content/corpus`.

If you wanted to use your *own* .txt files, you would upload them to Colab, put them in a folder, and just change the path in the line below!

In [None]:
# <<< STUDENTS: THIS IS THE LINE YOU WOULD CHANGE FOR YOUR OWN TEXTS >>>
folder_path = "/content/corpus"

# Let's list the files to make sure they're there.
# os.listdir() returns names in no particular order, so we sort them to get
# the same "first 5" every time the cell is run.
file_list = sorted(os.listdir(folder_path))
print(f"Found {len(file_list)} files in the folder.")
print("Here are the first 5:")
print(file_list[:5])

# Step 3: Prep the Ingredients (Preprocessing)
Raw text is messy! It has capital letters, punctuation, and lots of common "filler" words (like 'the', 'a', 'is', 'of'). To get a good analysis, we need to clean it up. This is our "text preprocessing" step.

This block of code will:



1.   Read each text file.
2.   Make all text lowercase.
3.   Remove punctuation (and anything else that isn't a letter).
4.   Break the text into a list of individual words (tokenize).
5.   Remove common English stopwords and very short words.

The result will be a clean list of "significant" words for each novel.

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Get the list of common English "filler" words
# (a set makes the "is this a stopword?" check fast)
stop_words = set(stopwords.words('english'))

# We'll store all our processed words from all novels in this one giant list
all_processed_words = []
# We'll also keep each novel's original, raw text and join them at the end
raw_texts = []


print("Processing all novels... this might take a moment.")

# Loop through every file in our folder (sorted, so the order is the same on every run)
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.txt'):  # Make sure we're only reading text files
        continue
    file_path = os.path.join(folder_path, file_name)

    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
    raw_texts.append(text)  # Keep the raw text for later

    # 1. Make text lowercase
    text = text.lower()

    # 2. Remove punctuation: anything that is not a letter or whitespace
    #    becomes a space. Using a space (not nothing) keeps neighbouring
    #    words apart, so "word--word" gives two words instead of "wordword"
    #    and "don't" gives "don" + "t" instead of the made-up word "dont".
    text = re.sub(r'[^a-z\s]', ' ', text)

    # 3. Tokenize (split text into a list of words)
    words = word_tokenize(text)

    # 4. Remove stopwords and short words
    processed_words = [word for word in words if word not in stop_words and len(word) > 2]

    # Add the cleaned words from this book to our master list
    all_processed_words.extend(processed_words)

# Join the novels with a line break so the last word of one book and the
# first word of the next can never run together
raw_text_corpus = "\n".join(raw_texts)

print("\n Processing complete!")
print(f"We have a total of {len(all_processed_words)} significant words across all {len(raw_texts)} novels.")
print("\nHere's a sample of the first 20 processed words:")
print(all_processed_words[:20])

# Step 4: The First Course - Word Frequency
Now that our ingredients are prepped, let's do our first analysis! We'll count every single word to see which ones appear most often across all the novels.

We'll use NLTK's FreqDist (Frequency Distribution) tool to do the counting and then plot the top 20 words on a bar chart.

In [None]:
# Tally how many times each cleaned word occurs
freq_dist = nltk.FreqDist(all_processed_words)

# Keep only the 20 (word, count) pairs with the highest counts
most_common_words = freq_dist.most_common(20)

print("Top 20 Most Common Words:")
for word, count in most_common_words:
    print(f"{word}: {count}")

# Time for a picture!
# zip(*pairs) "unzips" the (word, count) pairs into one tuple of words
# and one tuple of counts, which is the shape the bar chart needs
words_to_plot, counts_to_plot = zip(*most_common_words)

# Draw the bar chart on a slightly larger-than-default canvas
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(words_to_plot, counts_to_plot, color='skyblue')
ax.set_title('Top 20 Most Frequent Words in 26 British Novels')
ax.set_xlabel('Words')
ax.set_ylabel('Frequency (Count)')
ax.tick_params(axis='x', labelrotation=45)  # Tilt the word labels so they don't overlap
plt.show()

# Step 5: The Main Course - Finding Context (Concordance)
A word count is useful, but context is everything. How are these words actually being used? A concordance shows us every occurrence of a given word, along with the words that surround it.

This allows us to see patterns in how language is used. We'll load our raw, unprocessed text into a special NLTK object to generate the concordance.

Try changing the word in .concordance("money") to other words like "love", "man", "woman", or "house" and re-running the cell!

In [None]:
# The concordance tool belongs to NLTK's Text object, which is built from a list of tokens.
# So: tokenize the raw (unprocessed) corpus, then wrap the tokens in a Text object.
raw_tokens = nltk.word_tokenize(raw_text_corpus)
nltk_text = nltk.Text(raw_tokens)

# Display settings shared by both searches below
concordance_options = {"width": 80, "lines": 25}

print("Displaying concordance for the word 'money':\n")
# concordance() prints its results itself, so there is nothing to capture
nltk_text.concordance("money", **concordance_options)

print("\n\nNow, let's try 'love':\n")
nltk_text.concordance("love", **concordance_options)

In [None]:
import glob

# Collect the path of every .txt novel, in alphabetical order
files = sorted(glob.glob("/content/corpus/*.txt"))

# Visual separator printed before and after each novel's results
divider = "=" * 80

# Handle one novel at a time so each concordance only shows matches from that book
for novel_path in files:
    with open(novel_path, "r", encoding="utf-8", errors="ignore") as handle:
        raw_text = handle.read()

    # Tokenize this novel alone and wrap it in its own Text object
    tokens = nltk.word_tokenize(raw_text)
    nltk_text = nltk.Text(tokens)

    print(divider)
    print(f"Concordance for 'money' in {novel_path}:\n")
    nltk_text.concordance("money", width=80, lines=5)  # raise `lines` to see more matches

    print("\nConcordance for 'love':\n")
    nltk_text.concordance("love", width=80, lines=5)
    print(divider, "\n")

In [None]:
import glob
import os
from pathlib import Path

import nltk
import pandas as pd

# Ensure tokenizer data is available. Recent NLTK releases need 'punkt_tab'
# for word_tokenize; older ones use 'punkt'. Requesting both lets this cell
# run on its own with either.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

CORPUS_DIR = "/content/corpus"  # change if needed
files = sorted(glob.glob(os.path.join(CORPUS_DIR, "*.txt")))
assert files, f"No .txt files found in {CORPUS_DIR}"

# Build one row of statistics per novel
rows = []
for fp in files:
    with open(fp, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()
    # Lowercase every token so "Love" and "love" are counted together
    tokens = [t.lower() for t in nltk.word_tokenize(raw)]
    total = len(tokens)

    love_count  = tokens.count("love")
    money_count = tokens.count("money")

    # Metrics
    diff  = love_count - money_count
    # A ratio is undefined when 'money' never appears; record NaN ("not a
    # number") instead of dividing by a tiny value, which would produce a
    # meaningless figure in the billions
    ratio = love_count / money_count if money_count else float("nan")
    # Occurrences per 10,000 tokens, so long and short novels can be compared
    love_per_10k  = (love_count  / total) * 10000 if total else 0
    money_per_10k = (money_count / total) * 10000 if total else 0

    rows.append({
        "title": Path(fp).stem,
        "path": fp,
        "tokens": total,
        "love": love_count,
        "money": money_count,
        "love_minus_money": diff,
        "love_to_money_ratio": ratio,
        "love_per_10k": love_per_10k,
        "money_per_10k": money_per_10k
    })

df = pd.DataFrame(rows)

# Winners
by_diff  = df.sort_values("love_minus_money", ascending=False).reset_index(drop=True)
# Novels that never mention 'money' have no meaningful ratio, so leave them out
by_ratio = df[df["money"] > 0].sort_values("love_to_money_ratio", ascending=False).reset_index(drop=True)

print("=== Winner by difference (love - money) ===")
print(by_diff.loc[0, ["title", "love", "money", "love_minus_money"]], "\n")

if not by_ratio.empty:
    print("=== Winner by ratio (love / money) ===")
    print(by_ratio.loc[0, ["title", "love", "money", "love_to_money_ratio"]], "\n")
else:
    print("No occurrences of 'money' found; ratio ranking not applicable.\n")

print("Top 10 by difference:")
display(by_diff[["title","love","money","love_minus_money","love_per_10k","money_per_10k"]].head(10))

print("Top 10 by ratio (excluding zero-money):")
if not by_ratio.empty:
    display(by_ratio[["title","love","money","love_to_money_ratio","love_per_10k","money_per_10k"]].head(10))
else:
    print("(none)")

In [None]:
import glob
import os
from pathlib import Path

import nltk
import pandas as pd

# Make sure NLTK tokenizer data is ready. Recent NLTK releases need
# 'punkt_tab' for word_tokenize; older ones use 'punkt'. Requesting both lets
# this cell run on its own with either.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# === Set your words here ===
WORD1 = "chicken"
WORD2 = "alfredo"

CORPUS_DIR = "/content/corpus"  # change if needed
files = sorted(glob.glob(os.path.join(CORPUS_DIR, "*.txt")))
assert files, f"No .txt files found in {CORPUS_DIR}"

# Tokens are lowercased below, so lowercase the search words once up front
word1 = WORD1.lower()
word2 = WORD2.lower()

# Column names used for the two comparison metrics
diff_col  = f"{WORD1}_minus_{WORD2}"
ratio_col = f"{WORD1}_to_{WORD2}_ratio"

# Build one row of statistics per novel
rows = []
for fp in files:
    with open(fp, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()
    tokens = [t.lower() for t in nltk.word_tokenize(raw)]
    total = len(tokens)

    w1_count = tokens.count(word1)
    w2_count = tokens.count(word2)

    diff  = w1_count - w2_count
    # A ratio is undefined when the second word never appears; record NaN
    # ("not a number") instead of dividing by a tiny value, which would
    # produce a meaningless figure in the billions
    ratio = w1_count / w2_count if w2_count else float("nan")
    # Occurrences per 10,000 tokens, so long and short novels can be compared
    w1_per_10k = (w1_count / total) * 10000 if total else 0
    w2_per_10k = (w2_count / total) * 10000 if total else 0

    rows.append({
        "title": Path(fp).stem,
        "tokens": total,
        WORD1: w1_count,
        WORD2: w2_count,
        diff_col: diff,
        ratio_col: ratio,
        f"{WORD1}_per_10k": w1_per_10k,
        f"{WORD2}_per_10k": w2_per_10k
    })

df = pd.DataFrame(rows)

# Ranking by difference and ratio
by_diff  = df.sort_values(diff_col, ascending=False).reset_index(drop=True)
# Novels that never use the second word have no meaningful ratio, so leave them out
by_ratio = df[df[WORD2] > 0].sort_values(ratio_col, ascending=False).reset_index(drop=True)

print(f"=== Winner by difference ({WORD1} - {WORD2}) ===")
print(by_diff.loc[0, ["title", WORD1, WORD2, diff_col]], "\n")

if not by_ratio.empty:
    print(f"=== Winner by ratio ({WORD1} / {WORD2}) ===")
    print(by_ratio.loc[0, ["title", WORD1, WORD2, ratio_col]], "\n")
else:
    print(f"No occurrences of '{WORD2}' found; ratio ranking not applicable.\n")

print("Top 10 by difference:")
display(by_diff.head(10))

print("Top 10 by ratio (excluding zero-counts for second word):")
if not by_ratio.empty:
    display(by_ratio.head(10))
else:
    print("(none)")