In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

# Fetch the NLTK resources the pipeline below relies on: the 'punkt' and
# 'punkt_tab' tokenizer data and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

def preprocess_text(text):
    """
    Turn raw text into a list of stemmed, stopword-free word tokens.

    Each token produced by ``word_tokenize`` is lowercased and has every
    character in ``string.punctuation`` deleted. A token is then discarded
    if what remains is not purely alphabetic (this also drops tokens that
    became empty) or if it is an English stopword. Surviving tokens are
    reduced to their Porter stem.

    Args:
        text (str): Input text to preprocess

    Returns:
        list: Stemmed tokens, in the order they appeared in the text
    """
    raw_tokens = word_tokenize(text)

    # Resources shared by every token; built once per call.
    punctuation_table = str.maketrans('', '', string.punctuation)
    english_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    result = []
    for raw_token in raw_tokens:
        # Lowercase first, then delete punctuation characters.
        cleaned = raw_token.lower().translate(punctuation_table)

        # Skip numbers, mixed alphanumerics and tokens that were pure punctuation.
        if not cleaned.isalpha():
            continue

        # Skip stopwords.
        if cleaned in english_stopwords:
            continue

        result.append(stemmer.stem(cleaned))

    return result

# Demo input; swap in your case study text to preprocess something else.
sample_text = """
Natural language processing (NLP) is a subfield of linguistics, computer science,
and artificial intelligence concerned with the interactions between computers
and human language. It focuses on how to program computers to process and
analyze large amounts of natural language data.
"""

# Run the full pipeline on the demo input.
processed_tokens = preprocess_text(sample_text)

# Report the raw text, the resulting tokens, and how many tokens remain.
print("Original Text:", sample_text, sep="\n")
print("\nPreprocessed Tokens:", processed_tokens, sep="\n")
print(f"\nNumber of tokens: {len(processed_tokens)}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...


Original Text:

Natural language processing (NLP) is a subfield of linguistics, computer science, 
and artificial intelligence concerned with the interactions between computers 
and human language. It focuses on how to program computers to process and 
analyze large amounts of natural language data.


Preprocessed Tokens:
['natur', 'languag', 'process', 'nlp', 'subfield', 'linguist', 'comput', 'scienc', 'artifici', 'intellig', 'concern', 'interact', 'comput', 'human', 'languag', 'focus', 'program', 'comput', 'process', 'analyz', 'larg', 'amount', 'natur', 'languag', 'data']

Number of tokens: 25


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
