## OCR — Optical Character Recognition

In [None]:

from PIL import Image
import pytesseract

# Run OCR on an image file and print the recognized English text.
# Replace 'nlp_image.png' with the name of the image you want to read.
try:
    # Open the image as a context manager so the underlying file handle is
    # released as soon as OCR finishes, even if pytesseract raises.
    with Image.open("nlp_image.png") as img:
        text = pytesseract.image_to_string(img, lang="eng")
    print("Распознанный текст:")
    print(text)
except Exception as e:
    # Best-effort demo cell: report the failure instead of aborting the run.
    print("Ошибка при OCR:", e)


## Speech Recognition — Clean sound

In [None]:

import speech_recognition as sr

# Transcribe a clean WAV recording and print the result.
recognizer = sr.Recognizer()
try:
    # Read the entire file into memory first ...
    with sr.AudioFile("clean_audio.wav") as wav_source:
        audio = recognizer.record(wav_source)
    # ... then send the captured audio to the Google recognizer (US English).
    text = recognizer.recognize_google(audio, language="en-US")
    print("Распознанный текст (clean):", text)
except Exception as err:
    # Any failure (missing file, unintelligible audio, request error) is
    # reported rather than raised.
    print("Ошибка при распознавании clean audio:", err)


## Speech Recognition — Noisy sound

In [None]:

import speech_recognition as sr

# Transcribe a noisy WAV recording and print the result.
recognizer = sr.Recognizer()
try:
    # Calibrate the recognizer's energy threshold against the ambient noise
    # level. This reads audio from the source (up to 1 second by default), so
    # it is done in its own pass over the file. Note that it only tunes the
    # threshold; it does not filter noise out of the recording.
    with sr.AudioFile("noisy_audio.wav") as source:
        recognizer.adjust_for_ambient_noise(source)
    # Re-open the file so record() starts from the beginning. Recording from
    # the already-advanced source would drop the audio consumed during
    # calibration from the transcript.
    with sr.AudioFile("noisy_audio.wav") as source:
        audio = recognizer.record(source)
    text = recognizer.recognize_google(audio, language="en-US")
    print("Распознанный текст (noisy):", text)
except Exception as e:
    # Best-effort demo cell: report the failure instead of aborting the run.
    print("Ошибка при распознавании noisy audio:", e)


## Scraping — Web page parsing

In [None]:

import requests
from bs4 import BeautifulSoup

# Download a Wikipedia article and print the text of its first five <p> tags.
url = "https://en.wikipedia.org/wiki/Natural_language_processing"
try:
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=10)
    # Fail on 4xx/5xx responses instead of parsing an error page as content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")
    for p in paragraphs[:5]:
        print(p.get_text())
except Exception as e:
    # Covers connection errors, timeouts and HTTP error statuses alike.
    print("Ошибка при web scraping:", e)


## Text preprocessing

In [None]:

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK resources used below: tokenizer models and stop-word lists.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer data from 'punkt_tab'
# rather than 'punkt', so fetch it as well for word_tokenize to work there.
nltk.download('punkt_tab')
nltk.download('stopwords')

text = "This is an example sentence, showing off the stop words filtration and stemming process!"

# Tokenization: lowercase the text, then split it into word/punctuation tokens.
tokens = nltk.word_tokenize(text.lower())

# Stop-word removal: load the stop-word list once into a set so each token is
# checked with a constant-time lookup instead of re-reading the list per token.
# Non-alphabetic tokens (punctuation) are dropped as well.
stop_words = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in stop_words and w.isalpha()]

# Stemming: reduce each remaining word to its Porter stem.
stemmer = PorterStemmer()
stems = [stemmer.stem(w) for w in filtered]

print("Исходный текст:", text)
print("Токены:", tokens)
print("После удаления стоп-слов:", filtered)
print("После стемминга:", stems)
