# In [None]:
#22. Text Preprocessing – Cleaning, Lowercase, Tokenization, Stopwords, Spelling Correction
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob

# Fetch the tokenizer models and the stopword lists used below.
# NOTE(review): recent NLTK releases may additionally require the
# 'punkt_tab' resource for word_tokenize -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

with open("text.txt", "r") as file:
    text = file.read()

# Cleaning: drop everything that is not a letter or whitespace, then collapse
# each whitespace run to a single space. The `\s` escapes matter: without the
# backslash the first pattern also deletes every space (gluing all words into
# one token) and the second turns runs of the letter "s" into spaces.
text = re.sub(r'[^a-zA-Z\s]', '', text)
text = re.sub(r'\s+', ' ', text)
text = text.lower()
tokens = word_tokenize(text)

# Build the stopword set once; calling stopwords.words() inside the
# comprehension would re-evaluate the whole list for every token.
stop_words = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in stop_words]

# Spelling correction is applied word by word via TextBlob.
corrected = [str(TextBlob(word).correct()) for word in filtered]

print(corrected)


#23. Text Preprocessing – Cleaning, Lowercase, Stemming, Lemmatization, 3-Word Phrases
from nltk.stem import PorterStemmer, WordNetLemmatizer
# WordNet data is needed by WordNetLemmatizer.
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

with open("text.txt", "r") as file:
    text = file.read()

# Cleaning: keep only letters and whitespace, then collapse whitespace runs.
# The `\s` escapes are required -- without the backslash the first pattern
# strips every space and the second replaces the letter "s" with spaces.
text = re.sub(r'[^a-zA-Z\s]', '', text)
text = re.sub(r'\s+', ' ', text)
text = text.lower()
tokens = word_tokenize(text)

# Stem every token, then run the lemmatizer over the stems.
stemmed = [stemmer.stem(w) for w in tokens]
lemmatized = [lemmatizer.lemmatize(w) for w in stemmed]

# Sliding window of three consecutive words; zip stops at the shortest slice,
# so fewer than three tokens simply yields an empty list.
three_grams = [
    ' '.join(gram)
    for gram in zip(lemmatized, lemmatized[1:], lemmatized[2:])
]
print(three_grams)


#24. NLP – One-Hot Encoding from 3 Text Files
from sklearn.preprocessing import OneHotEncoder

# Concatenate the three input files into one corpus string, separating the
# file contents with a space so words at file boundaries do not merge.
corpus = ""
for filename in ["file1.txt", "file2.txt", "file3.txt"]:
    with open(filename, "r") as file:
        corpus += file.read() + " "

# Keep only letters and whitespace. The `\s` escape is required: without the
# backslash all whitespace is removed and split() sees at most one giant "word".
cleaned = re.sub(r'[^a-zA-Z\s]', '', corpus.lower())

# Unique vocabulary, sorted so the row order of the encoding is the same on
# every run (plain set iteration order varies between interpreter runs).
words = sorted(set(cleaned.split()))

# Use the encoder's default sparse output and densify explicitly: the `sparse`
# keyword was renamed to `sparse_output` in newer scikit-learn releases, so
# passing either name breaks on some versions, while .toarray() works on all.
encoder = OneHotEncoder()
encoded = encoder.fit_transform([[w] for w in words]).toarray()

print("Words:", words)
print("One-hot Encoding:")
print(encoded)


#25. NLP – Bag of Words for Movie Reviews (3 Text Files)
from sklearn.feature_extraction.text import CountVectorizer

# Read every review file; each file becomes one document of the corpus.
review_paths = ("review1.txt", "review2.txt", "review3.txt")
documents = []
for file_name in review_paths:
    with open(file_name, "r") as f:
        review_text = f.read()
    documents.append(review_text)

# Learn the vocabulary first, then count term occurrences per document.
vectorizer = CountVectorizer()
vectorizer.fit(documents)
X = vectorizer.transform(documents)

# One row per review, one column per vocabulary term.
print("Bag of Words Matrix:", X.toarray(), sep="\n")
print("Vocabulary:", vectorizer.get_feature_names_out(), sep="\n")

#26. NLP – TF-IDF for Tourist Place Descriptions (3 Text Files)
from sklearn.feature_extraction.text import TfidfVectorizer

# Read every place description; each file becomes one document of the corpus.
place_paths = ("place1.txt", "place2.txt", "place3.txt")
documents = []
for file_name in place_paths:
    with open(file_name, "r") as f:
        description = f.read()
    documents.append(description)

# Fit the vocabulary and IDF weights, then compute the TF-IDF matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Densify once for display and fetch the learned vocabulary terms.
dense_matrix = X.toarray()
feature_names = vectorizer.get_feature_names_out()

print("TF-IDF Matrix:", dense_matrix, sep="\n")
print("Vocabulary:", feature_names, sep="\n")