# Part1

In [None]:
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK resources used below: tokenizer models (word_tokenize / sent_tokenize),
# the stop-word lists, and the WordNet data used by WordNetLemmatizer.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer models from 'punkt_tab' instead of 'punkt';
# without it word_tokenize / sent_tokenize raise LookupError on those versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Precondition: `df` must already be defined as a pandas DataFrame with a string column named "text".


# Task1

In [None]:
# Preview the first raw documents and their character lengths.
print(df["text"].head())
# .str.len() is vectorized and yields NaN for missing entries, where apply(len) would
# raise TypeError on a NaN value.
print(df["text"].str.len().head())


# Task2

In [None]:
def basic_clean(text):
    """Lowercase ``text`` and strip digits, ASCII punctuation and surplus whitespace.

    The steps run in this order: lowercase, delete digit runs, delete every
    character listed in ``string.punctuation``, then collapse each whitespace
    run to a single space and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punct = without_digits.translate(punctuation_remover)
    return re.sub(r'\s+', ' ', without_punct).strip()

# Derive the basic-cleaned column, then show it next to the raw text for comparison.
df["clean_text_basic"] = df["text"].map(basic_clean)
df[["text", "clean_text_basic"]].head()


# Part2

# Task3

In [None]:
def advanced_clean(text):
    """Strip HTML tags, URLs, e-mail addresses and all non-letter characters.

    Tags, URLs and e-mail addresses are replaced by a space so the words on
    either side stay separate; whitespace runs are collapsed to one space and
    the ends are trimmed before returning.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, containing only ASCII letters and single spaces.
    """
    # Tags go first: the URL and e-mail patterns run to the next whitespace and
    # would otherwise swallow closing tags and link text along with the address.
    text = re.sub(r'<.*?>', ' ', text)
    # \b keeps the URL patterns from firing inside ordinary words (e.g. "awwww").
    text = re.sub(r'\bhttp\S+|\bwww\S+', ' ', text)
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# The e-mail and HTML patterns depend on '@', '<' and '>', which basic_clean strips as
# punctuation, so advanced_clean has to see the raw text first; basic_clean then
# lowercases the result and tidies the whitespace.
df["clean_text_advanced"] = df["text"].apply(advanced_clean).apply(basic_clean)


# Task4

In [None]:
# English stop-word list, held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))

# Keep only the whitespace-separated tokens that are not stop words.
df["text_no_stopwords"] = df["clean_text_advanced"].map(
    lambda sentence: " ".join(token for token in sentence.split() if token not in stop_words)
)


# Part3

# Task5

In [None]:
# Slang token -> standard spelling.
slang = {
    "u": "you",
    "gr8": "great",
}


def normalize(text):
    """Shorten stretched characters and expand known slang tokens.

    Any run of two or more identical characters is reduced to exactly two
    ("soooo" -> "soo"). The text is then split on whitespace, each token found
    in ``slang`` is replaced by its mapped value, and the tokens are re-joined
    with single spaces.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    squeezed = re.sub(r'(.)\1+', r'\1\1', text)
    expanded = []
    for token in squeezed.split():
        expanded.append(slang.get(token, token))
    return " ".join(expanded)

# Slang expansion can reintroduce stop words after they were filtered out (the table
# maps "u" to "you"), so the stop-word filter is applied once more to the result.
# NOTE(review): digits are stripped upstream (basic_clean and advanced_clean), so a key
# such as "gr8" can never match on this input — consider normalizing before cleaning.
df["normalized_text"] = df["text_no_stopwords"].apply(normalize).apply(
    lambda x: " ".join([w for w in x.split() if w not in stop_words])
)


# Task6

In [None]:
# Word tokens are built from the normalized text; sentence tokens are built from the
# raw "text" column, which still carries its original punctuation.
df["word_tokens"] = df["normalized_text"].map(word_tokenize)
df["sentence_tokens"] = df["text"].map(sent_tokenize)

# Show the first three rows of both token columns.
df[["word_tokens", "sentence_tokens"]].head(3)


# Part4

# Task7

In [None]:
# Replace every word token with its Porter stem, keeping one list per row.
stemmer = PorterStemmer()
df["stemmed"] = df["word_tokens"].map(lambda tokens: list(map(stemmer.stem, tokens)))


# Task8

In [None]:
# Replace every word token with its WordNet lemma, keeping one list per row.
# lemmatize() is called without a part-of-speech argument, so its default is used.
lemmatizer = WordNetLemmatizer()
df["lemmatized"] = df["word_tokens"].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)


# Part5

# Task9

In [None]:
def nlp_preprocess(text):
    """Run the full cleaning pipeline on one raw document and return one string.

    Steps: lowercase; drop HTML tags, URLs and e-mail addresses; delete every
    character that is not a-z or whitespace; tokenize with ``word_tokenize``;
    remove tokens found in ``stop_words``; lemmatize the rest with
    ``lemmatizer``. ``stop_words`` and ``lemmatizer`` are the notebook-level
    objects created in earlier cells.

    Args:
        text: The raw document string.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    text = text.lower()
    # Tags go first: the URL and e-mail patterns run to the next whitespace and would
    # otherwise swallow closing tags and link text. Replacing with a space keeps the
    # neighbouring words apart ("hello<br>world" -> "hello world").
    text = re.sub(r'<.*?>', ' ', text)
    # \b keeps the URL patterns from matching inside ordinary words such as "awwww".
    text = re.sub(r'\bhttp\S+|\bwww\S+|\S+@\S+', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

# Run the end-to-end pipeline on every raw document.
df["final_clean_text"] = df["text"].map(nlp_preprocess)


# Task10

1. Difference between basic & advanced cleaning

Basic cleaning lowercases the text, removes punctuation and numbers, and collapses extra whitespace.

Advanced cleaning additionally removes noise such as URLs, e-mail addresses, emojis, HTML tags and stop words, and normalizes words, which yields much cleaner, model-ready text.

2. Why lemmatization is preferred over stemming

Stemming strips suffixes with fixed rules and can produce non-words (e.g., studies → studi).

Lemmatization returns real dictionary words (e.g., studies → study), preserving meaning.

3. Importance of preprocessing in NLP

- Reduces noise
- Improves model accuracy
- Reduces vocabulary size
- Makes text consistent and meaningful