# Part1

# Task1

In [None]:
import pandas as pd

# Read the raw corpus from disk; the cells below expect a "text" column in it.
raw_csv = "text_data.csv"
df = pd.read_csv(raw_csv)

# Preview the first five entries of the text column.
df["text"].head(5)


In [None]:
# Show the first five raw samples.
print(df["text"].head(5))

# Character count of every entry. Values are stringified first, so a
# non-string cell is measured by the length of its string representation.
text_as_str = df["text"].astype(str)
df["text_length"] = text_as_str.map(len)
df[["text", "text_length"]].head(5)


# Task2

In [None]:
import re

def basic_clean(text):
    """Lowercase `text` and strip punctuation, stand-alone numbers and extra whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string: lowercase, without punctuation or underscores,
        without purely numeric tokens, and with single spaces between words.
    """
    text = text.lower()
    # `\w` also matches "_", so underscores have to be removed explicitly.
    text = re.sub(r"[^\w\s]|_", "", text)
    # Drop only whole numeric tokens. Digits embedded in a word (e.g. "gr8")
    # are kept so the slang lookup in Task 5 can still recognise the token.
    text = re.sub(r"\b\d+\b", "", text)
    # Collapse the gaps left behind by the removals above.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Treat missing values as empty text. `astype(str)` on its own can turn a
# missing value into the literal word "nan", which would then survive every
# later cleaning step as if it were real content.
df["clean_text_basic"] = df["text"].fillna("").astype(str).apply(basic_clean)

df[["text", "clean_text_basic"]].head()


# Part2

# Task3

In [None]:
def advanced_clean(text):
    """Remove web artifacts (HTML tags, URLs, e-mail addresses) and special characters.

    Args:
        text: The string to clean.

    Returns:
        `text` without HTML tags, URLs, e-mail addresses or non-word
        characters, with whitespace collapsed to single spaces and trimmed.
    """
    # Tags go first: otherwise the greedy URL pattern below would swallow the
    # rest of an element such as <a href="http://x">label</a>, label included.
    # `[^>]` also matches newlines, so tags spanning several lines are caught,
    # and the space replacement keeps "a<br>b" from fusing into "ab".
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"http\S+|www\.\S+", "", text)   # URLs, with or without scheme
    text = re.sub(r"\S+@\S+", "", text)            # e-mail addresses
    text = re.sub(r"[^\w\s]", "", text)            # remaining special characters
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Run the advanced pass on the RAW text, then the basic pass. E-mail addresses
# and HTML tags can only be recognised while their punctuation ("@", "<", ">")
# is intact, and `basic_clean` strips exactly those characters, so cleaning
# `clean_text_basic` here would leave their remnants behind as ordinary words.
df["clean_text_advanced"] = (
    df["text"].fillna("").astype(str).apply(advanced_clean).apply(basic_clean)
)


# Task4

In [None]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# English stop-word list, held as a set for fast membership checks.
stop_words = set(stopwords.words("english"))


def _without_stopwords(sentence):
    """Return `sentence` with every whitespace-separated token in `stop_words` dropped."""
    kept = [token for token in sentence.split() if token not in stop_words]
    return " ".join(kept)


df["text_no_stopwords"] = df["clean_text_advanced"].apply(_without_stopwords)


# Task5

In [None]:
def normalize_repeats(text):
    """Shorten runs of three or more identical characters to exactly two.

    Elongated words such as "sooooo" become "soo". Digit runs are left
    untouched so numeric tokens keep their value ("1000" is not rewritten
    to "100").

    Args:
        text: The string to normalise.

    Returns:
        `text` with every run of 3+ repeats of one non-digit character
        reduced to two occurrences.
    """
    # `[^\d\n]` instead of `.`: same as before (newlines never match) except
    # that digits are now excluded from collapsing.
    return re.sub(r"([^\d\n])\1{2,}", r"\1\1", text)

# Chat-speak abbreviations mapped to their full spellings.
slang = {"u": "you", "gr8": "great", "pls": "please"}


def replace_slang(text):
    """Expand every whitespace-separated token of `text` that is a key of `slang`.

    Tokens without an entry pass through unchanged; the result is re-joined
    with single spaces.
    """
    expanded = []
    for token in text.split():
        expanded.append(slang[token] if token in slang else token)
    return " ".join(expanded)

# Slang expansion can re-introduce stop words (e.g. "u" -> "you"), so the
# stop-word filter is applied once more after normalisation to keep this
# column stop-word free like the `text_no_stopwords` column it derives from.
df["text_normalized"] = (
    df["text_no_stopwords"]
    .apply(normalize_repeats)
    .apply(replace_slang)
    .apply(lambda x: " ".join(w for w in x.split() if w not in stop_words))
)


# Part3

# Task6

In [None]:
# Recent NLTK releases load the tokenizer models from the "punkt_tab"
# resource, older ones from "punkt". Fetch both so the tokenizers work
# whichever version is installed.
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize import word_tokenize, sent_tokenize

df["word_tokens"] = df["text_normalized"].apply(word_tokenize)
# Sentence splitting uses the raw column. Missing values are replaced by empty
# strings first so `sent_tokenize` always receives a str.
df["sentence_tokens"] = df["text"].fillna("").astype(str).apply(sent_tokenize)

df[["word_tokens", "sentence_tokens"]].head(3)


# Task7

In [None]:
from nltk.stem import PorterStemmer
# One Porter stemmer instance, reused for every row.
ps = PorterStemmer()

# Stem each token of each row's token list, preserving token order.
df["stemmed"] = df["word_tokens"].apply(
    lambda tokens: list(map(ps.stem, tokens))
)


# Task8

In [None]:
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer

# WordNet-backed lemmatizer; `lemmatize` is called with the token only, so it
# runs with its default settings.
lem = WordNetLemmatizer()
df["lemmatized"] = df["word_tokens"].apply(
    lambda tokens: list(map(lem.lemmatize, tokens))
)


# Task9

In [None]:
def nlp_preprocess(text):
    """Run the full cleaning pipeline on one raw document.

    Steps, in order: web/markup removal (`advanced_clean`), basic
    normalisation (`basic_clean`), stop-word filtering, lemmatization.

    Args:
        text: Raw input string.

    Returns:
        The processed tokens joined by single spaces.
    """
    # `advanced_clean` must see the raw text: `basic_clean` strips the
    # punctuation ("@", "<", ">") that the e-mail and HTML patterns rely on,
    # so running it first would leave their remnants in the output.
    text = advanced_clean(text)
    text = basic_clean(text)
    words = [w for w in text.split() if w not in stop_words]
    tokens = [lem.lemmatize(w) for w in words]
    return " ".join(tokens)

# Coerce the raw column to text first, as the earlier cells do: a missing
# value would otherwise reach `.lower()` inside the pipeline and raise.
df["final_clean_text"] = df["text"].fillna("").astype(str).apply(nlp_preprocess)


# Task10

In [None]:
Basic cleaning (lowercasing, removing punctuation and digits, collapsing whitespace) strips the visible surface noise from the text.

Advanced cleaning removes web artifacts (URLs, e-mail addresses, and HTML tags) along with any remaining special characters.

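Lemmatization maps each token to a real dictionary word, whereas stemming only chops suffixes and can produce non-words, so lemmatized output is easier to read and interpret than stemmed output.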

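Consistent preprocessing shrinks the vocabulary and removes noise, which generally helps downstream ML models; the actual gain should be verified on the task at hand.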