<a href="https://colab.research.google.com/github/bunny346/Natural-learning-processing/blob/main/NLP-1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

# -----------------------------
# Section A: Load & Explore
# -----------------------------
# Three small, hand-written resumes used as the toy corpus for this notebook.
data = {
    "resume_text": [
        "John Doe\nSoftware Engineer • Skilled in Python, Java, SQL. Experience in cloud (AWS, Azure).",
        "Jane Smith - Data Analyst • Proficient in R, Python, Tableau, and machine learning techniques.",
        "Michael Johnson\nProject Manager • Expertise in Agile, Scrum, Leadership & Communication.",
    ]
}
df = pd.DataFrame(data)

print("=== First 3 rows ===")
print(df.head(3), "\n")

print("=== Noisy characters in first resume ===")
# Positional access, so this keeps working if the frame is ever re-indexed.
first_resume = df["resume_text"].iloc[0]
# "Noisy" here means anything that is neither a letter/digit nor a plain
# space (newlines, bullets, punctuation). Sorting makes the printed result
# deterministic instead of depending on set iteration order.
noisy_chars = sorted({ch for ch in first_resume if not (ch.isalnum() or ch == " ")})
print(noisy_chars, "\n")


# Section B: NLTK Preprocessing

# Download the NLTK resources this section relies on: the punkt tokenizer
# data (both "punkt" and "punkt_tab") and the stopword corpus.
for resource in ("punkt", "stopwords", "punkt_tab"):
    nltk.download(resource)

# Module-level helpers used by clean_and_tokenize below.
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

def clean_and_tokenize(text):
    """Return the Porter-stemmed, stopword-free tokens of *text*.

    Every character that is not an ASCII letter or whitespace is replaced
    by a space, the result is lowercased and tokenized with
    ``nltk.word_tokenize``, tokens found in ``stop_words`` are dropped, and
    the remaining tokens are stemmed with the module-level ``stemmer``.
    """
    # Strip digits and symbols first, then normalise case.
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", text).lower()

    stems = []
    for token in nltk.word_tokenize(letters_only):
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return stems

# Tokenize every resume with the NLTK pipeline.
df["tokens_nltk"] = df["resume_text"].apply(clean_and_tokenize)

# Flatten the per-resume token lists (row order preserved) and keep the
# ten most frequent stems.
all_tokens = []
for row_tokens in df["tokens_nltk"]:
    all_tokens.extend(row_tokens)
freq_words_nltk = Counter(all_tokens).most_common(10)

print("=== Top 10 frequent stemmed words (NLTK) ===")
print(freq_words_nltk, "\n")


# Section C: spaCy Pipeline
# Load the spaCy pipeline once at module level; spacy_preprocess reuses it.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

def spacy_preprocess(text):
    """Return the lemmas of the alphabetic nouns and verbs in *text*.

    The text is lowercased, run through the module-level ``nlp`` pipeline,
    and a token's lemma is kept only when the token is purely alphabetic
    and its coarse part-of-speech tag is ``NOUN`` or ``VERB``.
    """
    # NOTE(review): the recorded notebook output shows "AWS" coming back as
    # the lemma 'aw'; lowercasing before tagging may be the cause. Confirm
    # before relying on these lemmas for acronyms or proper names.
    kept_pos = ("NOUN", "VERB")
    lemmas = []
    for token in nlp(text.lower()):
        if not token.is_alpha:
            continue
        if token.pos_ not in kept_pos:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Lemmatize every resume with the spaCy pipeline.
df["tokens_spacy"] = df["resume_text"].apply(spacy_preprocess)

# Flatten the per-resume lemma lists (row order preserved) and keep the
# ten most frequent lemmas.
all_lemmas = []
for row_lemmas in df["tokens_spacy"]:
    all_lemmas.extend(row_lemmas)
freq_words_spacy = Counter(all_lemmas).most_common(10)

print("=== Top 10 frequent lemmas (spaCy) ===")
print(freq_words_spacy, "\n")

=== First 3 rows ===
                                         resume_text
0  John Doe\nSoftware Engineer • Skilled in Pytho...
1  Jane Smith - Data Analyst • Proficient in R, P...
2  Michael Johnson\nProject Manager • Expertise i... 

=== Noisy characters in first resume ===
{'A', 'w', 'P', 'o', 'd', 'k', ',', 'r', 'p', 'W', '(', 'h', 'Q', 'u', 'x', 'e', ')', '\n', '.', ' ', '•', 'a', 'J', 'n', 'v', 'L', 'c', 'f', 'z', 'g', 'E', 'y', 'l', 'i', 't', 'D', 'S'} 

=== Top 10 frequent stemmed words (NLTK) ===
[('python', 2), ('john', 1), ('doe', 1), ('softwar', 1), ('engin', 1), ('skill', 1), ('java', 1), ('sql', 1), ('experi', 1), ('cloud', 1)] 



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


=== Top 10 frequent lemmas (spaCy) ===
[('python', 2), ('software', 1), ('engineer', 1), ('experience', 1), ('cloud', 1), ('aw', 1), ('azure', 1), ('analyst', 1), ('proficient', 1), ('r', 1)] 

