<a href="https://colab.research.google.com/github/bharadwaj103/NLP_1610/blob/main/NLP_F_12_9_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd

# Build a small sample DataFrame holding three resume texts.
data = {'Resume_Text': [
    "I am a software engineer with 5+ years of experience. My skills include Python, Java, and C++. I have worked on projects involving machine learning and web development.\n I am an excellent team player.",
    "Data Scientist with expertise in Python, R, and SQL. I have experience in statistical analysis, data visualization, and building predictive models. • Published a paper on a new machine learning algorithm.",
    "A marketing specialist with a background in digital marketing, social media management, and content creation. Skills: SEO, SEM, Adobe Photoshop. I have a proven track record of increasing brand visibility."
]}
resumes_df = pd.DataFrame(data)

# Q1. Display the first 3 rows and check the texts for noisy characters.
print("First 3 rows of the sample resumes:")
print(resumes_df.head(3))
print("\nChecking for noisy characters...")

# The membership tests are evaluated outside the f-strings: a backslash inside
# an f-string replacement field is a SyntaxError before Python 3.12.
has_newline = '\n' in resumes_df['Resume_Text'].iloc[0]
has_bullet = '•' in resumes_df['Resume_Text'].iloc[1]

# "\\n" prints the two characters backslash + n so the label names the
# character being checked; an unescaped "\n" here printed a blank line and
# left the label empty.
print(f"\\n present: {has_newline}")
print(f"• present: {has_bullet}")

First 3 rows of the sample resumes:
                                         Resume_Text
0  I am a software engineer with 5+ years of expe...
1  Data Scientist with expertise in Python, R, an...
2  A marketing specialist with a background in di...

Checking for noisy characters...

 present: True
• present: True


In [13]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download the NLTK resources requested by this notebook: 'punkt', 'stopwords'
# and 'punkt_tab'. The last call is a bare expression, so its return value is
# what the cell displays.
# NOTE(review): 'punkt' / 'punkt_tab' are presumably the tokenizer data used by
# word_tokenize and 'stopwords' the corpus read by stopwords.words() — confirm
# against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # added because this resource was reported as missing


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [15]:
# Q2. NLTK preprocessing
def preprocess_nltk(text: str) -> list[str]:
    """Clean, tokenize, stop-word-filter and stem a single resume text.

    Args:
        text: Raw resume text.

    Returns:
        The Porter stems of the lower-cased tokens that remain after
        removing non-letter characters, English stop words and
        single-character tokens, in their original order.
    """
    # 1. Strip everything that is not an ASCII letter or whitespace.
    #    The flags used to be passed as the 4th positional argument, which is
    #    `count`, not `flags`: no flag was ever applied and only the first
    #    258 (re.I | re.A) matches were removed. The pattern already lists both
    #    letter cases, so no flag is needed; re.A is deliberately not added
    #    because it would narrow \s to ASCII whitespace and delete separators
    #    such as non-breaking spaces, gluing neighbouring words together.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # 2. Lower-case and tokenize.
    tokens = word_tokenize(text.lower())
    # 3. Drop English stop words and single-character leftovers
    #    (e.g. the "c" that remains from "C++").
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words and len(token) > 1]
    # 4. Reduce each remaining token to its Porter stem.
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return stemmed_tokens

# Run the NLTK pipeline on every resume and keep the stems in a new column.
resumes_df['nltk_tokens'] = resumes_df['Resume_Text'].apply(preprocess_nltk)

# Pool the stems of all resumes into one flat list, then take the 10 most
# frequent ones.
all_nltk_tokens = []
for resume_tokens in resumes_df['nltk_tokens']:
    all_nltk_tokens.extend(resume_tokens)
fdist_nltk = nltk.FreqDist(all_nltk_tokens)
top_10_nltk = fdist_nltk.most_common(10)

# Report the stems of the first resume and the overall top 10.
print("\n--- NLTK Preprocessing Results ---")
print("Processed Tokens for first resume:")
print(resumes_df['nltk_tokens'].iloc[0])
print("\nTop 10 frequent stemmed words:")
print(top_10_nltk)



--- NLTK Preprocessing Results ---
Processed Tokens for first resume:
['softwar', 'engin', 'year', 'experi', 'skill', 'includ', 'python', 'java', 'work', 'project', 'involv', 'machin', 'learn', 'web', 'develop', 'excel', 'team', 'player']

Top 10 frequent stemmed words:
[('experi', 2), ('skill', 2), ('python', 2), ('machin', 2), ('learn', 2), ('data', 2), ('market', 2), ('softwar', 1), ('engin', 1), ('year', 1)]


In [16]:
import spacy

# Load the spaCy English pipeline.
# The 'en_core_web_sm' model must already be installed; otherwise it is
# downloaded below (equivalent to running
# "python -m spacy download en_core_web_sm") and loaded again.
SPACY_MODEL = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raised OSError: fetch the model, then retry the load once.
    print("Downloading spaCy model 'en_core_web_sm'...")
    from spacy.cli import download
    download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

# Q3. spaCy preprocessing
def preprocess_spacy(text):
    """Return the lower-cased lemmas of the alphabetic nouns and verbs in ``text``.

    The text is run through the module-level ``nlp`` pipeline; a token is kept
    only when it is purely alphabetic and its part-of-speech tag is ``NOUN``
    or ``VERB``. Lemmas are returned in document order.
    """
    kept_pos = ('NOUN', 'VERB')
    lemmas = []
    for tok in nlp(text):
        # Skip numbers, punctuation and other non-alphabetic tokens first.
        if not tok.is_alpha:
            continue
        if tok.pos_ in kept_pos:
            lemmas.append(tok.lemma_.lower())
    return lemmas

# Run the spaCy pipeline on every resume and keep the lemmas in a new column.
resumes_df['spacy_lemmas'] = resumes_df['Resume_Text'].apply(preprocess_spacy)

# Pool the lemmas of all resumes into one flat list, then take the 10 most
# frequent ones.
all_spacy_lemmas = []
for resume_lemmas in resumes_df['spacy_lemmas']:
    all_spacy_lemmas.extend(resume_lemmas)
fdist_spacy = nltk.FreqDist(all_spacy_lemmas)
top_10_spacy = fdist_spacy.most_common(10)

# Report the lemmas of the first resume and the overall top 10.
print("\n--- spaCy Preprocessing Results ---")
print("Processed Lemmas for first resume:")
print(resumes_df['spacy_lemmas'].iloc[0])
print("\nTop 10 frequent lemmas:")
print(top_10_spacy)


--- spaCy Preprocessing Results ---
Processed Lemmas for first resume:
['software', 'engineer', 'year', 'experience', 'skill', 'include', 'work', 'project', 'involve', 'machine', 'learning', 'web', 'development', 'team', 'player']

Top 10 frequent lemmas:
[('experience', 2), ('skill', 2), ('machine', 2), ('have', 2), ('marketing', 2), ('software', 1), ('engineer', 1), ('year', 1), ('include', 1), ('work', 1)]
