In [None]:
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from wordcloud import WordCloud

# Download stopwords
nltk.download("stopwords")
from nltk.corpus import stopwords

# Load the scraped job dataset
df = pd.read_csv("indeed_jobs.csv")

# Every later step reads the 'Job Description' column, so stop early when it
# is absent. Raising SystemExit with a message writes that message to stderr
# and ends the run with a non-zero status; the bare exit() helper is intended
# for interactive sessions and, called without arguments, reports success.
if "Job Description" not in df.columns:
    raise SystemExit(
        "Error: 'Job Description' column missing. Modify scraper to include it!"
    )

# Text Cleaning Function
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(text, stop_words=None):
    """Normalise a piece of text for vectorisation.

    The text is lower-cased, every non-word character is replaced by a
    space, digit runs are deleted, and the remaining whitespace-separated
    tokens that are stopwords are dropped.

    Args:
        text: The string to clean.
        stop_words: Optional container of tokens to drop. When omitted, the
            NLTK English stopword list is used (loaded once and cached).

    Returns:
        The surviving tokens joined by single spaces.
    """
    if stop_words is None:
        # Looking the list up per token (as a list) made cleaning cost grow
        # with tokens x stopwords; a cached set gives O(1) membership tests.
        stop_words = _english_stopwords()

    text = text.lower()
    text = re.sub(r"\W", " ", text)  # Remove special characters
    text = re.sub(r"\d+", "", text)  # Remove numbers
    return " ".join(word for word in text.split() if word not in stop_words)

# Apply cleaning to job descriptions.
# Missing descriptions are replaced with "" first: astype(str) alone would turn
# NaN into the literal text "nan", which would then survive cleaning and end up
# as a spurious term in the vocabulary.
df["cleaned_description"] = (
    df["Job Description"].fillna("").astype(str).apply(clean_text)
)

# TF-IDF Vectorization
# The vocabulary is capped at 500 terms; the sparse result is converted to a
# dense array before being used as the feature matrix.
vectorizer = TfidfVectorizer(max_features=500)
tfidf_matrix = vectorizer.fit_transform(df["cleaned_description"])
X = tfidf_matrix.toarray()
feature_names = vectorizer.get_feature_names_out()

# Simulating 'Skill Demand' labels (You can use real job data for training)
# A row is labelled 1 when the substring "python" occurs anywhere in its
# cleaned description, otherwise 0.
# NOTE(review): the label comes from the same text the features are built
# from, so the model may simply rediscover the "python" term - confirm this is
# acceptable for the simulation.
df["Skill_Demand"] = [
    1 if "python" in description else 0
    for description in df["cleaned_description"]
]

# Train a Random Forest Model
# 100 trees with a fixed seed, fitted on the TF-IDF matrix against the
# simulated Skill_Demand labels.
y = df["Skill_Demand"]
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X, y)

# Feature Importance Analysis
importance = model.feature_importances_
word_importance = dict(zip(feature_names, importance))

# The word cloud sizes each word relative to the largest weight. When no term
# has a positive importance (e.g. every label fell into a single class, so the
# forest never split), that scaling has nothing to work with and fails with an
# obscure error, so report the real cause instead.
if not any(weight > 0 for weight in word_importance.values()):
    raise ValueError(
        "No term has a positive feature importance; cannot build the word "
        "cloud. Check that 'Skill_Demand' contains both classes."
    )

# Generate a Word Cloud of Important Words
wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(word_importance)

# Display the Word Cloud
# Draw the rendered cloud on a single 10x5 inch axes with the ticks and frame
# hidden, then show the figure.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("Important Skills in Job Descriptions")
plt.show()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
