
# Trending Jobs Scraper + Predictor
This notebook scrapes job titles from public job boards (RemoteOK, WeWorkRemotely), fetches Google Trends data for candidate job keywords (using `pytrends`), performs basic NLP clustering and frequency analysis, and uses a simple linear regression on Google Trends interest to **predict which jobs are likely to trend next week**.

**Important:** This notebook is a starting point. When you run scrapers, obey each site's `robots.txt` and terms of service. Some sites may block automated scraping; in production use, prefer official APIs or data partners.

Files created by this notebook: the final code cell writes `scraped_job_titles.csv` and `predicted_trending_jobs.csv` to the working directory. Skip that cell if you do not want any files saved.


In [None]:

# Install required packages (run once)
import sys
import subprocess
# Distribution names handed to pip in a single install command.
packages = [
    'requests',
    'beautifulsoup4',
    'pandas',
    'scikit-learn',
    'pytrends',
    'nbformat',
    'numpy',
    'matplotlib',
]
# Invoke pip through the interpreter running this notebook so the install
# targets the same environment; check_call raises if pip exits non-zero.
pip_command = [sys.executable, '-m', 'pip', 'install', '--quiet', *packages]
subprocess.check_call(pip_command)
print('Installed packages (or they were already present).')


## 1) Scrape Remote job boards (examples)

In [None]:

# Scrape RemoteOK job titles (example)
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_remoteok():
    """Download the RemoteOK developer-jobs page and collect its job titles.

    Returns:
        pandas.DataFrame with a constant ``source`` column ('remoteok') and a
        ``title`` column holding the stripped text of the first ``<h2>`` found
        in each ``<tr class="job">`` row. Rows without an ``<h2>`` are skipped.

    Raises:
        requests.HTTPError: raised by ``raise_for_status()`` when the server
            answers with an error status.
    """
    response = requests.get(
        'https://remoteok.com/remote-dev-jobs',
        headers={'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0'},
        timeout=15,
    )
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')
    # One candidate heading per job row; rows lacking an <h2> yield None.
    headings = (row.find('h2') for row in page.find_all('tr', {'class':'job'}))
    titles = [heading.get_text(strip=True) for heading in headings if heading]
    return pd.DataFrame({'source':'remoteok','title':titles})
# Run the RemoteOK scraper, report how many rows came back, and preview them.
df_remoteok = scrape_remoteok()
print('RemoteOK:', df_remoteok.shape[0], 'jobs scraped')
df_remoteok.head(5)


In [None]:

# Scrape WeWorkRemotely job titles (example)
import requests
from bs4 import BeautifulSoup
import pandas as pd
def scrape_wwr():
    """Download the WeWorkRemotely programming-jobs page and collect job titles.

    Every ``<section class="jobs">`` is scanned for ``<li>`` items; for each
    item the first link containing a ``<span class="title">`` supplies the
    job title.

    Returns:
        pandas.DataFrame with a constant ``source`` column ('weworkremotely')
        and a ``title`` column holding the stripped title text.

    Raises:
        requests.HTTPError: raised by ``raise_for_status()`` when the server
            answers with an error status.
    """
    url = 'https://weworkremotely.com/categories/remote-programming-jobs'
    headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0'}
    r = requests.get(url, headers=headers, timeout=15)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    titles = []
    for sec in soup.find_all('section', {'class':'jobs'}):
        # Search at any depth: an <li> lives inside a list element, so looking
        # only at direct children of the <section> skips the listings.
        # NOTE(review): confirm the selectors against the live page markup.
        for li in sec.find_all('li'):
            # The first link of an item is not necessarily the job link, so
            # take the first link that actually carries a title span.
            for a in li.find_all('a'):
                h = a.find('span', {'class':'title'})
                if h:
                    titles.append(h.get_text(strip=True))
                    break
    return pd.DataFrame({'source':'weworkremotely','title':titles})
# Run the WeWorkRemotely scraper, report how many rows came back, and preview them.
df_wwr = scrape_wwr()
print('WeWorkRemotely:', df_wwr.shape[0], 'jobs scraped')
df_wwr.head(5)


In [None]:

# Combine and preprocess titles
import pandas as pd
# Stack the two scraped frames into one table with a fresh 0..n-1 index.
df = pd.concat([df_remoteok, df_wwr], ignore_index=True)
# Keep letters, digits, spaces and '&', '+', '-'; every other character becomes
# a space. Then lowercase and trim the ends.
df['title_clean'] = df['title'].str.replace(r'[^A-Za-z0-9 &+-]', ' ', regex=True).str.lower().str.strip()
# Collapse runs of whitespace left behind by the substitution above. The
# pattern is a raw string so '\s' reaches the regex engine instead of being
# parsed as an (invalid) Python string escape.
df['title_clean'] = df['title_clean'].str.replace(r'\s+', ' ', regex=True)
print('Total scraped titles:', len(df))
df.head()


## 2) NLP clustering and frequency analysis

In [None]:

# NLP: TF-IDF + KMeans clustering to find common job clusters/phrases
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# TF-IDF over unigrams and bigrams of the cleaned titles, English stop words
# removed, vocabulary capped at 2000 features. Missing titles become ''.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1,2),
    max_features=2000,
)
X = vectorizer.fit_transform(df['title_clean'].fillna(''))

k = 6  # number of clusters (adjustable)
km = KMeans(n_clusters=k, n_init=10, random_state=42)
labels = km.fit_predict(X)
df['cluster'] = labels

# For each cluster, list the 10 vocabulary terms with the largest centroid
# weight: argsort is ascending, so the columns are reversed to get descending.
terms = vectorizer.get_feature_names_out()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
cluster_terms = {
    cluster_id: [terms[term_idx] for term_idx in order_centroids[cluster_id, :10]]
    for cluster_id in range(k)
}
cluster_terms


In [None]:

# Frequency count of bigrams/unigrams in titles to get candidate job keywords
from collections import Counter
import re
def get_ngrams(texts, n=2):
    """Count word n-grams across a collection of texts.

    Each text is split into runs of word characters, and every run of ``n``
    consecutive tokens is counted as one space-joined n-gram.

    Args:
        texts: Iterable of strings. Entries that are not strings (for example
            ``None`` or the float NaN that pandas uses for missing values) are
            skipped instead of raising ``TypeError``.
        n: Number of tokens per n-gram (1 = unigrams, 2 = bigrams).

    Returns:
        collections.Counter mapping each n-gram to its number of occurrences.
        Texts with fewer than ``n`` tokens contribute nothing.
    """
    find_tokens = re.compile(r'\w+').findall  # compiled once, reused per text
    counts = Counter()
    for text in texts:
        if not isinstance(text, str):
            continue
        tokens = find_tokens(text)
        # Slide a window of n tokens across the text; the range is empty when
        # the text is shorter than n tokens.
        counts.update(
            ' '.join(tokens[start:start + n])
            for start in range(len(tokens) - n + 1)
        )
    return counts
# Count single words and word pairs over all cleaned titles, keep the 50 most
# frequent of each, and preview the 15 most frequent bigrams as a table.
title_texts = df['title_clean'].tolist()
unigrams = get_ngrams(title_texts, n=1)
bigrams = get_ngrams(title_texts, n=2)
top_unigrams = unigrams.most_common(50)
top_bigrams = bigrams.most_common(50)
pd.DataFrame(top_bigrams, columns=['bigram','count']).head(15)


## 3) Google Trends (pytrends) and forecasting

In [None]:

# Use pytrends to fetch Google Trends interest_over_time for candidate keywords
from pytrends.request import TrendReq
import pandas as pd
pytrends = TrendReq(hl='en-US', tz=360)
# Google Trends compares at most five search terms per request, and the values
# it returns are scaled relative to the other terms in that same request.
# Keeping the candidates to one five-term payload avoids a rejected request
# and keeps every keyword on a single comparable scale.
MAX_TRENDS_KEYWORDS = 5
candidates = [phrase for phrase, _count in top_bigrams[:MAX_TRENDS_KEYWORDS]]
print('Candidate keywords for Google Trends:', candidates)
if candidates:
    pytrends.build_payload(candidates, timeframe='now 7-d')  # last 7 days
    iot = pytrends.interest_over_time()
else:
    # No bigrams were extracted, so there is nothing to query; use an empty
    # frame so the cells below can still test `iot.empty`.
    iot = pd.DataFrame()
if iot.empty:
    print('No Google Trends data returned. Check network or pytrends limitations.')
else:
    display(iot.head())


In [None]:

# Simple prediction: linear regression on interest_over_time to forecast next 7 days
from sklearn.linear_model import LinearRegression
import numpy as np
# Length of the forecast window in calendar days.
FORECAST_DAYS = 7
predictions = {}
if not iot.empty:
    # The forecast window has to be expressed in samples, not days. Short
    # Trends windows such as 'now 7-d' are served at sub-daily (typically
    # hourly) resolution, so extrapolating 7 samples would cover only a few
    # hours. Read the sampling interval from the index instead; fall back to
    # one sample per day when it cannot be determined.
    horizon = FORECAST_DAYS
    if isinstance(iot.index, pd.DatetimeIndex) and len(iot.index) >= 2:
        step = pd.Series(iot.index).diff().median()
        if step > pd.Timedelta(0):
            horizon = max(1, int(round(pd.Timedelta(days=FORECAST_DAYS) / step)))
    for col in iot.columns:
        if col == 'isPartial':
            continue  # bookkeeping flag column, not a keyword series
        series = iot[col].fillna(0).to_numpy(dtype=float)
        if len(series) < 3:
            continue  # too few points to fit a trend line
        # Regress interest on the sample position 0..len-1. Local names are
        # used so the TF-IDF matrix `X` from the clustering cell is not
        # overwritten.
        steps = np.arange(len(series)).reshape(-1, 1)
        lr = LinearRegression().fit(steps, series)
        future_steps = np.arange(len(series), len(series) + horizon).reshape(-1, 1)
        # Interest cannot be negative, so clamp the extrapolated line at zero.
        pred = lr.predict(future_steps).clip(min=0)
        predictions[col] = float(pred.mean())  # mean predicted interest next week
# Rank candidate keywords by predicted interest
pred_df = pd.DataFrame(list(predictions.items()), columns=['keyword','predicted_interest']).sort_values('predicted_interest', ascending=False)
pred_df


In [None]:

# Save results to CSV if needed
# Write both result tables to CSV files in the working directory, without the
# DataFrame index column.
for frame, filename in (
    (pred_df, 'predicted_trending_jobs.csv'),
    (df, 'scraped_job_titles.csv'),
):
    frame.to_csv(filename, index=False)
print('Saved scraped_job_titles.csv and predicted_trending_jobs.csv')



## Notes & Next steps
- This notebook is a **template**. Depending on the target sites, you may need to adapt the selectors or switch to official APIs (sites such as LinkedIn and Indeed often block scraping).
- For more robust forecasting, consider using Prophet, ARIMA, or collect longer historic Google Trends series (e.g., last 90 days) rather than 7-day windows.
- Respect robots.txt and site terms; scraping large volumes can get your IP blocked.
- You can extend scraping to other job boards (Indeed, Glassdoor, LinkedIn) but they may require authentication or have anti-bot protections.
- This notebook can be adapted to target specific sites and exported as a runnable `.ipynb` tailored to those sources.
