In [2]:
import re
import string

!pip install -U nltk

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize # Sentence Tokenizer
from nltk.tokenize import word_tokenize # Word Tokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Requirement already up-to-date: nltk in /anaconda3/lib/python3.6/site-packages (3.4)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danielleromanoff/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielleromanoff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1) (optional) Scrape 100 Job Listings that contain the title "Data Scientist" from indeed.com

At a minimum your final dataframe of job listings should contain
- Job Title
- Job Description

In [3]:
# Additional Imports for scraping Indeed

import requests
import time
from bs4 import BeautifulSoup

In [4]:
# Getting search results from indeed

url = 'https://www.indeed.com/jobs?as_and=data+scientist&as_phr=&as_any=&as_not=&as_ttl=&as_cmp=&jt=all&st=&as_src=&salary=&radius=25&l=New+Jersey&fromage=any&limit=50&sort=&psf=advsrch#'
page = requests.get(url)
page = page.text

In [5]:
soup = BeautifulSoup(page, 'html.parser')
listings = ([s.text for s in soup.findAll(class_='summary')])
listings

['\n                            Implement field and office data collection efforts, data validation, and data evaluation. Process data, and oversee the development of map figures, data tables,...',
 "\n                            Previous internship or relevant work experiences in scripting, software development, or data analytics. In Global Data, we're responsible for delivering this...",
 '\n                            Junior Research Scientist*. At Rocky Mountain Scientific Laboratory, we associate peace with strength, courage, and action....',
 '\n                            Candidate MUST live in NJ, Philadelphia PA area or NYC area. Position will include assisting with the research, data collection and reporting for Phase I...',
 '\n                            Communication and presentation to external clients with relevance to the market and consumer insights. Consistently meets agreed upon project objectives....',
 '\n                            Leveraging your educational back

In [6]:
def tokenize_jobs(jobs):
    # remove punctuation
    table = str.maketrans('', '', string.punctuation)
    jobs = [j.translate(table) for j in jobs]
    # tokenize words
    return [word_tokenize(j) for j in jobs]

tokens = tokenize_jobs(listings)                      
tokens

[['Implement',
  'field',
  'and',
  'office',
  'data',
  'collection',
  'efforts',
  'data',
  'validation',
  'and',
  'data',
  'evaluation',
  'Process',
  'data',
  'and',
  'oversee',
  'the',
  'development',
  'of',
  'map',
  'figures',
  'data',
  'tables'],
 ['Previous',
  'internship',
  'or',
  'relevant',
  'work',
  'experiences',
  'in',
  'scripting',
  'software',
  'development',
  'or',
  'data',
  'analytics',
  'In',
  'Global',
  'Data',
  'were',
  'responsible',
  'for',
  'delivering',
  'this'],
 ['Junior',
  'Research',
  'Scientist',
  'At',
  'Rocky',
  'Mountain',
  'Scientific',
  'Laboratory',
  'we',
  'associate',
  'peace',
  'with',
  'strength',
  'courage',
  'and',
  'action'],
 ['Candidate',
  'MUST',
  'live',
  'in',
  'NJ',
  'Philadelphia',
  'PA',
  'area',
  'or',
  'NYC',
  'area',
  'Position',
  'will',
  'include',
  'assisting',
  'with',
  'the',
  'research',
  'data',
  'collection',
  'and',
  'reporting',
  'for',
  'Phase',
  

In [7]:
cells = [' '.join(x) for x in tokens]
cells

['Implement field and office data collection efforts data validation and data evaluation Process data and oversee the development of map figures data tables',
 'Previous internship or relevant work experiences in scripting software development or data analytics In Global Data were responsible for delivering this',
 'Junior Research Scientist At Rocky Mountain Scientific Laboratory we associate peace with strength courage and action',
 'Candidate MUST live in NJ Philadelphia PA area or NYC area Position will include assisting with the research data collection and reporting for Phase I',
 'Communication and presentation to external clients with relevance to the market and consumer insights Consistently meets agreed upon project objectives',
 'Leveraging your educational background in Science Mathematics Statistics Computer Science Data Science or a related discipline along with your relevant',
 'Ensure complete accuracy appropriate and consistent grammar spelling and punctuation on all r

In [8]:
description = pd.DataFrame({'Summary': cells})
description.head()

Unnamed: 0,Summary
0,Implement field and office data collection eff...
1,Previous internship or relevant work experienc...
2,Junior Research Scientist At Rocky Mountain Sc...
3,Candidate MUST live in NJ Philadelphia PA area...
4,Communication and presentation to external cli...


## 2) Use NLTK to tokenize / clean the listings 

In [9]:
##### Your Code Here #####

# 3) Use Scikit-Learn's CountVectorizer to get word counts for each listing.

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# create the transform
vectorizer = CountVectorizer(lowercase=True, stop_words='english')
# tokenize and build vocab
vectorizer.fit(cells)
# Create a Vocabulary
# The vocabulary establishes all of the possible words that we might use.
counted = (vectorizer.vocabulary_)
# counted
# The vocabulary dictionary does not represent the counts of words!!
counts = vectorizer.transform(cells)
features = vectorizer.get_feature_names()
values = counts.toarray()
df = pd.DataFrame(values, columns=features)
df

Unnamed: 0,1000,1100,able,academic,accuracy,accurate,acquire,action,actionable,add,...,validity,various,visualization,visualizing,waste,wide,work,working,write,years
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 4) Visualize the most common word counts

 # 5) Use Scikit-Learn's tfidfVectorizer to get a TF-IDF feature matrix

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# List of document strings as text
text = cells
# Instantiate vectorizer object
tfidf = TfidfVectorizer(ngram_range=(1,1), max_features=20)
# Create a vocabulary and get word counts per document
feature_matrix = tfidf.fit_transform(text)
# Print word counts
# print(feature_matrix.toarray())

# Get feature names to use as dataframe column headers
feature_names = tfidf.get_feature_names()

# View Feature Matrix as DataFrame
df1 = pd.DataFrame(feature_matrix.toarray(), columns=feature_names)
print(df1.shape)
df1.head()

(58, 20)


Unnamed: 0,analysis,analytics,and,data,for,in,is,learning,machine,modeling,of,or,science,scientist,scientists,statistical,that,the,to,with
0,0.0,0.0,0.424291,0.826473,0.0,0.0,0.0,0.0,0.0,0.0,0.255289,0.0,0.0,0.0,0.0,0.0,0.0,0.267857,0.0,0.0
1,0.0,0.305004,0.0,0.309715,0.27116,0.489818,0.0,0.0,0.0,0.0,0.0,0.705408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.323474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.744609,0.0,0.0,0.0,0.0,0.0,0.583886
3,0.0,0.0,0.204545,0.239058,0.418599,0.378074,0.0,0.0,0.0,0.0,0.0,0.54448,0.0,0.0,0.0,0.0,0.0,0.387389,0.0,0.369213
4,0.0,0.0,0.419428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.397179,0.72321,0.378544


## Stretch Goals

 - Scrape Job Listings for the job title "Data Analyst". How do these differ from Data Scientist Job Listings
 - Try and identify requirements for experience specific technologies that are asked for in the job listings. How are those distributed among the job listings?
 - Use a clustering algorithm to cluster documents by their most important terms. Do the clusters reveal any common themes?
  - **Hint:** K-means might not be the best algorithm for this. Do a little bit of research to see what might be good for this. Also, remember that algorithms that depend on Euclidean distance break down with high dimensional data.