<img align="left" src="https://lever-client-logos.s3.amazonaws.com/864372b1-534c-480e-acd5-9711f850815c-1524247202159.png" width=200>
<br></br>

# Vector Representations
## *Data Science Unit 4 Sprint 2 Assignment 2*

In [3]:
# Imports

import string
import requests
from collections import Counter

# Cleaning

import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# NLP Libraries

import spacy
from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Plotting

import matplotlib.pyplot as plt
import seaborn as sns
import squarify

In [4]:
# Import job listings csv

df = pd.read_csv("../data/job_listings.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,description,title
0,0,"b""<div><div>Job Requirements:</div><ul><li><p>...",Data scientist
1,1,b'<div>Job Description<br/>\n<br/>\n<p>As a Da...,Data Scientist I
2,2,b'<div><p>As a Data Scientist you will be work...,Data Scientist - Entry Level
3,3,"b'<div class=""jobsearch-JobMetadataHeader icl-...",Data Scientist
4,4,b'<ul><li>Location: USA \xe2\x80\x93 multiple ...,Data Scientist


In [5]:
# Drop the Unnamed: 0 column

df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df.head()

Unnamed: 0,description,title
0,"b""<div><div>Job Requirements:</div><ul><li><p>...",Data scientist
1,b'<div>Job Description<br/>\n<br/>\n<p>As a Da...,Data Scientist I
2,b'<div><p>As a Data Scientist you will be work...,Data Scientist - Entry Level
3,"b'<div class=""jobsearch-JobMetadataHeader icl-...",Data Scientist
4,b'<ul><li>Location: USA \xe2\x80\x93 multiple ...,Data Scientist


## 1) *Clean:* Job Listings from indeed.com that contain the title "Data Scientist" 

You have `job_listings.csv` in the data folder for this module. The text data in the description column is still messy - full of html tags. Use the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) library to clean up this column. You will need to read through the documentation to accomplish this task. 

In [6]:
# Clean Data Scientist job listings from indeed.com using BeautifulSoup html parser

df["description"] = df["description"].apply(lambda x: " ".join(BeautifulSoup(x, "html.parser").stripped_strings).replace("\\n", " ").replace("b'", "").replace("b\"", ""))
df.head()

Unnamed: 0,description,title
0,Job Requirements: Conceptual understanding i...,Data scientist
1,"Job Description As a Data Scientist 1, yo...",Data Scientist I
2,As a Data Scientist you will be working on co...,Data Scientist - Entry Level
3,"$4,969 - $6,756 a month Contract Under the ge...",Data Scientist
4,Location: USA \xe2\x80\x93 multiple locations...,Data Scientist


## 2) Use Spacy to tokenize the listings 

In [None]:
# Tokenizer

nlp = spacy.load("en_core_web_lg")

tokenizer = Tokenizer(nlp.vocab)

In [None]:
# Tokenizer Pipe

tokens = []

""" Make them tokens """
for doc in tokenizer.pipe(df['description'], batch_size=500):
    doc_tokens = [re.sub(r'[^a-zA-Z ^0-9]', '', token.text.lower()).strip() 
                 for token in doc if (token.is_stop != True) and (token.is_punct != True)]
    tokens.append(" ".join(doc_tokens))

df['tokens'] = tokens

## 3) Use Scikit-Learn's CountVectorizer to get word counts for each listing.

In [None]:
# Apply CountVectorizer to df, use custom Spacy Vectorizer, job listings in df["tokens"] variable

vect = CountVectorizer(stop_words='english', max_features=1000)

#Learn our Vocab

vect.fit(df["tokens"])

# Get sparse dtm

dtm = vect.transform(df["tokens"])
dtm = pd.DataFrame(dtm.todense(), columns=vect.get_feature_names())

In [None]:
# See dtm head with word counts

dtm.head()

## 4) Visualize the most common word counts

In [None]:
# Get word count from dtm df

wc = dtm.sum(axis=0)

# Create top 20 word count object by sorting the word count object (more words get jumbled in visualization)

wc_top20 = wc.sort_values(ascending=False)[:20]

# Make top 20 word count df from top 20 word count object

top20df = pd.DataFrame(wc_top20).reset_index()

# Return top 20 words stored in the index of the df

top20df['index'].values

In [None]:
# Square plot of top 20 words

squarify.plot(sizes=wc_top20, label=top20df['index'], alpha=.8 )
plt.axis('off')
plt.show()

In [None]:
# Barplot of top 20 words

sns.set(style="darkgrid")
sns.barplot(y=top20df['index'].values,x=top20df[0].values);

## 5) Use Scikit-Learn's tfidfVectorizer to get a TF-IDF feature matrix

In [None]:
# Tokenizer to tune paramaters

def tokenize(document):
    
    doc = nlp(document)
    
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

In [None]:
# Tunning Parameters

# Instantiate vectorizer object

tfidf = TfidfVectorizer(stop_words='english', 
                        ngram_range=(1,2),
                        max_df=.97,
                        min_df=3,
                        tokenizer=tokenize)

# Create a vocabulary and get word counts per document # Similiar to fit_predict

dtm = tfidf.fit_transform(df["description"]) 

# Print word counts # Get feature names to use as dataframe column headers

dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame

dtm.head()

In [None]:
# Original without tuning or ngram

## Instantiate vectorizer object
#
#tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
#
## Create a vocabulary and get word counts per document # Similiar to fit_predict
#
#dtm = tfidf.fit_transform(df["description"])
#
## Print word counts # Get feature names to use as dataframe column headers
#
#dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
#
## View Feature Matrix as DataFrame
#
#dtm.head()

## 6) Create a NearestNeighbor Model. Write the description of your ideal datascience job and query your job listings. 

In [None]:
# Fit on DTM

nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(dtm)

In [None]:
# Ideal Data Science job description and query

ideal_job_desc = ["I want to make so much money doing data science and machine learning."]

ideal = tfidf.transform(ideal_job_desc)

nn.kneighbors(ideal.todense())

In [None]:
# Ideal job 1

df["description"][151]

In [None]:
# Ideal job 2

df["description"][212]

In [None]:
# Ideal job 3

df["description"][210]

In [None]:
# Ideal job 4

df["description"][37]

In [None]:
# Ideal job 5

df["description"][145]

## Stretch Goals

 - Try different visualizations for words and frequencies - what story do you want to tell with the data?
 - Scrape Job Listings for the job title "Data Analyst". How do these differ from Data Scientist Job Listings
 - Try and identify requirements for experience specific technologies that are asked for in the job listings. How are those distributed among the job listings?
 - Use a clustering algorithm to cluster documents by their most important terms. Do the clusters reveal any common themes?
  - **Hint:** K-means might not be the best algorithm for this. Do a little bit of research to see what might be good for this. Also, remember that algorithms that depend on Euclidean distance break down with high dimensional data.
 - Create a labeled dataset - which jobs will you apply for? Train a model to select the jobs you are most likely to apply for. :) 