In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import re

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk import WordNetLemmatizer

font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 22}

In [13]:
# nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Arun\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets.zip.


True

In [2]:
# Read Dataset
df_processed = pd.read_csv(r'processed_data.csv')
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5413 entries, 0 to 5412
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          5413 non-null   int64 
 1   Job_Title           5413 non-null   object
 2   Description         5413 non-null   object
 3   lower_description   5413 non-null   object
 4   word_tokenized      5413 non-null   object
 5   sentence_tokenized  5413 non-null   object
 6   word_count          5413 non-null   int64 
 7   sentence_count      5413 non-null   int64 
 8   clean_words         5413 non-null   object
 9   clean_stemmed       5413 non-null   object
 10  clean_lemmed        5413 non-null   object
dtypes: int64(3), object(8)
memory usage: 465.3+ KB


In [3]:
# duplicates reduce the df by 651 observations
df_processed.drop_duplicates(inplace = True)
df_processed.shape

(5413, 11)

In [4]:
df_processed.head()

Unnamed: 0.1,Unnamed: 0,Job_Title,Description,lower_description,word_tokenized,sentence_tokenized,word_count,sentence_count,clean_words,clean_stemmed,clean_lemmed
0,0,Data Scientist,"POSITION SUMMARY, The Business Analyst role is...","position summary, the business analyst role is...","['position', 'summary', 'the', 'business', 'an...","['POSITION SUMMARY, The Business Analyst role ...",424,25,"['position', 'summary', 'business', 'analyst',...","['posit', 'summari', 'busi', 'analyst', 'role'...","['position', 'summary', 'business', 'analyst',..."
1,1,Data Scientist,"What do we need?, You to have an amazing perso...","what do we need?, you to have an amazing perso...","['what', 'do', 'we', 'need', 'you', 'to', 'hav...","['What do we need?, You to have an amazing per...",286,10,"['need', 'amazing', 'personality', 'communicat...","['need', 'amaz', 'person', 'commun', 'style', ...","['need', 'amazing', 'personality', 'communicat..."
2,2,Data Scientist,"Validate, analyze, and conduct statistical ana...","validate, analyze, and conduct statistical ana...","['validate', 'analyze', 'and', 'conduct', 'sta...","['Validate, analyze, and conduct statistical a...",314,24,"['validate', 'analyze', 'conduct', 'statistica...","['valid', 'analyz', 'conduct', 'statist', 'ana...","['validate', 'analyze', 'conduct', 'statistica..."
3,3,Graduate Studies Program - Data Scientist,"Full time, Washington, DC metro area, Starting...","full time, washington, dc metro area, starting...","['full', 'time', 'washington', 'dc', 'metro', ...","['Full time, Washington, DC metro area, Starti...",297,13,"['full', 'time', 'washington', 'dc', 'metro', ...","['full', 'time', 'washington', 'dc', 'metro', ...","['full', 'time', 'washington', 'dc', 'metro', ..."
4,4,Data Scientist I,Assist in consultations with business partners...,assist in consultations with business partners...,"['assist', 'in', 'consultations', 'with', 'bus...",['Assist in consultations with business partne...,316,7,"['assist', 'consultations', 'business', 'partn...","['assist', 'consult', 'busi', 'partner', 'inte...","['assist', 'consultation', 'business', 'partne..."


Parts of Speech (POS)

In [5]:
def pos_series(keyword):
    '''categorizes parts of speech after tokenizing words with POS tags'''
    tokens = nltk.word_tokenize(keyword)
    tagged = nltk.pos_tag(tokens)
    return tagged

In [8]:
pos_tagged_arrs = df_processed.lower_description.apply(pos_series)

In [9]:
pos_tagged = []
for row in pos_tagged_arrs.values:
    for element in row:
        pos_tagged.append(element)

In [10]:
pos_df = pd.DataFrame(pos_tagged, columns = ('word','POS'))
# special chars were removed due to irrelevance as a tag but will be included in regex
char_removal = [',', '.', ':', '#', '$', '\'\'', '``', '(', ')']
drop_indices = (pos_df.loc[pos_df.POS.isin(char_removal)].index)
pos_df.drop(drop_indices, inplace = True)

In [11]:
# frequency of the unique pos tags within the dataset
pos_sum = pos_df.groupby('POS').count() # group by POS tags
pos_sum.sort_values(['word'], ascending=[False]) # in descending order of number of words per tag

Unnamed: 0_level_0,word
POS,Unnamed: 1_level_1
NN,592948
NNS,303335
JJ,261243
IN,213634
CC,157618
DT,105657
VB,99592
VBG,74037
TO,64907
VBP,40224


In [14]:
for tag in pos_df.POS.unique():
    print(nltk.help.upenn_tagset(tag))

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
None
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
None
VBZ: verb, present tense, 3rd person singular
    bases reconstructs marks mixes displeases seals carps weaves snatches
    slumps stretches authorizes smolders pictures emerges stockpiles
    seduces fizzes uses bolsters slaps speaks pleads ...
None
JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
None
IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around

In [15]:
pos_tagged_arrs[578]

[('current', 'JJ'),
 ('employees', 'NNS'),
 ('and', 'CC'),
 ('contingent', 'JJ'),
 ('workers', 'NNS'),
 ('click', 'VBP'),
 ('here', 'RB'),
 ('to', 'TO'),
 ('apply', 'VB'),
 ('and', 'CC'),
 ('search', 'VB'),
 ('by', 'IN'),
 ('the', 'DT'),
 ('job', 'NN'),
 ('posting', 'VBG'),
 ('title', 'NN'),
 ('.', '.'),
 ('job', 'NN'),
 ('summary', 'JJ'),
 (':', ':'),
 (',', ','),
 ('as', 'IN'),
 ('a', 'DT'),
 ('data', 'NN'),
 ('scientist', 'NN'),
 (',', ','),
 ('you', 'PRP'),
 ('will', 'MD'),
 (':', ':'),
 (',', ','),
 ('use', 'VBP'),
 ('emerging', 'VBG'),
 ('tools', 'NNS'),
 ('and', 'CC'),
 ('technology', 'NN'),
 ('to', 'TO'),
 ('develop', 'VB'),
 ('analytical', 'JJ'),
 ('models', 'NNS'),
 ('and', 'CC'),
 ('automation', 'NN'),
 ('in', 'IN'),
 ('music', 'NN'),
 ('planning', 'NN'),
 (',', ','),
 ('music', 'NN'),
 ('research', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('other', 'JJ'),
 ('areas', 'NNS'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('communicate', 'NN'),
 ('complex', 'JJ'),
 ('solutio