# IMPORTING LIBRARIES

In [1]:
import re
import nltk
import spacy
import pickle
import string
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
from spacy.tokenizer import Tokenizer

### Importing the cleaned dataset

In [2]:
pickle_in = open("CleanedDataAnalyst.pickle", "rb")
df = pickle.load(pickle_in)
pickle_in.close()

### Viewing the dataset

In [3]:
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Type of ownership,Industry,Sector,Revenue,Easy Apply,min_Salary,max_Salary,New Job Title,Job Position
0,"Data Analyst, Center on Immigration and Justic...",51500.0,Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice,New York,New York,201 to 500 employees,Nonprofit Organization,Social Assistance,Non-Profit,500000000.0,True,37000,66000,Migration Data Analyst,Clerk
1,Quality Data Analyst,51500.0,Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York,New York,New York,10000+ employees,Nonprofit Organization,Health Care Services & Hospitals,Health Care,5000000000.0,False,37000,66000,Product Quality Data Analyst,Clerk
2,"Senior Data Analyst, Insights & Analytics Team...",51500.0,We’re looking for a Senior Data Analyst who ha...,3.4,Squarespace,New York,New York,1001 to 5000 employees,Company - Private,Internet,Information Technology,500000000.0,False,37000,66000,Service & Operations Data Analyst,Senior
3,Data Analyst,51500.0,Requisition NumberRR-0001939\nRemote:Yes\nWe c...,4.1,Celerity,New York,McLean,201 to 500 employees,Subsidiary or Business Segment,IT Services,Information Technology,100000000.0,False,37000,66000,Data Analyst,Clerk
4,Reporting Data Analyst,51500.0,ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,3.9,FanDuel,New York,New York,501 to 1000 employees,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",500000000.0,True,37000,66000,Data Analyst,Reporter


### Columns after cleaning

In [4]:
list(df)

['Job Title',
 'Salary Estimate',
 'Job Description',
 'Rating',
 'Company Name',
 'Location',
 'Headquarters',
 'Size',
 'Type of ownership',
 'Industry',
 'Sector',
 'Revenue',
 'Easy Apply',
 'min_Salary',
 'max_Salary',
 'New Job Title',
 'Job Position']

### Shape of Dataset

In [5]:
df.shape

(2252, 17)

### Taking a look at few Job Descriptions

In [6]:
df.iloc[8]

Job Title                                                 Data Analyst
Salary Estimate                                                  51500
Job Description      The Data Analyst is an integral member of the ...
Rating                                                               4
Company Name                                                      DMGT
Location                                                      New York
Headquarters                                                    London
Size                                           5001 to 10000 employees
Type of ownership                                     Company - Public
Industry                              Venture Capital & Private Equity
Sector                                                         Finance
Revenue                                                          2e+09
Easy Apply                                                       False
min_Salary                                                       37000
max_Sa

In [7]:
df["Job Description"].iloc[8]
#df.iloc[8,2]

"The Data Analyst is an integral member of the global commercial data and analytics team driving commercial insights and opportunities for the world's largest English language newspaper website, DailyMail.com. This is a unique opportunity to work in a fast-paced entrepreneurial environment, with wide exposure to ad-tech and big data platforms.\n\nThe Data Analyst will be responsible for maintaining and optimizing the global commercial data systems, identifying methods to maximize commercial performance and providing business insights to internal stakeholders. This individual will have a genuine passion for digital media and data technology.\n\nDailyMail.com is a division of UK-based DMGT, an international portfolio of digital, information, media and events businesses, which employs over 12,000 people and is listed on the London Stock Exchange (LSE:DMGT.L).\n\nSpecific Responsibilities\nParticipate in cross-functional projects using advanced data modeling and analysis techniques to disc

In [8]:
df.iloc[16]

Job Title                                                 Data Analyst
Salary Estimate                                                  51500
Job Description      Undertone stands alone among AdTech and ad net...
Rating                                                             3.8
Company Name                                                 Undertone
Location                                                      New York
Headquarters                                                  New York
Size                                              201 to 500 employees
Type of ownership                       Subsidiary or Business Segment
Industry                                       Advertising & Marketing
Sector                                               Business Services
Revenue                                                          5e+08
Easy Apply                                                        True
min_Salary                                                       37000
max_Sa

In [9]:
df["Job Description"].iloc[16]
#df.iloc[16,2]

'Undertone stands alone among AdTech and ad network businesses in its ability to address marketing objectives through Synchronized Digital Branding and extraordinarily creative treatments. We drive best in class results for clients through Undertones expansive rich media and video capabilities that are expressed across multiple channels and platforms, matching the consumer journey.\n\nUndertones Data Management Service (UDMS) is a big data, cloud-based data-warehouse, dashboard and reporting environment. Do you want to help enable a data-driven organization? This is your opportunity to join a mission critical team, at an innovative company in an industry just beginning to harness the power of data.\n\nAs member of the Undertones UDMS Team, the Data Analyst drives value by providing provocative, differentiating analytics and insights. This position will support a wide variety of business intelligence efforts across Undertone while working in a highly collaborative manner within multiple

In [10]:
df.iloc[32]

Job Title                                                 Data Analyst
Salary Estimate                                                  66500
Job Description      Job Description:\nLegal experience is required...
Rating                                                             3.5
Company Name                                                    Pozent
Location                                                      New York
Headquarters                                                Piscataway
Size                                                 1 to 50 employees
Type of ownership                                             Contract
Industry                                                   IT Services
Sector                                          Information Technology
Revenue                                                          1e+06
Easy Apply                                                       False
min_Salary                                                       46000
max_Sa

In [11]:
df["Job Description"].iloc[32]
#df.iloc[32,2]

'Job Description:\nLegal experience is required.\nManaging several different data sets - including creation, updates, and deletion.\nProvide quality assurance of imported data, working with quality assurance analyst if necessary.\nTroubleshooting data issues with IT/ Review Team.\nSupporting initiatives for data integrity and normalization.\nGenerating reports from single or multiple systems.\nEvaluating changes and updates to source production systems.\nUAT Testing.'

# TOKENIZING THE JOB DESCRIPTION

In [12]:
nlp = spacy.load("en_core_web_lg")
tokenizer = Tokenizer(nlp.vocab)
STOP_WORDS = nlp.Defaults.stop_words.union(['year'])

### Tokenizer pipe removing stop words and blank words and lemmatizing. Appends a new column containing these tokens.

In [13]:
tokens = []

for doc in tokenizer.pipe(df['Job Description'], batch_size=500):
    
    doc_tokens = []
    for token in doc:
        if (token.lemma_ not in STOP_WORDS) & (token.text != ' '):
            doc_tokens.append(token.lemma_)

    tokens.append(doc_tokens)
    
for i in tokens:
    for j in i:
        j=j.lower()
df['tokens'] = tokens

### Taking a look at the dataset

In [14]:
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Type of ownership,Industry,Sector,Revenue,Easy Apply,min_Salary,max_Salary,New Job Title,Job Position,tokens
0,"Data Analyst, Center on Immigration and Justic...",51500.0,Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice,New York,New York,201 to 500 employees,Nonprofit Organization,Social Assistance,Non-Profit,500000000.0,True,37000,66000,Migration Data Analyst,Clerk,"[Are, eager, roll, sleeve, harness, datum, dri..."
1,Quality Data Analyst,51500.0,Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York,New York,New York,10000+ employees,Nonprofit Organization,Health Care Services & Hospitals,Health Care,5000000000.0,False,37000,66000,Product Quality Data Analyst,Clerk,"[Overview, \n\n, Provides, analytic, technical..."
2,"Senior Data Analyst, Insights & Analytics Team...",51500.0,We’re looking for a Senior Data Analyst who ha...,3.4,Squarespace,New York,New York,1001 to 5000 employees,Company - Private,Internet,Information Technology,500000000.0,False,37000,66000,Service & Operations Data Analyst,Senior,"[We’re, look, Senior, Data, Analyst, love, men..."
3,Data Analyst,51500.0,Requisition NumberRR-0001939\nRemote:Yes\nWe c...,4.1,Celerity,New York,McLean,201 to 500 employees,Subsidiary or Business Segment,IT Services,Information Technology,100000000.0,False,37000,66000,Data Analyst,Clerk,"[Requisition, NumberRR-0001939, \n, Remote:Yes..."
4,Reporting Data Analyst,51500.0,ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,3.9,FanDuel,New York,New York,501 to 1000 employees,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",500000000.0,True,37000,66000,Data Analyst,Reporter,"[ABOUT, FANDUEL, GROUP, \n\n, FanDuel, Group, ..."


# EXTRACTION OF TECHNICAL TERMS FROM TOKENS

### List of technical terms generally used by data analysts.

In [15]:
tech_terms = ['python', 'rlanguage', 'sql', 'hadoop', 'spark', 'java', 'sas', 'tableau',
              'hive', 'scala', 'aws', 'c', 'c++', 'matlab', 'tensorflow', 'excel',
              'nosql', 'linux', 'azure', 'scikit', 'machine learning', 'statistic',
              'analysis', 'computer science', 'visual', 'ai', 'deep learning',
              'nlp', 'natural language processing', 'neural network', 'mathematic',
              'database', 'oop', 'blockchain',
              'html', 'css', 'javascript', 'jquery', 'git', 'photoshop', 'illustrator',
              'word press', 'seo', 'responsive design', 'php', 'mobile', 'design', 'react',
              'security', 'ruby', 'fireworks', 'json', 'node', 'express', 'redux', 'ajax',
              'java', 'api', 'state management',
              'wireframe', 'ui prototype', 'ux writing', 'interactive design',
              'metric', 'analytic', 'ux research', 'empathy', 'collaborate', 'mockup', 
              'prototype', 'test', 'ideate', 'usability', 'high-fidelity design',
              'framework',
              'swift', 'xcode', 'spatial reasoning', 'human interface', 'core data',
              'grand central', 'network', 'objective-c', 'foundation', 'uikit', 
              'cocoatouch', 'spritekit', 'scenekit', 'opengl', 'metal', 'api', 'iot',
              'karma','rochade','google',"cloud"]

### Extracting these terms into a new column.

In [16]:
df['tokens_filtered'] = df.apply(lambda x: list(set(x['tokens']) & set(tech_terms)), axis=1)

In [17]:
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Type of ownership,Industry,Sector,Revenue,Easy Apply,min_Salary,max_Salary,New Job Title,Job Position,tokens,tokens_filtered
0,"Data Analyst, Center on Immigration and Justic...",51500.0,Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice,New York,New York,201 to 500 employees,Nonprofit Organization,Social Assistance,Non-Profit,500000000.0,True,37000,66000,Migration Data Analyst,Clerk,"[Are, eager, roll, sleeve, harness, datum, dri...","[foundation, analysis, statistic, security, da..."
1,Quality Data Analyst,51500.0,Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York,New York,New York,10000+ employees,Nonprofit Organization,Health Care Services & Hospitals,Health Care,5000000000.0,False,37000,66000,Product Quality Data Analyst,Clerk,"[Overview, \n\n, Provides, analytic, technical...","[analysis, statistic, analytic, metric, database]"
2,"Senior Data Analyst, Insights & Analytics Team...",51500.0,We’re looking for a Senior Data Analyst who ha...,3.4,Squarespace,New York,New York,1001 to 5000 employees,Company - Private,Internet,Information Technology,500000000.0,False,37000,66000,Service & Operations Data Analyst,Senior,"[We’re, look, Senior, Data, Analyst, love, men...","[collaborate, analysis, analytic, metric, desi..."
3,Data Analyst,51500.0,Requisition NumberRR-0001939\nRemote:Yes\nWe c...,4.1,Celerity,New York,McLean,201 to 500 employees,Subsidiary or Business Segment,IT Services,Information Technology,100000000.0,False,37000,66000,Data Analyst,Clerk,"[Requisition, NumberRR-0001939, \n, Remote:Yes...","[analytic, design, analysis]"
4,Reporting Data Analyst,51500.0,ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,3.9,FanDuel,New York,New York,501 to 1000 employees,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",500000000.0,True,37000,66000,Data Analyst,Reporter,"[ABOUT, FANDUEL, GROUP, \n\n, FanDuel, Group, ...","[analysis, analytic, network, design, database]"


# EXTRACTION OF LANGUAGES FROM TOKENS

### Importing a list of languages extracted and cleaned from Wikipedia.

In [18]:
lang_df = pd.read_csv("languages.csv")
lang_df.head()

Unnamed: 0,Language
0,Chinese
1,Spanish
2,English
3,Hindi
4,Bengali


In [19]:
lang_array = lang_df["Language"].unique()
lang_array

array(['Chinese', 'Spanish', 'English', 'Hindi', 'Bengali', 'Portuguese',
       'Russian', 'Japanese', 'Western Punjabi', 'Marathi', 'Telugu',
       'Turkish', 'Korean', 'French', 'German', 'Vietnamese', 'Tamil',
       'Urdu ', 'Javanese', 'Italian', 'Arabic', 'Gujarati', 'Persian',
       'Bhojpuri', 'Hausa', 'Kannada', 'Indonesian', 'Polish', 'Yoruba',
       'Malayalam', 'Odia', 'Maithili', 'Burmese', 'Eastern Punjabi',
       'Sunda', 'Ukrainian', 'Igbo', 'Northern Uzbek', 'Sindhi',
       'Romanian', 'Tagalog', 'Dutch', 'Amharic', 'Northern Pashto',
       'Magahi', 'Thai', 'Saraiki', 'Khmer', 'Chhattisgarhi', 'Somali',
       'Malay', 'Cebuano', 'Nepali', 'Assamese', 'Sinhalese',
       'Northern Kurdish', 'Nigerian Fulfulde', 'Bavarian',
       'South Azerbaijani', 'Greek', 'Chittagonian', 'Kazakh', 'Deccan',
       'Hungarian', 'Kinyarwanda', 'Zulu', 'Southern Pashto', 'Rundi',
       'Czech', 'Uyghur', 'Sylheti'], dtype=object)

In [20]:
df['languages_filtered'] = df.apply(lambda x: list(set(x['tokens']) & set(lang_array)), axis=1)

In [21]:
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Type of ownership,Industry,Sector,Revenue,Easy Apply,min_Salary,max_Salary,New Job Title,Job Position,tokens,tokens_filtered,languages_filtered
0,"Data Analyst, Center on Immigration and Justic...",51500.0,Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice,New York,New York,201 to 500 employees,Nonprofit Organization,Social Assistance,Non-Profit,500000000.0,True,37000,66000,Migration Data Analyst,Clerk,"[Are, eager, roll, sleeve, harness, datum, dri...","[foundation, analysis, statistic, security, da...",[]
1,Quality Data Analyst,51500.0,Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York,New York,New York,10000+ employees,Nonprofit Organization,Health Care Services & Hospitals,Health Care,5000000000.0,False,37000,66000,Product Quality Data Analyst,Clerk,"[Overview, \n\n, Provides, analytic, technical...","[analysis, statistic, analytic, metric, database]",[]
2,"Senior Data Analyst, Insights & Analytics Team...",51500.0,We’re looking for a Senior Data Analyst who ha...,3.4,Squarespace,New York,New York,1001 to 5000 employees,Company - Private,Internet,Information Technology,500000000.0,False,37000,66000,Service & Operations Data Analyst,Senior,"[We’re, look, Senior, Data, Analyst, love, men...","[collaborate, analysis, analytic, metric, desi...",[]
3,Data Analyst,51500.0,Requisition NumberRR-0001939\nRemote:Yes\nWe c...,4.1,Celerity,New York,McLean,201 to 500 employees,Subsidiary or Business Segment,IT Services,Information Technology,100000000.0,False,37000,66000,Data Analyst,Clerk,"[Requisition, NumberRR-0001939, \n, Remote:Yes...","[analytic, design, analysis]",[]
4,Reporting Data Analyst,51500.0,ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,3.9,FanDuel,New York,New York,501 to 1000 employees,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",500000000.0,True,37000,66000,Data Analyst,Reporter,"[ABOUT, FANDUEL, GROUP, \n\n, FanDuel, Group, ...","[analysis, analytic, network, design, database]",[]


# CLEANING BEFORE COUNT VECTORIZATION

- Many rows have an empty list of techical terms, because the job description did not contain any, so we are going to add Python as default values here.

- Similarly many job decriptions have not specified any particular spoken language expertise so we are adding English by default.

In [22]:
def fill_with_pyr(i):
    if len(i) == 0:
        i = ["python","rlanguage"]
    return i

In [23]:
def fill_with_eng(i):
    if len(i) == 0:
        i = ["English"]
    return i

In [24]:
df["tokens_filtered"] = df["tokens_filtered"].apply(fill_with_pyr)

In [25]:
df["languages_filtered"] = df["languages_filtered"].apply(fill_with_eng)

In [26]:
df

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Type of ownership,Industry,Sector,Revenue,Easy Apply,min_Salary,max_Salary,New Job Title,Job Position,tokens,tokens_filtered,languages_filtered
0,"Data Analyst, Center on Immigration and Justic...",51500.0,Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice,New York,New York,201 to 500 employees,Nonprofit Organization,Social Assistance,Non-Profit,5.000000e+08,True,37000,66000,Migration Data Analyst,Clerk,"[Are, eager, roll, sleeve, harness, datum, dri...","[foundation, analysis, statistic, security, da...",[English]
1,Quality Data Analyst,51500.0,Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York,New York,New York,10000+ employees,Nonprofit Organization,Health Care Services & Hospitals,Health Care,5.000000e+09,False,37000,66000,Product Quality Data Analyst,Clerk,"[Overview, \n\n, Provides, analytic, technical...","[analysis, statistic, analytic, metric, database]",[English]
2,"Senior Data Analyst, Insights & Analytics Team...",51500.0,We’re looking for a Senior Data Analyst who ha...,3.4,Squarespace,New York,New York,1001 to 5000 employees,Company - Private,Internet,Information Technology,5.000000e+08,False,37000,66000,Service & Operations Data Analyst,Senior,"[We’re, look, Senior, Data, Analyst, love, men...","[collaborate, analysis, analytic, metric, desi...",[English]
3,Data Analyst,51500.0,Requisition NumberRR-0001939\nRemote:Yes\nWe c...,4.1,Celerity,New York,McLean,201 to 500 employees,Subsidiary or Business Segment,IT Services,Information Technology,1.000000e+08,False,37000,66000,Data Analyst,Clerk,"[Requisition, NumberRR-0001939, \n, Remote:Yes...","[analytic, design, analysis]",[English]
4,Reporting Data Analyst,51500.0,ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,3.9,FanDuel,New York,New York,501 to 1000 employees,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",5.000000e+08,True,37000,66000,Data Analyst,Reporter,"[ABOUT, FANDUEL, GROUP, \n\n, FanDuel, Group, ...","[analysis, analytic, network, design, database]",[English]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2248,RQS - IHHA - 201900004460 -1q Data Security An...,91000.0,Maintains systems to protect data from unautho...,2.5,"Avacend, Inc.",Denver,Alpharetta,51 to 200 employees,Company - Private,Staffing & Outsourcing,Business Services,5.000000e+08,False,78000,104000,Security Data Analyst,Clerk,"[Maintains, system, protect, datum, unauthoriz...",[security],[English]
2249,Senior Data Analyst (Corporate Audit),91000.0,Position:\nSenior Data Analyst (Corporate Audi...,2.9,Arrow Electronics,Centennial,Centennial,10000+ employees,Company - Public,Wholesale,Business Services,1.000000e+10,False,78000,104000,Data Analyst,Senior,"[Position:, \n, Senior, Data, Analyst, (Corpor...","[test, analysis, statistic, analytic, design, ...",[English]
2250,"Technical Business Analyst (SQL, Data analytic...",91000.0,"Title: Technical Business Analyst (SQL, Data a...",3.7,Spiceorb,Denver,Denver,1001 to 5000 employees,Company - Private,IT Services,Information Technology,5.000000e+08,False,78000,104000,Business Intelligence Analyst,Clerk,"[Title:, Technical, Business, Analyst, (SQL,, ...","[python, rlanguage]",[English]
2251,"Data Analyst 3, Customer Experience",91000.0,Summary\n\nResponsible for working cross-funct...,3.1,Contingent Network Services,Centennial,West Chester,201 to 500 employees,Company - Private,Enterprise Software & Network Solutions,Information Technology,5.000000e+07,False,78000,104000,Public Data Analyst,Clerk,"[Summary, \n\n, Responsible, work, cross-funct...","[network, analytic, analysis]",[English]


### Since count vectorization works well with strings we have to convert our extracted features into a string.

In [27]:
def list_stringify(l):
    string = " ".join(l)
    return string

In [28]:
df["tokens_filtered"] = df["tokens_filtered"].apply(list_stringify)

In [29]:
df["languages_filtered"] = df["languages_filtered"].apply(list_stringify)

In [30]:
new_df = df
new_df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Type of ownership,Industry,Sector,Revenue,Easy Apply,min_Salary,max_Salary,New Job Title,Job Position,tokens,tokens_filtered,languages_filtered
0,"Data Analyst, Center on Immigration and Justic...",51500.0,Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice,New York,New York,201 to 500 employees,Nonprofit Organization,Social Assistance,Non-Profit,500000000.0,True,37000,66000,Migration Data Analyst,Clerk,"[Are, eager, roll, sleeve, harness, datum, dri...",foundation analysis statistic security database,English
1,Quality Data Analyst,51500.0,Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York,New York,New York,10000+ employees,Nonprofit Organization,Health Care Services & Hospitals,Health Care,5000000000.0,False,37000,66000,Product Quality Data Analyst,Clerk,"[Overview, \n\n, Provides, analytic, technical...",analysis statistic analytic metric database,English
2,"Senior Data Analyst, Insights & Analytics Team...",51500.0,We’re looking for a Senior Data Analyst who ha...,3.4,Squarespace,New York,New York,1001 to 5000 employees,Company - Private,Internet,Information Technology,500000000.0,False,37000,66000,Service & Operations Data Analyst,Senior,"[We’re, look, Senior, Data, Analyst, love, men...",collaborate analysis analytic metric design excel,English
3,Data Analyst,51500.0,Requisition NumberRR-0001939\nRemote:Yes\nWe c...,4.1,Celerity,New York,McLean,201 to 500 employees,Subsidiary or Business Segment,IT Services,Information Technology,100000000.0,False,37000,66000,Data Analyst,Clerk,"[Requisition, NumberRR-0001939, \n, Remote:Yes...",analytic design analysis,English
4,Reporting Data Analyst,51500.0,ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,3.9,FanDuel,New York,New York,501 to 1000 employees,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",500000000.0,True,37000,66000,Data Analyst,Reporter,"[ABOUT, FANDUEL, GROUP, \n\n, FanDuel, Group, ...",analysis analytic network design database,English


# COUNT VECTORIZATION

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500, analyzer='word', lowercase=False)

In [32]:
tech_cv = cv.fit_transform(new_df["tokens_filtered"])
tech_cv = pd.DataFrame(tech_cv.toarray(), columns=cv.get_feature_names())

In [33]:
tech_cv

Unnamed: 0,analysis,analytic,azure,blockchain,cloud,collaborate,database,design,empathy,excel,...,react,rlanguage,security,sql,statistic,swift,tableau,test,usability,visual
0,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
1,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,1,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2247,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2248,1,1,0,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,1,0,0
2249,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2250,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
lang_cv = cv.fit_transform(new_df["languages_filtered"])
lang_cv = pd.DataFrame(lang_cv.toarray(), columns=cv.get_feature_names())

In [35]:
lang_cv

Unnamed: 0,Chinese,English,French,German,Indonesian,Italian,Japanese,Korean,Portuguese,Russian,Spanish
0,0,1,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2247,0,1,0,0,0,0,0,0,0,0,0
2248,0,1,0,0,0,0,0,0,0,0,0
2249,0,1,0,0,0,0,0,0,0,0,0
2250,0,1,0,0,0,0,0,0,0,0,0


In [36]:
new_df = new_df.join(tech_cv,how="inner")

In [37]:
new_df = new_df.join(lang_cv,how="inner")
new_df

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Type of ownership,Industry,...,English,French,German,Indonesian,Italian,Japanese,Korean,Portuguese,Russian,Spanish
0,"Data Analyst, Center on Immigration and Justic...",51500.0,Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice,New York,New York,201 to 500 employees,Nonprofit Organization,Social Assistance,...,1,0,0,0,0,0,0,0,0,0
1,Quality Data Analyst,51500.0,Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York,New York,New York,10000+ employees,Nonprofit Organization,Health Care Services & Hospitals,...,1,0,0,0,0,0,0,0,0,0
2,"Senior Data Analyst, Insights & Analytics Team...",51500.0,We’re looking for a Senior Data Analyst who ha...,3.4,Squarespace,New York,New York,1001 to 5000 employees,Company - Private,Internet,...,1,0,0,0,0,0,0,0,0,0
3,Data Analyst,51500.0,Requisition NumberRR-0001939\nRemote:Yes\nWe c...,4.1,Celerity,New York,McLean,201 to 500 employees,Subsidiary or Business Segment,IT Services,...,1,0,0,0,0,0,0,0,0,0
4,Reporting Data Analyst,51500.0,ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,3.9,FanDuel,New York,New York,501 to 1000 employees,Company - Private,Sports & Recreation,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2247,Marketing/Communications - Data Analyst-Marketing,91000.0,Job Description\nJob Title: Marketing/Communic...,4.1,APN Software Services Inc.,Broomfield,Newark,51 to 200 employees,Company - Private,Computer Hardware & Software,...,1,0,0,0,0,0,0,0,0,0
2248,RQS - IHHA - 201900004460 -1q Data Security An...,91000.0,Maintains systems to protect data from unautho...,2.5,"Avacend, Inc.",Denver,Alpharetta,51 to 200 employees,Company - Private,Staffing & Outsourcing,...,1,0,0,0,0,0,0,0,0,0
2249,Senior Data Analyst (Corporate Audit),91000.0,Position:\nSenior Data Analyst (Corporate Audi...,2.9,Arrow Electronics,Centennial,Centennial,10000+ employees,Company - Public,Wholesale,...,1,0,0,0,0,0,0,0,0,0
2250,"Technical Business Analyst (SQL, Data analytic...",91000.0,"Title: Technical Business Analyst (SQL, Data a...",3.7,Spiceorb,Denver,Denver,1001 to 5000 employees,Company - Private,IT Services,...,1,0,0,0,0,0,0,0,0,0


In [38]:
pd.set_option('display.max_columns', None)
new_df

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Type of ownership,Industry,Sector,Revenue,Easy Apply,min_Salary,max_Salary,New Job Title,Job Position,tokens,tokens_filtered,languages_filtered,analysis,analytic,azure,blockchain,cloud,collaborate,database,design,empathy,excel,express,foundation,framework,git,hadoop,java,javascript,linux,mathematic,metal,metric,mobile,mockup,network,prototype,python,react,rlanguage,security,sql,statistic,swift,tableau,test,usability,visual,Chinese,English,French,German,Indonesian,Italian,Japanese,Korean,Portuguese,Russian,Spanish
0,"Data Analyst, Center on Immigration and Justic...",51500.0,Are you eager to roll up your sleeves and harn...,3.2,Vera Institute of Justice,New York,New York,201 to 500 employees,Nonprofit Organization,Social Assistance,Non-Profit,5.000000e+08,True,37000,66000,Migration Data Analyst,Clerk,"[Are, eager, roll, sleeve, harness, datum, dri...",foundation analysis statistic security database,English,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,Quality Data Analyst,51500.0,Overview\n\nProvides analytical and technical ...,3.8,Visiting Nurse Service of New York,New York,New York,10000+ employees,Nonprofit Organization,Health Care Services & Hospitals,Health Care,5.000000e+09,False,37000,66000,Product Quality Data Analyst,Clerk,"[Overview, \n\n, Provides, analytic, technical...",analysis statistic analytic metric database,English,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,"Senior Data Analyst, Insights & Analytics Team...",51500.0,We’re looking for a Senior Data Analyst who ha...,3.4,Squarespace,New York,New York,1001 to 5000 employees,Company - Private,Internet,Information Technology,5.000000e+08,False,37000,66000,Service & Operations Data Analyst,Senior,"[We’re, look, Senior, Data, Analyst, love, men...",collaborate analysis analytic metric design excel,English,1,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,Data Analyst,51500.0,Requisition NumberRR-0001939\nRemote:Yes\nWe c...,4.1,Celerity,New York,McLean,201 to 500 employees,Subsidiary or Business Segment,IT Services,Information Technology,1.000000e+08,False,37000,66000,Data Analyst,Clerk,"[Requisition, NumberRR-0001939, \n, Remote:Yes...",analytic design analysis,English,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,Reporting Data Analyst,51500.0,ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,3.9,FanDuel,New York,New York,501 to 1000 employees,Company - Private,Sports & Recreation,"Arts, Entertainment & Recreation",5.000000e+08,True,37000,66000,Data Analyst,Reporter,"[ABOUT, FANDUEL, GROUP, \n\n, FanDuel, Group, ...",analysis analytic network design database,English,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2247,Marketing/Communications - Data Analyst-Marketing,91000.0,Job Description\nJob Title: Marketing/Communic...,4.1,APN Software Services Inc.,Broomfield,Newark,51 to 200 employees,Company - Private,Computer Hardware & Software,Information Technology,5.000000e+07,False,78000,104000,Market Data Analyst,Clerk,"[Job, Description, \n, Job, Title:, Marketing/...",python rlanguage,English,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2248,RQS - IHHA - 201900004460 -1q Data Security An...,91000.0,Maintains systems to protect data from unautho...,2.5,"Avacend, Inc.",Denver,Alpharetta,51 to 200 employees,Company - Private,Staffing & Outsourcing,Business Services,5.000000e+08,False,78000,104000,Security Data Analyst,Clerk,"[Maintains, system, protect, datum, unauthoriz...",security,English,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
2249,Senior Data Analyst (Corporate Audit),91000.0,Position:\nSenior Data Analyst (Corporate Audi...,2.9,Arrow Electronics,Centennial,Centennial,10000+ employees,Company - Public,Wholesale,Business Services,1.000000e+10,False,78000,104000,Data Analyst,Senior,"[Position:, \n, Senior, Data, Analyst, (Corpor...",test analysis statistic analytic design database,English,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2250,"Technical Business Analyst (SQL, Data analytic...",91000.0,"Title: Technical Business Analyst (SQL, Data a...",3.7,Spiceorb,Denver,Denver,1001 to 5000 employees,Company - Private,IT Services,Information Technology,5.000000e+08,False,78000,104000,Business Intelligence Analyst,Clerk,"[Title:, Technical, Business, Analyst, (SQL,, ...",python rlanguage,English,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [39]:
new_df.to_csv("Data_Analyst_Count_Vectorized.csv",index=False)

In [40]:
new_df.describe()

Unnamed: 0,Salary Estimate,Rating,Revenue,min_Salary,max_Salary,analysis,analytic,azure,blockchain,cloud,collaborate,database,design,empathy,excel,express,foundation,framework,git,hadoop,java,javascript,linux,mathematic,metal,metric,mobile,mockup,network,prototype,python,react,rlanguage,security,sql,statistic,swift,tableau,test,usability,visual,Chinese,English,French,German,Indonesian,Italian,Japanese,Korean,Portuguese,Russian,Spanish
count,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0,2251.0
mean,72107.063527,3.72821,1719488000.0,54251.443803,89962.683252,0.568192,0.429587,0.000444,0.000888,0.037761,0.099511,0.344291,0.311861,0.002221,0.01777,0.010662,0.023989,0.044869,0.00311,0.001777,0.000444,0.000444,0.000444,0.000888,0.000888,0.148823,0.036428,0.000444,0.083074,0.013327,0.116393,0.002665,0.111062,0.089738,0.003554,0.084407,0.000444,0.00311,0.220347,0.003554,0.021324,0.001777,0.990227,0.000444,0.002665,0.000444,0.000888,0.002221,0.000888,0.000444,0.000888,0.006664
std,23604.097551,0.628875,3107175000.0,19575.014492,29321.489013,0.495438,0.495127,0.021077,0.029801,0.19066,0.299414,0.475242,0.463356,0.047088,0.132143,0.102728,0.15305,0.207062,0.055691,0.042126,0.021077,0.021077,0.021077,0.029801,0.029801,0.355993,0.187395,0.021077,0.276056,0.114698,0.320767,0.051571,0.314278,0.285869,0.059522,0.278059,0.021077,0.055691,0.414572,0.059522,0.144494,0.042126,0.098398,0.021077,0.051571,0.021077,0.029801,0.047088,0.029801,0.021077,0.029801,0.081377
min,33500.0,1.0,1000000.0,24000.0,38000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,58000.0,3.4,100000000.0,41000.0,70000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,69000.0,3.7,500000000.0,50000.0,87000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,80500.0,4.0,500000000.0,64000.0,104000.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,150000.0,5.0,10000000000.0,113000.0,190000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
