In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
import os
import numpy as np
import pandas as pd
from collections import Counter
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import stopwords
stop = stopwords.words('english')
import string
punctuations = list(string.punctuation)
stop = stop + punctuations

In [2]:

def ModPosTag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def simple_clean(words):
    output_words = [w.lower() for w in words if not w.lower() in stop]
    return output_words

def complex_clean(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            clean_w = lemmatizer.lemmatize(w, pos = ModPosTag(pos[0][1]))
            output_words.append(clean_w.lower())
    return output_words


def read_words(words_dir):

    files = [os.path.join(words_dir, fi) for fi in os.listdir(words_dir)]

    #features_matrix = np.zeros((len(files), 3000))
    docID = 0;
    all_words = []
    all_words_heading = []
    major_words_abstract = []
    for fil in files:
        with open(fil) as fi:
            words_abstract = []
            for i, line in enumerate(fi):
                if (i == 0):
                    temp = line.split()
                    temp1 = simple_clean(temp)
                    words_abstract += temp1
                    all_words_heading += temp1
                else:
                    temp = line.split()
                    temp1 = complex_clean(temp)
                    words_abstract += temp1
                    all_words += temp1

            major_words_abstract.append(words_abstract)
            len(all_words)
    return major_words_abstract, all_words, all_words_heading

def tag_updater(df, variable_code):
    for key, values in variable_code.items():
        temp = str(key)
        area = variable_code[key]
        for val in area:
            df.at[val-1, temp] = 1

    return df


def read_tags(words_dir):
    files = [os.path.join(words_dir, fi) for fi in os.listdir(words_dir)]

    # features_matrix = np.zeros((len(files), 3000))
    docID = 0;
    tagss = set()
    for fil in files:
        docID+=1
        with open(fil) as fi:
            for i, line in enumerate(fi):
                temp = line.split(',')
                tagss.update(temp)
    return docID, tagss




In [3]:
mydict = {}

def read_files(words_directory, tags_directory):

    wo = [os.path.join(words_directory, wi) for wi in os.listdir(words_directory)]
    ta = [os.path.join(tags_directory, ti) for ti in os.listdir(tags_directory)]
    ctr=0
    for (ab, t) in zip(wo, ta):
        ctr +=1
        with open(t) as su:
            for i, line in enumerate(su):
                tag = line.split(',')
                for q in tag:
                    if q not in mydict:
                        mydict[q] = []
                    mydict[q].append(ctr)

    return mydict,ctr

def extract_features(Words, features):
    feature_matrix = np.zeros((len(Words), len(features)))
    docID = 0
    for doc in Words:
        for word in doc:
            for i in range(len(features)):
                if features[i] == word:
                    wordID = i
                    feature_matrix[docID, wordID] += 1
        docID = docID + 1
    np.shape(feature_matrix)
    print(feature_matrix)
    return feature_matrix

In [4]:
words_dir = 'abstract'
tags_directory = 'tags'
Words, all_words, all_words_heading = read_words(words_dir)


In [5]:
cou, Tags = read_tags(tags_directory)
#df = pd.DataFrame(Tags)
Words = np.array(Words)
# exploring frequency of all words not in heading
import nltk
freq = nltk.FreqDist(all_words + all_words_heading)
common = freq.most_common(3000)
common = list(common)

features = []
features += [w[0] for w in common]
features += [w for w in all_words_heading if w not in common]
print(len(common))
print(len(features))


3000
4124


In [6]:


variable_code, ctr = read_files(words_dir, tags_directory)
columns = list(Tags)
rows = list(range(ctr))
dataFrame = np.zeros((len(rows), len(columns)))
#df = tag_updater(dataFrame, variable_code, columns)

df = pd.DataFrame(data = dataFrame, index = rows, columns = columns, dtype='int64')
print(variable_code)
Y = tag_updater(df, variable_code)
#print(df.at[1, ' Machine learning '])
# for x in df['Computers']:
#     print(x)



X = extract_features(Words, features)
X_train = pd.DataFrame(data = X, index = rows, columns = features, dtype='int64')


# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())
# train
classifier.fit(X_train, Y)
# predict
#predictions = classifier.predict(X_train)
#print(predictions.toarray())
#print(accuracy_score(Y, predictions))



[[2. 5. 0. ... 1. 1. 1.]
 [4. 7. 0. ... 0. 0. 0.]
 [8. 3. 0. ... 0. 0. 0.]
 ...
 [7. 5. 4. ... 0. 0. 0.]
 [8. 0. 4. ... 0. 0. 0.]
 [0. 1. 0. ... 3. 3. 1.]]


BinaryRelevance(classifier=GaussianNB(priors=None),
        require_dense=[True, True])

In [45]:
words_dir = 'testing'
Words, waste1, waste2 = read_words(words_dir)
X = extract_features(Words, features)
rows = list(range(1))
X_test = pd.DataFrame(data = X, columns = features, dtype='int64')

[[2. 2. 0. ... 0. 0. 0.]]


In [46]:
predictions = classifier.predict(X_test)

In [47]:
from scipy.sparse import csc_matrix
print(type(predictions))
a = predictions.nonzero()
#a.row[a.data]

<class 'scipy.sparse.csc.csc_matrix'>


In [48]:
def find_index(self):
    A = self.tocoo()
    nz_mask = A.data != 0
    return (list(A.col[nz_mask]))

In [49]:
b = find_index(predictions)
b

[854, 1036, 1713]

In [50]:
we_have = []
print('------------------------TECHNICAL SKILLS REQUIRED-----------------------------')
for i in b:
    print(columns[i])
    we_have.append(columns[i])
    
#we_have.append('artificial intelligence')
#we_have.append('Machine learning algorithms')
#we_have.append('Support vector machines')
#we_have.append('Kernel')

print(we_have)

------------------------TECHNICAL SKILLS REQUIRED-----------------------------
Machine learning 
 Machine learning algorithms 
 Support vector machines 
['Machine learning ', ' Machine learning algorithms ', ' Support vector machines ']


In [51]:
#this is keyword extractor

In [52]:
#next comes the wiki reader

In [53]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import nltk
import pickle


lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''


def load_skill_set():

    skill_dict = dict()
    pick_file = open("final_skill_dict.pickle", "rb")
    skill_dict = pickle.load(pick_file)
    pick_file.close()
    return skill_dict


skill_dict = load_skill_set()


def get_skills(skill_dic,topic):

    # print("getting skill set for ",topic)
    skill_set = set()
    for word in topic.split(' '):
        # print("for word=",word)
        if word.strip().isalpha():
            word = word.strip().lower()
            wordnet_pos = get_wordnet_pos(nltk.pos_tag([word])[0][1])
            if wordnet_pos == '':
                word = lemmatizer.lemmatize(word)
            else:
                word = lemmatizer.lemmatize(word, wordnet_pos)

            try:
                curr_skill_set = skill_dic[word]
                # print("skill set=",curr_skill_set)
                if len(skill_set) == 0:
                    skill_set = curr_skill_set
                else:
                    skill_set = skill_set.intersection(curr_skill_set)

                #print("intersection=",skill_set)
            except KeyError as e:
                print("no skill set found",e)

    return skill_set


print(get_skills(skill_dict,'opportunity'))


{'evaluation of commercial opportunities', 'identifying and seizing fast breaking opportunities', 'opportunity management', 'identification and evaluation of market opportunities'}


In [54]:
import urllib.request
from bs4 import BeautifulSoup


def get_page(topic):
    domain = "https://en.wikipedia.org"
    html = urllib.request.urlopen("https://en.wikipedia.org/w/index.php?search="+topic.replace(' ','+')+"&title=Special%3ASearch&go=Go")
    soup = BeautifulSoup(html, features="lxml")
    first_result = soup.find(attrs={"data-serp-pos": "0"})
    if first_result is None:
        print('page-found')
        return soup
    href = first_result.get('href')
    print('opening first-result')
    html = urllib.request.urlopen(domain+href)
    soup = BeautifulSoup(html, features="lxml")
    return soup


def get_first_para(topic):
    soup = get_page(topic)
    text_section = soup.find(attrs={'class': 'mw-parser-output'})
    text = ''
    for child in text_section.children:
        # print('for tag', child.name, child)
        try:
            if child is not None:
                if child.name == 'p':
                    text += child.text.lower()
                elif child.name == 'div' and 'toc' in child['class']:
                    break
        except Exception as e:
            print("exception", e)
    return text

vo_set = set()

In [55]:
import re
# cleaning we _have 
dore = []
for x in we_have:
    regex = re.compile('[^a-zA-Z]')
    #First parameter is the replacement, second parameter is your input string
    #regex.sub('', x)
    dore.append(re.sub("[^a-zA-Z_ ]*", "", x))

for y in dore:
    print(y)

Machine learning 
 Machine learning algorithms 
 Support vector machines 


In [56]:
import csv
with open('tech2.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(dore)

In [57]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize


if __name__ == '__main__':

    #print(get_first_para('computer vision'))
    for original in dore: 
        print(original)
        w1=get_first_para(original)
        w2=word_tokenize(w1)
        
        print(w1)

        for word in w2:                                                                 
            if word not in stop and word.__len__()>3 and word.isalpha():
                w3 = get_skills(skill_dict, word)
                for x in w3:
                    vo_set.add(x)
                
                


Machine learning 
page-found
machine learning (ml) is a field of artificial intelligence that uses statistical techniques to give computer systems the ability to "learn" (e.g., progressively improve performance on a specific task) from data, without being explicitly programmed.[2]
the name machine learning was coined in 1959 by arthur samuel.[1] machine learning explores the study and construction of algorithms that can learn from and make predictions on data[3] – such algorithms overcome following strictly static program instructions by making data-driven predictions or decisions,[4]:2 through building a model from sample inputs. machine learning is employed in a range of computing tasks where designing and programming explicit algorithms with good performance is difficult or infeasible; example applications include email filtering, detection of network intruders, and computer vision.
machine learning is closely related to (and often overlaps with) computational statistics, which also

page-found
in machine learning, support vector machines (svms, also support vector networks[1]) are supervised learning models with associated learning algorithms that analyze data used for classification and regression analysis.  given a set of training examples, each marked as belonging to one or the other of two categories, an svm training algorithm builds a model that assigns new examples to one category or the other, making it a non-probabilistic binary linear classifier (although methods such as platt scaling exist to use svm in a probabilistic classification setting). an svm model is a representation of the examples as points in space, mapped so that the examples of the separate categories are divided by a clear gap that is as wide as possible. new examples are then mapped into that same space and predicted to belong to a category based on which side of the gap they fall.
in addition to performing linear classification, svms can efficiently perform a non-linear classification us

In [58]:
vo_set

{'abc analysis',
 'ability to analyze your own emotions',
 "ability to see other people's point of view",
 'ability to simplify complex ideas',
 'ability to take criticism',
 'accountability',
 'accountability matrices',
 'action focus',
 'action-oriented',
 'active listening',
 'adaptability',
 'adaptability & resilience',
 'aesthetic sense',
 'agreeableness',
 'aleatory techniques',
 'aligning others around short term and long range plans',
 'analogies',
 'analysis of customer buy cycles',
 'analysis of customer-side politics',
 'analyzing rfps',
 'appealing to group narcissism (e.g. chicago is my favorite town!)',
 'applying data to decision making',
 'applying data to decisions',
 'applying economic models',
 'applying innovation techniques',
 'applying patterns of market change',
 'architecture management',
 'arrangement',
 'artistic ability',
 'artistic sense',
 'assertiveness',
 'attunement to social norms',
 'benchmarking',
 'benefit optimization',
 'benefits analysis',
 'benef

In [59]:
from nltk.stem import WordNetLemmatizer
import pickle


lemmatizer = WordNetLemmatizer()


def load_skill_set2():

    skill_dict = dict()
    pick_file = open("final_skill_dict_2.pickle", "rb")
    skill_dict = pickle.load(pick_file)
    pick_file.close()
    return skill_dict


def get_skill2(skill_dic,topic):
    return skill_dic[ lemmatizer.lemmatize(topic.strip()) ]


# skill_dict = make_skill_dict()
skill_dict2 = load_skill_set2()

print(get_skill2(skill_dict2,'procurement management'))



facilitation skills


In [60]:

print(len(vo_set))

544


In [61]:
new_set = set()
for i in vo_set:
    if i in skill_dict2:
        new_set.add(get_skill2(skill_dict2,i))

len(new_set)

97

In [62]:
print('--------------NON- TECHNICAL SKILLS---------------')
for word in new_set:
    print(word)
my_list2 = list(new_set)

--------------NON- TECHNICAL SKILLS---------------
communication skills
scope management
emotional intelligence
marketing communications
sales pipeline management
negotiation
technical skills
cultural competence
sales operations
visual abilities
core project management skills
benefits management
quality management
procurement management
speaking skills
sales management
information modeling & design
selling skills
networking
program management skills
core executive leadership skills
basic leadership skills
risk management
executive management
core management skills
creative skills
problem solving
time management
entrepreneurial skills
technology (for it project management)
idea formation
program lifecycle
management
market research
targeted communication
human resources management
sales strategy
brand management
writing
persuasion techniques
influencing to negotiate
using emotions
decision making
hard bargaining
sale
influencing
human resources
managerial finance
finance
information vis

In [63]:
import csv
with open('nontech2.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writerow(my_list2)

In [64]:
# trying out reading just the extract for the technical skills
import os
files = [os.path.join('testing', fi) for fi in os.listdir('testing')]
my_text = []
from nltk.tokenize import word_tokenize
for fil in files:
    with open(fil) as fi:
            line = fi.read()
            temp= word_tokenize(line)

In [65]:
temp2 = list(temp)
my_text =  [w.lower() for w in temp2 if not w.lower() in stop]

In [66]:
my_text

['determination',
 'vocational',
 'fields',
 'machine',
 'learning',
 'algorithm',
 'importance',
 'vocational',
 'technical',
 'training',
 'growing',
 'day',
 'day',
 'parallel',
 'developing',
 'technology',
 'inevitable',
 'utilise',
 'opportunities',
 'presented',
 'information',
 'communication',
 'technologies',
 'order',
 'determine',
 'vocational',
 'fields',
 'vocational',
 'technical',
 'training',
 'efficient',
 'manner',
 'respect',
 'possible',
 'create',
 'efficient',
 'tool',
 'compared',
 'current',
 'methods',
 'utilising',
 'machine',
 'learning',
 'artificial',
 'intelligence',
 'model',
 'energy',
 'applications',
 'predicts',
 'events',
 'future',
 'depending',
 'past',
 'experiences',
 'current',
 'study',
 'software',
 'developed',
 'ensures',
 'system',
 'learns',
 'successful',
 'unsuccessful',
 'choices',
 'made',
 'past',
 'applying',
 '“',
 'naive',
 'bayes',
 '”',
 'algorithm',
 'machine',
 'learning',
 'algorithm',
 'data',
 'collected',
 'concerning',
 '

In [67]:
q_set = set()
for q in my_text:
    if q in skill_dict:
        veu = get_skills(skill_dict,q)
        for qw in veu:
            q_set.add(qw)

In [68]:
for qw in q_set:
    print(qw)

communication skills
emotional intelligence
creative direction
digital business strategy
communications to stakeholders
managing emotions
applying economic models
listening.
visual expression of concepts
publicity
marketing communications
intuition
creative questioning
flow charts
graphs
creating and maintaining a high performance culture
project metric development and communications
effective greetings
friendliness
technology savvy
interviewing
whiteboarding
communications
targeting information to an audience
statistical analysis
negotiation
empathy
developing presentations
timelines
using project management tools
designing graphics
community engagement
quick-wittedness
reporting & communication
communications to executive leadership
communication plans
cultural competence
managing a portfolio of it investments
research
applying data to decisions
managing training
visualization standards
fluid intelligence
marketing intelligence management
selective listening
technology trend awarenes

In [69]:
len(q_set)

256

In [70]:
qq_set = set()
for i in q_set:
    qq_set.add(get_skill2(skill_dict2,i))

In [71]:
temp = []
for ele in qq_set:
    temp = temp + (word_tokenize(ele))

In [95]:
dict123 = {}
my_final = {}
my_final['others'] = set()
print(temp)
for i in temp:
    if i in dict123:
        dict123[i] = dict123[i]+1
    else:
        dict123[i] = 1
for ele in qq_set:
    w = word_tokenize(ele)
    for i in w:
        if dict123[i]>5:
            if i in my_final:
                my_final[i].add(ele)
            else:
                my_final[i] = set()
                my_final[i].add(ele)
        if dict123[i]<2:
            my_final['others'].add(ele)
len(qq_set)
len(my_final)

['communication', 'skills', 'presenting', 'visual', 'information', 'cost', 'management', 'scope', 'management', 'emotional', 'intelligence', 'finance', 'information', 'visualization', 'marketing', 'communications', 'business', '&', 'product', 'development', 'strategic', 'program', 'management', 'core', 'executive', 'leadership', 'skills', 'managing', 'visual', 'information', 'sales', 'pipeline', 'management', 'risk', 'management', 'executive', 'management', 'core', 'management', 'skills', 'presentation', 'skills', 'soft', 'skills', 'applying', 'innovation', 'techniques', 'technical', 'skills', 'stakeholder', 'management', 'marketing', 'strategy', '&', 'planning', 'time', 'management', 'entrepreneurial', 'skills', 'project', 'reporting', '&', 'communication', 'technology', '(', 'for', 'it', 'project', 'management', ')', 'change', 'management', 'professional', 'communication', 'business', 'execution', 'cultural', 'competence', 'sales', 'operations', 'idea', 'formation', 'program', 'lifec

4

In [96]:
my_final

{'communication': {'communication',
  'communication skills',
  'leadership of visual communication',
  'professional communication',
  'reporting & communication',
  'targeted communication',
  'visual communication'},
 'management': {'brand management',
  'change management',
  'core management skills',
  'cost management',
  'executive management',
  'human resources management',
  'integration management',
  'management',
  'procurement management',
  'risk management',
  'sales management',
  'sales pipeline management',
  'scope management',
  'stakeholder management',
  'strategic program management',
  'technology (for it project management)',
  'time management'},
 'others': {'applying innovation techniques',
  'brand management',
  'business & product development',
  'business execution',
  'change management',
  'cost management',
  'cultural competence',
  'decision making',
  'emotional intelligence',
  'entrepreneurial skills',
  'finance',
  'governance',
  'human resour

In [97]:
### --------    Anupams dictionary  --------  ###


# from nltk.stem import WordNetLemmatizer
# import pickle
# import xlrd


# lemmatizer = WordNetLemmatizer()

# # print(sheet.cell_value(0,0))


# def make_skill_dict_final():

#     file = ("test_files/skills_anupam.xlsx")

#     wb = xlrd.open_workbook(file)
#     sheet = wb.sheet_by_index(0)

#     # print(len(lines))
#     skill_dict = {}

#     root_skill = ''
#     for i in range(sheet.nrows):
#         if sheet.cell_value(i,0) != '':
#             root_skill = lemmatizer.lemmatize(sheet.cell_value(i, 0).strip().lower())

#         for j in range(sheet.ncols):

#             if sheet.cell_value(i, j) != '':
#                 key = lemmatizer.lemmatize(sheet.cell_value(i, j).strip().lower())

#                 if key in skill_dict:
#                     skill_dict[key].add(root_skill)
#                 else:
#                     skill_dict[key] = {root_skill}

#     print("printing dict.......")
#     for key in skill_dict:
#         print(str(key).ljust(40) + str(skill_dict[key]).rjust(40))

#     pick_file = open("anupam_skill_dict.pickle","wb")
#     pickle.dump(skill_dict, pick_file)
#     pick_file.close()

#     return skill_dict


# def load_skill_set_final():

#     pick_file = open("anupam_skill_dict.pickle", "rb")
#     skill_dict = pickle.load(pick_file)
#     pick_file.close()
#     return skill_dict


# def get_skill_final(skill_dic,topic):
#     topic = lemmatizer.lemmatize(topic.strip().lower())
#     if topic in skill_dic:
#         return skill_dic[topic]
#     else:
#         return None


# # skill_dict = make_skill_dict()
# skill_dict_final = load_skill_set()

# print(get_skill_final(skill_dict_final,'motivate'))
# # while True:
# #     inp = input("topic:")
# #     print(get_skill(skill_dict,inp))

In [98]:
print('----------------Technical skills---------------')
for zzz in dore:
    print(zzz)
print()
print('----------------Technical skills---------------')
print()

for i in my_final:
    
    print('--------------------------------')
    print('             --', i , '--')
    print('---------------------------------')
    for j in my_final[i]:
        print(j, ',')
        

----------------Technical skills---------------
Machine learning 
 Machine learning algorithms 
 Support vector machines 

----------------Technical skills---------------

--------------------------------
             -- others --
---------------------------------
presenting visual information ,
cost management ,
scope management ,
emotional intelligence ,
finance ,
information visualization ,
marketing communications ,
business & product development ,
strategic program management ,
managing visual information ,
sales pipeline management ,
applying innovation techniques ,
risk management ,
presentation skills ,
soft skills ,
technical skills ,
stakeholder management ,
marketing strategy & planning ,
time management ,
entrepreneurial skills ,
reporting & communication ,
technology (for it project management) ,
business execution ,
cultural competence ,
sales operations ,
idea formation ,
program lifecycle ,
targeted communication ,
market research ,
interpersonal skills ,
human resource