Step 1: Setup and Read Data

In [None]:
import spacy
import re
import nltk
from collections import Counter
import os
import glob

In [None]:
# Colab-only: mount Google Drive at /content/drive; every data path used
# below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load spaCy's small English pipeline (en_core_web_sm).
nlp = spacy.load("en_core_web_sm")

# Read the stop-word file; splitlines() yields one entry per line, and a set
# gives constant-time membership tests.
with open('/content/drive/MyDrive/Natural Language Processing/Assignment5 data/stopwords_en.txt', 'r') as f:
    stopwords = set(f.read().splitlines())

In [None]:
# Function to extract job categories (the sub-directory names of the data folder)
def extract_job_categories(data_folder):
    """Return the names of the immediate sub-directories of ``data_folder``.

    Plain files are ignored. The order is whatever ``os.listdir`` yields.
    """
    return [
        entry
        for entry in os.listdir(data_folder)
        if os.path.isdir(os.path.join(data_folder, entry))
    ]

In [None]:
# Category names are the sub-directory names of the data folder.
job_categories = extract_job_categories('/content/drive/MyDrive/Natural Language Processing/Assignment5 data/data')
print(job_categories)

['Engineering', 'Healthcare_Nursing', 'Sales', 'Accounting_Finance']


In [None]:
def load_job_descriptions(data_folder):
    """Read every ``Job_*.txt`` file found in the category sub-folders.

    Each sub-directory of ``data_folder`` is treated as a category. For every
    matching file one dict is produced with the keys 'Category', 'Title',
    'Webindex', 'Company' and 'Description'; the last four come from
    ``extract_value`` and are ``None`` when the field label is absent.

    Returns:
        A list of those dicts, in directory-listing / glob order.
    """
    job_descriptions = []

    for category in os.listdir(data_folder):
        category_path = os.path.join(data_folder, category)
        # Only directories count as categories; skip stray files.
        if not os.path.isdir(category_path):
            continue

        file_pattern = os.path.join(category_path, "Job_*.txt")
        for job_file in glob.glob(file_pattern):
            with open(job_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # Pull the labelled fields out of the raw text.
            job_descriptions.append({
                'Category': category,
                'Title': extract_value(content, 'Title:'),
                'Webindex': extract_value(content, 'Webindex:'),
                'Company': extract_value(content, 'Company:'),
                'Description': extract_value(content, 'Description:'),
            })

    return job_descriptions

def extract_value(content, key):
    """Return the text that follows ``key`` on the same line, stripped.

    The first occurrence of ``key`` in ``content`` is used and the value runs
    to the next newline (or to the end of ``content`` if there is none).
    Returns ``None`` when ``key`` does not occur.
    """
    key_pos = content.find(key)
    if key_pos < 0:
        return None

    value_start = key_pos + len(key)
    line_end = content.find('\n', value_start)
    if line_end < 0:
        # No newline after the key: the value is the rest of the text.
        return content[value_start:].strip()
    return content[value_start:line_end].strip()

In [None]:
# Load every job advertisement into a list of dicts (one dict per file).
job_descriptions = load_job_descriptions('/content/drive/MyDrive/Natural Language Processing/Assignment5 data/data')

In [None]:
# Preview the first five loaded records (notebook cell output).
job_descriptions[0:5]

[{'Category': 'Engineering',
  'Title': 'Site Maintenance Engineer (ElectroMechanical)',
  'Webindex': '72635560',
  'Company': 'Rise Technical Recruitment',
  'Description': 'Site Maintenance Engineer (ElectroMechanical) Birmingham ****  extensive company benefits Are you a Maintenance Engineer looking for structured training and development within a days based position? On offer is plenty of overtime and the chance to develop and progress within a company with the best reputation in the industry. The company is a leader in manufacturing generators and motors. As a Site Engineer you will be required to work in house on motors and servo drives. You will have the stability of working for a growing company and the opportunity to technically progress. With continuous expansion plans in place, this is a real chance to progress your engineering career and be a part of an exciting future. The role:  Repair of motors  Installation of servo drives  Site repair of machinery The Person:  Service

Step 2: Tokenize each job description

In [None]:
# Function to tokenize each job description
def tokenize_descriptions(descriptions):
    """Replace each job's 'Description' text with its list of word tokens.

    A token is a run of ASCII letters, optionally followed by a single hyphen
    or apostrophe and another run of letters (e.g. ``well-paid``, ``kpi's``).
    Digits and all other characters act as separators.

    Args:
        descriptions: list of job dicts. The dicts are modified in place.

    Returns:
        The same ``descriptions`` list, for convenience.
    """
    for job in descriptions:
        # `or ''` also covers a key that is present but holds None
        # (extract_value returns None when the 'Description:' label is
        # missing); dict.get's default only applies to an absent key.
        text = job.get('Description') or ''
        job['Description'] = re.findall(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?", text)
    return descriptions

In [None]:
# Tokenization happens in place: `tokenized_descriptions` is the same list
# object as `job_descriptions` (the function returns its argument).
tokenized_descriptions = tokenize_descriptions(job_descriptions)
# for tokens in tokenized_descriptions:
#     print(tokens)
# Number of job records (notebook cell output).
len(tokenized_descriptions)

776

In [None]:
# Preview the first five records after tokenization (notebook cell output).
tokenized_descriptions[0:5]

[{'Category': 'Engineering',
  'Title': 'Site Maintenance Engineer (ElectroMechanical)',
  'Webindex': '72635560',
  'Company': 'Rise Technical Recruitment',
  'Description': ['Site',
   'Maintenance',
   'Engineer',
   'ElectroMechanical',
   'Birmingham',
   'extensive',
   'company',
   'benefits',
   'Are',
   'you',
   'a',
   'Maintenance',
   'Engineer',
   'looking',
   'for',
   'structured',
   'training',
   'and',
   'development',
   'within',
   'a',
   'days',
   'based',
   'position',
   'On',
   'offer',
   'is',
   'plenty',
   'of',
   'overtime',
   'and',
   'the',
   'chance',
   'to',
   'develop',
   'and',
   'progress',
   'within',
   'a',
   'company',
   'with',
   'the',
   'best',
   'reputation',
   'in',
   'the',
   'industry',
   'The',
   'company',
   'is',
   'a',
   'leader',
   'in',
   'manufacturing',
   'generators',
   'and',
   'motors',
   'As',
   'a',
   'Site',
   'Engineer',
   'you',
   'will',
   'be',
   'required',
   'to',
   'wor

Step 3: Convert all words to lowercase

In [None]:
# Replace every record's token list with a lowercased copy.
for job in tokenized_descriptions:
    job['Description'] = [word.lower() for word in job['Description']]

In [None]:
# Preview the first five records after lowercasing (notebook cell output).
tokenized_descriptions[0:5]

[{'Category': 'Engineering',
  'Title': 'Site Maintenance Engineer (ElectroMechanical)',
  'Webindex': '72635560',
  'Company': 'Rise Technical Recruitment',
  'Description': ['site',
   'maintenance',
   'engineer',
   'electromechanical',
   'birmingham',
   'extensive',
   'company',
   'benefits',
   'are',
   'you',
   'a',
   'maintenance',
   'engineer',
   'looking',
   'for',
   'structured',
   'training',
   'and',
   'development',
   'within',
   'a',
   'days',
   'based',
   'position',
   'on',
   'offer',
   'is',
   'plenty',
   'of',
   'overtime',
   'and',
   'the',
   'chance',
   'to',
   'develop',
   'and',
   'progress',
   'within',
   'a',
   'company',
   'with',
   'the',
   'best',
   'reputation',
   'in',
   'the',
   'industry',
   'the',
   'company',
   'is',
   'a',
   'leader',
   'in',
   'manufacturing',
   'generators',
   'and',
   'motors',
   'as',
   'a',
   'site',
   'engineer',
   'you',
   'will',
   'be',
   'required',
   'to',
   'wor

In [None]:
def has_uppercase(token_list, size):
    """Print tokens that contain an uppercase letter and return the new tally.

    Args:
        token_list: iterable of string tokens to inspect.
        size: running count to add to.

    Returns:
        ``size`` plus the number of tokens in ``token_list`` that contain at
        least one uppercase character. Each such token is printed once.
    """
    for token in token_list:
        # Count the word once, however many uppercase characters it has.
        if any(char.isupper() for char in token):
            size += 1
            print(token)
    # ints are immutable: incrementing the parameter is invisible to the
    # caller, so the updated tally has to be handed back explicitly.
    return size

In [None]:
# Verify the lowercasing step: print and count every token that still
# contains an uppercase character. The tally is accumulated directly in this
# cell because an int handed to a helper cannot be updated in place by it,
# which is why this report previously always printed 0.
size = 0
for token_list in tokenized_descriptions:
    for token in token_list['Description']:
        if any(char.isupper() for char in token):
            size += 1
            print(token)
print(f"Count of words who have uppercase characters: {size}")

Count of words who have uppercase characters: 0


Step 4: Remove words with length less than 2

In [None]:
# Print every token shorter than 2 characters, then how many there were.
size = 0
for job in tokenized_descriptions:
    for word in job['Description']:
        if len(word) >= 2:
            continue
        print(word)
        size += 1
print(f"Count of words whose length is less than 2 characters: {size}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
a
a
a
k
a
c
c
c
c
a
a
a
a
a
a
a
a
a
h
a
a
h
s
a
a
a
j
a
a
a
a
a
a
a
a
a
d
a
s
a
a
a
a
a
a
a
s
a
a
a
a
a
a
a
a
s
a
s
k
a
a
a
a
a
a
a
a
a
a
a
m
a
a
a
a
a
a
a
a
a
a
a
a
t
a
a
a
a
a
a
a
m
e
k
m
m
e
a
a
a
m
e
d
b
a
a
a
p
h
a
a
a
p
h
a
a
a
a
a
a
a
a
a
a
t
a
a
a
d
a
a
d
a
a
a
a
a
a
a
a
a
a
a
a
c
c
a
a
a
a
k
a
a
a
a
a
a
a
a
k
a
a
a
a
a
a
a
a
a
a
a
a
a
a
p
h
k
a
a
a
k
c
g
c
g
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
s
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
s
a
m
e
a
a
a
a
a
a
a
a
a
a
x
a
a
s
a
a
a
a
a
a
a
a
a
a
a
a
a
i
a
a
a
f
a
m
e
a
k
a
a
a
a
a
a
a
a
a
a
a
a
i
a
a
s
a
a
a
s
a
a
a
a
a
t
s
a
a
a
a
a
a
a
d
a
a
a
s
a
a
a
a
a
a
a
c
k
a
a
a
a
a
a
a
a
a
a
a
a
a
s
a
a
a
m
e
k
m
e
a
a
a
a
a
s
a
a
a
a
a
a
a
a
a
s
a
a
a
a
a
a
a
a
s
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
c
c
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
i
e
a
a
a
x
a
x
x
a
a
a
a
a
a
a
a
a
m
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
s
a
s
s
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
v
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a

In [None]:
# Drop tokens shorter than 2 characters. The filtered list is built first and
# then written back with slice assignment: calling list.remove() while
# iterating the same list shifts the remaining elements left, so the loop
# skips whichever token follows each removal.
for token_list in tokenized_descriptions:
    token_list['Description'][:] = [
        token for token in token_list['Description'] if len(token) >= 2
    ]

In [None]:
# Re-run the short-token report to confirm none are left.
size = 0
for record in tokenized_descriptions:
    for tok in record['Description']:
        if not len(tok) < 2:
            continue
        size = size + 1
        print(tok)
print(f"Count of words whose length is less than 2 characters: {size}")

Count of words whose length is less than 2 characters: 0


In [None]:
# Preview the first five records after dropping one-character tokens.
tokenized_descriptions[0:5]

[{'Category': 'Engineering',
  'Title': 'Site Maintenance Engineer (ElectroMechanical)',
  'Webindex': '72635560',
  'Company': 'Rise Technical Recruitment',
  'Description': ['site',
   'maintenance',
   'engineer',
   'electromechanical',
   'birmingham',
   'extensive',
   'company',
   'benefits',
   'are',
   'you',
   'maintenance',
   'engineer',
   'looking',
   'for',
   'structured',
   'training',
   'and',
   'development',
   'within',
   'days',
   'based',
   'position',
   'on',
   'offer',
   'is',
   'plenty',
   'of',
   'overtime',
   'and',
   'the',
   'chance',
   'to',
   'develop',
   'and',
   'progress',
   'within',
   'company',
   'with',
   'the',
   'best',
   'reputation',
   'in',
   'the',
   'industry',
   'the',
   'company',
   'is',
   'leader',
   'in',
   'manufacturing',
   'generators',
   'and',
   'motors',
   'as',
   'site',
   'engineer',
   'you',
   'will',
   'be',
   'required',
   'to',
   'work',
   'in',
   'house',
   'on',
   'mo

Step 5: Remove Stopwords

In [None]:
def check_stopwords(tokens, stopwords_file):
    """Print every description token that is a stop word, then the total.

    Args:
        tokens: list of job dicts whose 'Description' is a list of tokens.
        stopwords_file: path to a text file; each line is one stop word.

    Returns:
        None. The findings are only printed.
    """
    with open(stopwords_file, 'r') as handle:
        stopword_set = set(handle.read().splitlines())

    hits = 0
    for record in tokens:
        for word in record['Description']:
            if word not in stopword_set:
                continue
            hits += 1
            print("Stopword found:", word)
    print(f"Count of stopwords: {hits}")

In [None]:
# Report the stop words still present before filtering.
stopwords_file = "/content/drive/MyDrive/Natural Language Processing/Assignment5 data/stopwords_en.txt"
check_stopwords(tokenized_descriptions, stopwords_file)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Stopword found: their
Stopword found: have
Stopword found: become
Stopword found: the
Stopword found: of
Stopword found: for
Stopword found: of
Stopword found: and
Stopword found: the
Stopword found: will
Stopword found: be
Stopword found: as
Stopword found: of
Stopword found: us
Stopword found: that
Stopword found: across
Stopword found: the
Stopword found: us
Stopword found: the
Stopword found: also
Stopword found: and
Stopword found: there
Stopword found: will
Stopword found: be
Stopword found: of
Stopword found: and
Stopword found: the
Stopword found: will
Stopword found: and
Stopword found: and
Stopword found: and
Stopword found: from
Stopword found: across
Stopword found: of
Stopword found: with
Stopword found: to
Stopword found: own
Stopword found: and
Stopword found: in
Stopword found: the
Stopword found: and
Stopword found: and
Stopword found: of
Stopword found: the
Stopword found: will
Stopword found: have
Stopw

In [None]:
def remove_stopwords(tokens_list, stopwords_file):
    """Return copies of the job dicts with stop words filtered out.

    Description tokens are lowercased first and then dropped if they appear
    in the stop-word file (one word per line). Each dict is shallow-copied,
    so the input records keep their original token lists.

    Args:
        tokens_list: list of job dicts; 'Description' is a list of tokens.
        stopwords_file: path to the stop-word text file.

    Returns:
        A new list of job dicts.
    """
    with open(stopwords_file, 'r') as handle:
        stopword_set = set(handle.read().splitlines())

    cleaned_records = []
    for record in tokens_list:
        record_copy = dict(record)
        if 'Description' in record_copy:
            lowered = [word.lower() for word in record_copy['Description']]
            record_copy['Description'] = [
                word for word in lowered if word not in stopword_set
            ]
        cleaned_records.append(record_copy)
    return cleaned_records

In [None]:
# Filter stop words; the result is a new list of copied records.
remove_stopwords_description = remove_stopwords(tokenized_descriptions, stopwords_file)

In [None]:
# Re-run the stop-word report on the filtered records as a check.
check_stopwords(remove_stopwords_description, stopwords_file)

Count of stopwords: 0


In [None]:
# Preview the first five records after stop-word removal.
remove_stopwords_description[0:5]

[{'Category': 'Engineering',
  'Title': 'Site Maintenance Engineer (ElectroMechanical)',
  'Webindex': '72635560',
  'Company': 'Rise Technical Recruitment',
  'Description': ['site',
   'maintenance',
   'engineer',
   'electromechanical',
   'birmingham',
   'extensive',
   'company',
   'benefits',
   'maintenance',
   'engineer',
   'structured',
   'training',
   'development',
   'days',
   'based',
   'position',
   'offer',
   'plenty',
   'overtime',
   'chance',
   'develop',
   'progress',
   'company',
   'reputation',
   'industry',
   'company',
   'leader',
   'manufacturing',
   'generators',
   'motors',
   'site',
   'engineer',
   'required',
   'work',
   'house',
   'motors',
   'servo',
   'drives',
   'stability',
   'working',
   'growing',
   'company',
   'opportunity',
   'technically',
   'progress',
   'continuous',
   'expansion',
   'plans',
   'place',
   'real',
   'chance',
   'progress',
   'engineering',
   'career',
   'part',
   'exciting',
   'fut

In [None]:
# Total number of description tokens left after stop-word removal.
size = sum(len(record['Description']) for record in remove_stopwords_description)
print(size)

107161


In [None]:
# Number of job records after stop-word removal (notebook cell output).
len(remove_stopwords_description)

776

Step 6: Remove words that appear only once

In [None]:
from collections import Counter

def find_single_occurrence_words(tokens):
    """Return the words whose total frequency across all descriptions is 1.

    Args:
        tokens: list of job dicts whose 'Description' is a list of tokens.

    Returns:
        A list of such words, in order of first appearance.
    """
    # Tally every description token across the whole collection.
    frequencies = Counter(
        word for record in tokens for word in record['Description']
    )
    return [word for word in frequencies if frequencies[word] == 1]

In [None]:
# Words that occur exactly once across all stop-word-filtered descriptions.
single_occurrence_words = find_single_occurrence_words(remove_stopwords_description)
print("Words that appear only once:", single_occurrence_words)
print("Count of Words that appear only once:", len(single_occurrence_words))

Words that appear only once: ['winder', 'assembling', 'littlehampton', 'expereinced', 'expereince', 'akton', 'cliche', 'atkon', 'litteraly', 'dracup', 'tdracupaktonrecruitment', 'adopts', 'noa', 'muratsubaki', 'grip', 'licenses', 'worldleading', 'aligning', 'oscilloscopes', 'dmm', 'digitaltestengineer', 'fmmedia', 'victims', 'assault', 'violent', 'distressed', 'witnesses', 'transcribe', 'lost', 'standardised', 'nonspecialist', 'negligible', 'highvolume', 'instruction', 'escalates', 'spends', 'comcats', 'hants', 'berks', 'maker', 'tandem', 'predetermined', 'strip', 'regrind', 'setters', 'converse', 'wheels', 'tips', 'die', 'polishing', 'rerefurbishment', 'presses', 'shadowgraph', 'projector', 'wasting', 'wouldn', 'unhappy', 'turners', 'universal', 'ntd', 'joiners', 'eng', 'seniormanufacturingengineer', 'diagnostics', 'temptesttechnician', 'hoist', 'veterinary', 'persistent', 'shaped', 'flagship', 'cfd', 'acis', 'winchfield', 'detector', 'enquires', 'detectors', 'checkweighers', 'rpo', '

In [None]:
def remove_single_occurrence_words(tokens_list):
    """Return copies of the job dicts without words that occur only once.

    Word frequencies are counted over the 'Description' tokens of every
    record in ``tokens_list`` itself, so the function does not depend on any
    module-level variable. Each dict is shallow-copied and gets a new token
    list; the input records are left untouched.

    Args:
        tokens_list: list of job dicts; 'Description' is a list of tokens.

    Returns:
        A new list of job dicts (records without 'Description' are copied
        through unchanged).
    """
    word_counts = Counter()
    for record in tokens_list:
        word_counts.update(record.get('Description', ()))
    # A set keeps the per-token membership test constant-time.
    rare_words = {word for word, count in word_counts.items() if count == 1}

    filtered_tokens_list = []
    for record in tokens_list:
        filtered_tokens = dict(record)
        if 'Description' in filtered_tokens:
            filtered_tokens['Description'] = [
                token for token in filtered_tokens['Description']
                if token not in rare_words
            ]
        filtered_tokens_list.append(filtered_tokens)
    return filtered_tokens_list

In [None]:
# Drop the once-only words from the stop-word-filtered records.
remove_single_occurrence_words_description = remove_single_occurrence_words(remove_stopwords_description)

In [None]:
# Re-run the single-occurrence scan on the filtered records as a check.
single_occurrence_word_present = find_single_occurrence_words(remove_single_occurrence_words_description)
print("Words that appear only once:", single_occurrence_word_present)
print("Count of Words that appear only once:", len(single_occurrence_word_present))

Words that appear only once: []
Count of Words that appear only once: 0


In [None]:
# Total number of description tokens left after removing once-only words.
size = sum(
    len(record['Description'])
    for record in remove_single_occurrence_words_description
)
print(size)

102975


In [None]:
# Number of job records after removing once-only words (cell output).
len(remove_single_occurrence_words_description)

776

Step 7: Remove the top 50 most frequent words

In [None]:
from collections import Counter

def find_top_frequent_words(tokens, n):
    """Return the ``n`` most frequent description words, most common first.

    Args:
        tokens: list of job dicts whose 'Description' is a list of tokens.
        n: how many words to return.

    Returns:
        A list of words ordered by descending frequency, as ranked by
        ``Counter.most_common``.
    """
    # Tally every description token across the whole collection.
    frequencies = Counter(
        word for record in tokens for word in record['Description']
    )
    ranked = frequencies.most_common(n)
    return [pair[0] for pair in ranked]

In [None]:
# The 50 most frequent words after the once-only words were removed.
top_50_occurrence_words = find_top_frequent_words(remove_single_occurrence_words_description,50)
print("Top 50 most occurrance Words:", top_50_occurrence_words)
print("Count of top 50 most occurrance Words:", len(top_50_occurrence_words))

Top 50 most occurrance Words: ['experience', 'sales', 'role', 'work', 'business', 'team', 'working', 'job', 'care', 'skills', 'company', 'client', 'management', 'manager', 'support', 'uk', 'service', 'excellent', 'development', 'required', 'based', 'opportunity', 'services', 'knowledge', 'apply', 'successful', 'training', 'design', 'engineering', 'recruitment', 'customer', 'salary', 'candidate', 'clients', 'high', 'join', 'ability', 'strong', 'provide', 'home', 'ensure', 'leading', 'including', 'engineer', 'financial', 'good', 'staff', 'position', 'systems', 'full']
Count of top 50 most occurrance Words: 50


In [None]:
def remove_top_frequent_words(tokens_list, n):
    """Return copies of the job dicts without the ``n`` most frequent words.

    The ``n`` most common words are determined from the 'Description' tokens
    of ``tokens_list`` itself, so ``n`` is honoured and no module-level
    variable is needed. Each dict is shallow-copied and gets a new token
    list; the input records are left untouched.

    Args:
        tokens_list: list of job dicts; 'Description' is a list of tokens.
        n: number of top-ranked words to remove.

    Returns:
        A new list with one dict per input record. Records without a
        'Description' key are copied through unchanged rather than dropped,
        matching the other filter functions in this file.
    """
    word_counts = Counter()
    for record in tokens_list:
        word_counts.update(record.get('Description', ()))
    # A set keeps the per-token membership test constant-time.
    frequent_words = {word for word, _ in word_counts.most_common(n)}

    filtered_tokens_list = []
    for record in tokens_list:
        filtered_tokens = dict(record)
        if 'Description' in filtered_tokens:
            filtered_tokens['Description'] = [
                token for token in filtered_tokens['Description']
                if token not in frequent_words
            ]
        filtered_tokens_list.append(filtered_tokens)

    return filtered_tokens_list

In [None]:
# Drop the 50 most frequent words from every description.
remove_top_50_frequent_words_description = remove_top_frequent_words(remove_single_occurrence_words_description, 50)

In [None]:
# Total number of description tokens left after the top-50 removal.
size = sum(
    len(record['Description'])
    for record in remove_top_50_frequent_words_description
)
print(f"Count of words after removing top 50 frequent words: {size}")

Count of words after removing top 50 frequent words: 80068


In [None]:
# Preview the first three fully cleaned records (notebook cell output).
remove_top_50_frequent_words_description[0:3]

[{'Category': 'Engineering',
  'Title': 'Site Maintenance Engineer (ElectroMechanical)',
  'Webindex': '72635560',
  'Company': 'Rise Technical Recruitment',
  'Description': ['site',
   'maintenance',
   'electromechanical',
   'birmingham',
   'extensive',
   'benefits',
   'maintenance',
   'structured',
   'days',
   'offer',
   'plenty',
   'overtime',
   'chance',
   'develop',
   'progress',
   'reputation',
   'industry',
   'leader',
   'manufacturing',
   'generators',
   'motors',
   'site',
   'house',
   'motors',
   'servo',
   'drives',
   'stability',
   'growing',
   'technically',
   'progress',
   'continuous',
   'expansion',
   'plans',
   'place',
   'real',
   'chance',
   'progress',
   'career',
   'part',
   'exciting',
   'future',
   'repair',
   'motors',
   'installation',
   'servo',
   'drives',
   'site',
   'repair',
   'machinery',
   'person',
   'installation',
   'electronic',
   'electrical',
   'speed',
   'drives',
   'motors',
   'minimum',
   

In [None]:
# Number of job records after the top-50 removal (notebook cell output).
len(remove_top_50_frequent_words_description)

776

Step 8: Save all job advertisement text

In [None]:
import json

def convert_descriptions_to_sentence(
    data,
    output_path='/content/drive/MyDrive/Natural Language Processing/Assignment5_Solution/preprocessed_ads.json',
):
    """Join each record's description tokens into one string and save as JSON.

    Args:
        data: list of job dicts. Every dict that has a 'Description' key is
            modified IN PLACE: a token list is replaced by its tokens joined
            with single spaces. A value that is already a string is left
            alone, so calling this twice (e.g. re-running the cell) does not
            split the text into single characters.
        output_path: destination JSON file. Defaults to the location that
            used to be hard-coded, so existing calls behave as before.

    Returns:
        None. The result is written to ``output_path``.
    """
    processed_data = []
    for item in data:
        if 'Description' in item and not isinstance(item['Description'], str):
            # NOTE: the in-place update is kept on purpose; later code in
            # this notebook passes the same records on and reads
            # 'Description' as text.
            item['Description'] = ' '.join(item['Description'])
        processed_data.append(item)

    with open(output_path, 'w') as outfile:
        json.dump(processed_data, outfile)

In [None]:
# Join the tokens back into text and write the records to the JSON file.
# NOTE: this rewrites 'Description' inside the passed records in place.
convert_descriptions_to_sentence(remove_top_50_frequent_words_description)

In [None]:
# Read the saved file back to inspect what was written.
json_file = '/content/drive/MyDrive/Natural Language Processing/Assignment5_Solution/preprocessed_ads.json'
with open(json_file, 'r') as file:
    preprocessed_ads = json.load(file)  # Load JSON data from the file

In [None]:
# Spot-check one reloaded record (notebook cell output).
preprocessed_ads[500]

{'Category': 'Sales',
 'Title': 'Territory Manager',
 'Webindex': '72672874',
 'Company': 'BMS Sales Specialists LLP',
 'Description': "territory diy products basic bonus car mobile phone pension scheme brand leader manufacturer diy products experiencing period growth territory covering north east region brand award winning marketing highly ethical approach offer genuinely secure exciting career individual selling independent diy retailers merchants interior stores shops liaise store level owners daily basis focus point sale monthly promotions product introduction merchandising diary planning important build relationships boost presence stores covering newcastle darlington middlesbrough sunderland areas complete autonomy manage patch feel located region corridor set realistic targets kpi's whilst rewarded financially results person ideal structured organised approach territory personable enthusiastic manner minimum account fmcg diy hardware construction sector north east territories co

In [None]:
def save_job_categories(job_categories, file_path):
    """Write the category names to ``file_path``, separated by newlines.

    No newline is written after the last name.
    """
    with open(file_path, 'w') as out:
        contents = '\n'.join(job_categories)
        out.write(contents)

In [None]:
# Persist the category names next to the preprocessed advertisements.
save_job_categories(job_categories,'/content/drive/MyDrive/Natural Language Processing/Assignment5_Solution/job_categories.txt')

Step 9: Build vocabulary of the cleaned job advertisement descriptions

In [None]:
def build_vocabulary(tokens_list):
    """Return the alphabetically sorted list of distinct description words.

    Args:
        tokens_list: list of job dicts. 'Description' may be a string, which
            is split with the same letters/hyphen/apostrophe pattern used for
            tokenizing, or an iterable of ready-made tokens, which is used as
            is. A missing or ``None`` description contributes nothing.

    Returns:
        A sorted list of the unique words.
    """
    token_pattern = re.compile(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?")
    # Collect into a set: testing membership against a growing list makes
    # the build quadratic in the vocabulary size.
    vocabulary = set()
    for job in tokens_list:
        description = job.get('Description') or ''
        if isinstance(description, str):
            vocabulary.update(token_pattern.findall(description))
        else:
            vocabulary.update(description)
    return sorted(vocabulary)

In [None]:
def save_vocabulary(vocabulary, file_path):
    """Write the vocabulary to ``file_path`` as ``word:index`` lines.

    The index is the word's zero-based position in ``vocabulary``; every
    line, including the last, ends with a newline.
    """
    with open(file_path, 'w') as out:
        out.writelines(
            f"{word}:{index}\n" for index, word in enumerate(vocabulary)
        )

In [None]:
# Build vocabulary
# By this point 'Description' in these records holds a space-joined string
# (convert_descriptions_to_sentence rewrote the records in place), which
# build_vocabulary splits back into words.
vocabulary = build_vocabulary(remove_top_50_frequent_words_description)
print(vocabulary[0:10])
print(len(vocabulary))

['aap', 'aaron', 'aat', 'abb', 'abenefit', 'aberdeen', 'abi', 'abilities', 'abreast', 'abroad']
5168


In [None]:
# Persist the vocabulary as "word:index" lines.
save_vocabulary(vocabulary, '/content/drive/MyDrive/Natural Language Processing/Assignment5_Solution/vocab.txt')