In [1]:
import pandas as pd

In [2]:
df = pd.read_json("./sample.json", lines=True)

In [3]:
df.head(5)


Unnamed: 0,id,title,min_amount,max_amount,full_description
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien..."
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...


In [4]:
def extract_surrounding_words(text, keyword, num_words=3):
    words = text.split()
    
    keyword_indices = [i for i, word in enumerate(words) if keyword in word]

    surrounding_words = []
    for index in keyword_indices:
        start = max(0, index - num_words) # Ensure start index isn't negative
        end = min(len(words), index + num_words + 1) # Ensure end index doesn't exceed list bounds
        context = words[start:index] + words[index+1:end] # Exclude the keyword itself
        surrounding_words.append(context)
    
    return surrounding_words

text = "With over four years of industry experience, our team has the knowledge and experience to handle any project."

surrounding_words = extract_surrounding_words(text, "experience")
surrounding_words

[['years', 'of', 'industry', 'our', 'team', 'has'],
 ['the', 'knowledge', 'and', 'to', 'handle', 'any']]

In [5]:
df_d = df['full_description']
df_d.head(5)

0    Senior .NET/C# Developer (Open to Remote) Peng...
1    REMOTE - Sr. Software Test Engineer The Softwa...
2    Golang Developer Overall, 6-10Go Lang experien...
3    Software Development Engineer In Test (SEDT) *...
4    Outlook Add-In (C#) Developer **Requisition Nu...
Name: full_description, dtype: object

In [6]:
for i in range(20):
    print(i)
    print(df_d[i])
    print()

0
Senior .NET/C# Developer (Open to Remote) Penguin Random House is on the lookout for a seasoned .NET/C# Developer to join our Data Warehousing and Business Intelligence team. We are seeking a talented individual who can contribute to the maintenance and enhancement of our Azure-hosted web applications. These critical applications are instrumental for our internal users, providing insights that influence pivotal business decisions, streamline operations, and bolster productivity.  As a key member of our team, the successful candidate will not only maintain and refine existing applications but also take charge of crafting new features and entire applications in response to the evolving needs of the business. Collaborating closely with colleagues and business stakeholders, the Developer will play an integral role in requirements gathering and delivering iterative solutions throughout the lifecycle of various projects.  This role offers the exciting opportunity to stay at the forefront o

In [7]:
import re

def extract_experience_numbers_with_words_completed(text):
    # Mapping of numeric words to their numeric values
    numeric_words = {
        "one": "1", "two": "2", "three": "3", "four": "4", "five": "5",
        "six": "6", "seven": "7", "eight": "8", "nine": "9", "ten": "10",
        "eleven": "11", "twelve": "12", "thirteen": "13", "fourteen": "14",
        "fifteen": "15", "sixteen": "16", "seventeen": "17", "eighteen": "18",
        "nineteen": "19", "twenty": "20"
    }

    # Combine numeric words and digits into a single regex pattern
    number_pattern = re.compile(r'\b(\d+\+?|' + '|'.join(numeric_words.keys()) + r')\b', re.IGNORECASE)
    
    # Tokenize the text into words
    words = text.split()
    
    # Find all occurrences of numeric expressions (digits or numeric words)
    numbers = []
    for i, w in enumerate(words):
        if number_pattern.match(w):
            # Convert numeric words to their corresponding digits
            num = w.lower()
            num = numeric_words.get(num, w)  # Keep the original if it's a digit
            numbers.append((num, i))
    
    # Check for 'experience' or 'years' within 4 words before or after the number
    experience_related_numbers = []
    full = []
    for number, index in numbers:
        # Define the search window
        start = max(0, index - 3)
        end = min(len(words), index + 8)  # +5 because range end is exclusive
        window = words[start:end]
        
        # Check if 'experience' or 'years' is in the window
        if 'experience' in window and any(keyword in window for keyword in ['years', 'year']):
                    full_window = " ".join(window)  # Convert window back to string
                    full.append(full_window)
                    experience_related_numbers.append(number)

    
    highest_value = process_extracted_numbers(experience_related_numbers)
    return  highest_value, experience_related_numbers, full


def process_extracted_numbers(experience_related_numbers):
    processed_numbers = []
    for number in experience_related_numbers:
        # Remove '+' signs
        number = number.replace('+', '')
        # Convert ranges to their lowest value
        if '-' in number:
            number = number.split('-')[0]
        # Convert to integer for comparison
        try:
            processed_number = int(number)
            processed_numbers.append(processed_number)
        except ValueError:
            # Handle the case where the number can't be converted to int
            continue

    # Find the highest value
    if processed_numbers:
        highest_value = max(processed_numbers)
    else:
        highest_value = None

    return highest_value


In [8]:
highest_experience_years = []
for i in range(len(df)):
    high, exp, full = extract_experience_numbers_with_words_completed(df_d[i])
    #print("{} {} ----- {}".format(high, exp, full))
    highest_experience_years.append(high)

df['experience'] = highest_experience_years
df.head(5)

Unnamed: 0,id,title,min_amount,max_amount,full_description,experience
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...,5.0
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...,3.0
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien...",2.0
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...,
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...,2.0


In [9]:
print(df['title'])

0              Senior .NET/C# Developer (Open to Remote)
1                    REMOTE - Sr. Software Test Engineer
2                                       Golang Developer
3           Software Development Engineer In Test (SEDT)
4                          Outlook Add-In (C#) Developer
                             ...                        
420    Software Engineer (React Native Mobile Developer)
421                     Software Developer (Hybrid role)
422                               Field Service Engineer
423    Software Engineer, PLC, SCADA, WinCC OA  Dalla...
424                         Automation Software Engineer
Name: title, Length: 425, dtype: object


In [10]:
senior_title = []
for i, row in df.iterrows():
    if any(keyword in row['title'].lower() for keyword in ['senior', 'sr ', 'sr.', 'senior.']):
        senior_title.append(1)
    else:
        senior_title.append(0)

df['senior_title'] = senior_title
df.head(5)

Unnamed: 0,id,title,min_amount,max_amount,full_description,experience,senior_title
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...,5.0,1
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...,3.0,1
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien...",2.0,0
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...,,0
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...,2.0,0


In [11]:
junior_title = []
for i, row in df.iterrows():
    if any(keyword in row['title'].lower() for keyword in ['junior', 'jr ', 'jr.', 'junior.']):
        junior_title.append(1)
    else:
        junior_title.append(0)

df['junior_title'] = junior_title
df.head(5)

Unnamed: 0,id,title,min_amount,max_amount,full_description,experience,senior_title,junior_title
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...,5.0,1,0
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...,3.0,1,0
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien...",2.0,0,0
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...,,0,0
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...,2.0,0,0


In [12]:
entry_title = []
for i, row in df.iterrows():
    if any(keyword in row['title'].lower() for keyword in ['entry level', 'entry-level']):
        entry_title.append(1)
    else:
        entry_title.append(0)

df['entry_title'] = entry_title
df.head(5)

Unnamed: 0,id,title,min_amount,max_amount,full_description,experience,senior_title,junior_title,entry_title
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...,5.0,1,0,0
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...,3.0,1,0,0
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien...",2.0,0,0,0
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...,,0,0,0
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...,2.0,0,0,0


In [13]:
lead_title = []
for i, row in df.iterrows():
    if any(keyword in row['title'].lower() for keyword in ['lead']):
        lead_title.append(1)
    else:
        lead_title.append(0)

df['lead_title'] = lead_title
df.head(5)

Unnamed: 0,id,title,min_amount,max_amount,full_description,experience,senior_title,junior_title,entry_title,lead_title
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...,5.0,1,0,0,0
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...,3.0,1,0,0,0
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien...",2.0,0,0,0,0
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...,,0,0,0,0
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...,2.0,0,0,0,0


In [14]:
mid_title = []
for i, row in df.iterrows():
    if any(keyword in row['title'].lower() for keyword in ['mid level', 'middle level', 'mid-level', 'mid', 'middle-level']):
        mid_title.append(1)
    else:
        mid_title.append(0)

df['mid_title'] = mid_title
df.head(5)

Unnamed: 0,id,title,min_amount,max_amount,full_description,experience,senior_title,junior_title,entry_title,lead_title,mid_title
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...,5.0,1,0,0,0,0
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...,3.0,1,0,0,0,0
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien...",2.0,0,0,0,0,0
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...,,0,0,0,0,0
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...,2.0,0,0,0,0,0


In [15]:
mid_title = []
for i, row in df.iterrows():
    if any(keyword in row['full_description'].lower() for keyword in ['mid level', 'middle level', 'mid-level', 'mid', 'middle-level']):
        mid_title.append(1)
    else:
        mid_title.append(0)

df['mid_desc'] = mid_title
df.head(5)

Unnamed: 0,id,title,min_amount,max_amount,full_description,experience,senior_title,junior_title,entry_title,lead_title,mid_title,mid_desc
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...,5.0,1,0,0,0,0,0
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...,3.0,1,0,0,0,0,0
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien...",2.0,0,0,0,0,0,0
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...,,0,0,0,0,0,0
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...,2.0,0,0,0,0,0,0


In [16]:
entry_title = []
for i, row in df.iterrows():
    if any(keyword in row['full_description'].lower() for keyword in ['entry level', 'entry-level']):
        entry_title.append(1)
    else:
        entry_title.append(0)

df['entry_desc'] = entry_title
df.head(5)

Unnamed: 0,id,title,min_amount,max_amount,full_description,experience,senior_title,junior_title,entry_title,lead_title,mid_title,mid_desc,entry_desc
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...,5.0,1,0,0,0,0,0,0
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...,3.0,1,0,0,0,0,0,0
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien...",2.0,0,0,0,0,0,0,0
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...,,0,0,0,0,0,0,0
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...,2.0,0,0,0,0,0,0,0


In [17]:
junior_title = []
for i, row in df.iterrows():
    if any(keyword in row['full_description'].lower() for keyword in ['junior', 'jr ', 'jr.', 'junior.']):
        junior_title.append(1)
    else:
        junior_title.append(0)

df['junior_desc'] = junior_title
df.head(5)

Unnamed: 0,id,title,min_amount,max_amount,full_description,experience,senior_title,junior_title,entry_title,lead_title,mid_title,mid_desc,entry_desc,junior_desc
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...,5.0,1,0,0,0,0,0,0,0
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...,3.0,1,0,0,0,0,0,0,0
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien...",2.0,0,0,0,0,0,0,0,0
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...,,0,0,0,0,0,0,0,0
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...,2.0,0,0,0,0,0,0,0,0


In [18]:
mid_title = []
for i, row in df.iterrows():
    if any(keyword in row['title'].lower() for keyword in ['new graduate', 'new grad']):
        mid_title.append(1)
    else:
        mid_title.append(0)

df['grad_title'] = mid_title
df.head(5)

Unnamed: 0,id,title,min_amount,max_amount,full_description,experience,senior_title,junior_title,entry_title,lead_title,mid_title,mid_desc,entry_desc,junior_desc,grad_title
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...,5.0,1,0,0,0,0,0,0,0,0
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...,3.0,1,0,0,0,0,0,0,0,0
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien...",2.0,0,0,0,0,0,0,0,0,0
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...,,0,0,0,0,0,0,0,0,0
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...,2.0,0,0,0,0,0,0,0,0,0


In [21]:
mid_title = []
for i, row in df.iterrows():
    if any(keyword in row['full_description'].lower() for keyword in ['new graduate', 'new grad', 'newly graduated', 'recent grad', 'recent graduate', 'recently graduated', 'graduating']):
        mid_title.append(1)
    else:
        mid_title.append(0)

df['grad_desc'] = mid_title
df.head(5)

Unnamed: 0,id,title,min_amount,max_amount,full_description,experience,senior_title,junior_title,entry_title,lead_title,mid_title,mid_desc,entry_desc,junior_desc,grad_title,grad_desc
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...,5.0,1,0,0,0,0,0,0,0,0,0
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...,3.0,1,0,0,0,0,0,0,0,0,0
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien...",2.0,0,0,0,0,0,0,0,0,0,0
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...,,0,0,0,0,0,0,0,0,0,0
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...,2.0,0,0,0,0,0,0,0,0,0,0


In [22]:
mid_title = []
for i, row in df.iterrows():
    if any(keyword in row['full_description'].lower() for keyword in ['manager']):
        mid_title.append(1)
    else:
        mid_title.append(0)

df['manageer_title'] = mid_title
df.head(5)

Unnamed: 0,id,title,min_amount,max_amount,full_description,experience,senior_title,junior_title,entry_title,lead_title,mid_title,mid_desc,entry_desc,junior_desc,grad_title,grad_desc,manageer_title
0,0,Senior .NET/C# Developer (Open to Remote),130000.0,135000.0,Senior .NET/C# Developer (Open to Remote) Peng...,5.0,1,0,0,0,0,0,0,0,0,0,0
1,1,REMOTE - Sr. Software Test Engineer,,,REMOTE - Sr. Software Test Engineer The Softwa...,3.0,1,0,0,0,0,0,0,0,0,0,0
2,2,Golang Developer,110727.0,135327.0,"Golang Developer Overall, 6-10Go Lang experien...",2.0,0,0,0,0,0,0,0,0,0,0,0
3,3,Software Development Engineer In Test (SEDT),,,Software Development Engineer In Test (SEDT) *...,,0,0,0,0,0,0,0,0,0,0,0
4,4,Outlook Add-In (C#) Developer,50573.0,79932.0,Outlook Add-In (C#) Developer **Requisition Nu...,2.0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
df.to_json('./input.json', orient='records', lines=True)

In [36]:
pip install transformers torch


Note: you may need to restart the kernel to use updated packages.


In [25]:
from transformers import BloomForCausalLM, BloomTokenizerFast
import torch

def setup_model(model_name="bigscience/bloom-560m"):
    tokenizer = BloomTokenizerFast.from_pretrained(model_name)
    model = BloomForCausalLM.from_pretrained(model_name)  # Default is float32
    if torch.cuda.is_available():
        model.cuda()  # Transfer model to GPU
    return model, tokenizer

def ask_question(model, tokenizer, question, context, max_length=150):
    prompt = f"Answer the following question based on the context provided.\nContext: {context}\nQuestion: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
    if torch.cuda.is_available():
        inputs = inputs.to("cuda")  # Move input to GPU

    outputs = model.generate(**inputs, max_length=max_length, num_return_sequences=1)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.replace(prompt, '')  # Clean up the output to remove the prompt text

# Example usage
model, tokenizer = setup_model()


In [8]:
context = "The dog was red and had a head but he could not tread"
question = "What color was the dog?"
answer = ask_question(model, tokenizer, question, context)
print(answer)


 The dog was red and had a head but he could not tread. The dog was red and had a head but he could not tread. The dog was red and had a head but he could not tread. The dog was red and had a head but he could not tread. The dog was red and had a head but he could not tread. The dog was red and had a head but he could not tread. The dog was red and had a head but he could not tread. The dog was red and had a head but


In [None]:
ans = []

for i in range(5):
    input = "Give me a one word answer. Please return the degree type necessary for the job. I have provided the job description as well. If there is no requirement please say that, but if it is unclear please return unknown: " + df.iloc[i]['full_description']
    inputs = tokenizer.encode(input, return_tensors="pt")
    outputs = model.generate(inputs, max_length=1000)
    print(tokenizer.decode(outputs[0]))

