In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random
# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

In [None]:
#Load the Model
# Tokenizer: the stock lower-casing BERT vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Classifier weights are resolved by from_pretrained from "001Model"
# (presumably a local directory holding the fine-tuned checkpoint — TODO confirm).
model = BertForSequenceClassification.from_pretrained("001Model")
# Move the parameters onto the device selected in the first cell.
model = model.to(device)

def preprocessing(input_text, tokenizer, max_length=512):
    """Encode one text into fixed-length model inputs.

    Args:
        input_text: The string to tokenize.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_length: Length every encoded sequence is padded or truncated to.
            Defaults to 512, the value this notebook has always used.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns; the callers in this
        notebook read its ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=max_length,
        # `pad_to_max_length=True` is deprecated; this is its documented replacement.
        padding='max_length',
        # Request truncation explicitly instead of relying on the library's
        # backward-compatibility fallback for a bare `max_length`.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

In [None]:
#Test for one description

# Sample job posting used as a smoke test for the loaded classifier.
new_sentence = '''
Part Time Assistant Manager 8055 W Bowles Ave  Store 0013  Tuesday Morning  Littleton, CO Tuesday Morning    Job Company  Job details  Salary 14.35  23.00 an hour Job Type Parttime  Full Job Description  Tuesday Morning is taking the lead in offprice retail offering upscale decorative home accessories, housewares, seasonal goods and famousmaker gifts.  Our mission is simple offer fresh and exciting merchandise at unbelievable value, with impeccable service.  With over 750 stores in 40 states, and continuing to grow, we are always seeking strong leadership to fuel our growth.  The Part Time Assistant Store Managers role is to, take the lead from and, partner with the Store Manager to engage, motivate and lead a team of associates in operating a profitable store, while creating a positive environment for the associate and the guest. The Assistant Store Manager is the extension of the Store Manager and will provide overall support to drive the Store Managers vision and direction for the store.  Responsibilities  Sales Driving sales by creating a sales generating environment through the implementation of all corporate sales directives. Service Foster a service oriented environment tailored to the unique seeker, and ensuring the guest is always taken care of the right way. Merchandise Ensure Merchandising standards and product presentations are second to none, and create that WOW factor. Leadership Provide ongoing coaching feedback, empowering your team to do whats right, setting clear expectations and leading by example. Communication Set the vision and direction for the store, share information to align your team  help them feel a part of something big.  Skills  experience  23 years of progressively responsible retail, and at least 1 year of supervision, experience required. Must understand and be able to execute concepts related to financial principles, inventory management, and merchandising. Bachelors degree preferred. 
Possess strong leadership skills with the ability to train, coach and mentor associates with professional maturity. Ability to make decisions, communicate, analyze financial information, problem solve, organization and computer skills. Must be 21 years of age. Ability to relocate, for future growth and promotional opportunities, strongly desired.  We offer competitive compensation, excellent benefits to include 401k, bestinclass products and more, in a high performing environment. Working in our stores provides you with unlimited possibilities to start or expand your career.  Pay Range 14.35  23.00hr  Benefits  Join Tuesday Morning and enjoy  Some of the best hours in retail 401K 20 Associate discount Rewarding career with advancement opportunities  CB  Tuesday Morning 
'''

# Tokenize the sample with the shared helper.
encoding = preprocessing(new_sentence, tokenizer)

# Extract IDs and Attention Mask, concatenated along dim 0 and moved to the model's device.
test_ids = torch.cat([encoding['input_ids']], dim=0).to(device)
test_attention_mask = torch.cat([encoding['attention_mask']], dim=0).to(device)


In [None]:
#Print Predictions
# Forward pass only; gradient tracking is switched off for inference.
with torch.no_grad():
    output = model(test_ids, token_type_ids=None, attention_mask=test_attention_mask)

# Index of the largest logit; index 1 is reported as "remote".
logits = output.logits.cpu().numpy()
predicted_class = np.argmax(logits).flatten().item()
if predicted_class == 1:
    prediction = 'remote'
else:
    prediction = 'non-remote'

print('Input Sentence: ', new_sentence)
print('Predicted Class: ', prediction)

In [None]:
#Run on Test Set
import nltk
from nltk.corpus import stopwords
import re
# Fetch the NLTK "stopwords" corpus that remove_stopwords reads via stopwords.words("english").
nltk.download("stopwords")

_ENGLISH_STOP_WORDS = None  # filled on first use by remove_stopwords


def remove_stopwords(text, stop_words=None):
    """Drop stop words from ``text`` and re-join the remaining words with single spaces.

    Args:
        text: String to filter. It is split on whitespace, so runs of
            whitespace collapse to a single space in the result.
        stop_words: Optional container of lower-case words to remove. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The filtered text. Matching is case-insensitive (each word is
        lower-cased before the lookup); surviving words keep their case.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        # Build the default set once instead of re-reading the corpus on every
        # call; this function is applied to every row of the dataframe.
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
        stop_words = _ENGLISH_STOP_WORDS
    return " ".join(word for word in text.split() if word.lower() not in stop_words)

# Only the first 5000 rows of the test set are read. The final accuracy cell
# divides by a literal 5000, so the two numbers must be kept in sync.
df=pd.read_csv("test_set.csv",nrows=5000)


In [None]:
#Preprocess
def preprocess_string(input_string):
    """Return ``input_string`` with every disallowed character deleted.

    Characters that survive: ASCII letters, digits, whitespace, commas,
    periods and question marks. Everything else is removed outright (it is
    not replaced by a space).
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s,.\?]')
    return disallowed.sub('', input_string)

# Clean the raw description column: strip disallowed characters first, then stop words.
df['input1'] = df['input1'].apply(preprocess_string).apply(remove_stopwords)

# Give the two working columns the names the rest of the notebook uses.
df.rename(columns={'input1': 'text', 'output1': 'label'}, inplace=True)


In [None]:
#Splitting each description into chunks of up to 300 words (at most 3 chunks are kept) makes the model run faster.
#A prediction of "remote" means at least one chunk is classified as remote.
def split_into_100_words_each(text, chunk_size=300, max_chunks=3, pad_value="-999"):
    """Split ``text`` into a fixed number of word chunks.

    Despite the historical name, the default chunk holds 300 words, not 100.
    The result always has exactly ``max_chunks`` entries: shorter texts are
    padded with ``pad_value`` and words beyond ``chunk_size * max_chunks``
    are discarded.

    Args:
        text: Input string. Words are the matches of ``\\b\\w+\\b``, so
            punctuation is dropped and words are re-joined with single spaces.
        chunk_size: Maximum number of words per chunk (default 300).
        max_chunks: Exact length of the returned list (default 3).
        pad_value: Sentinel used to fill missing chunks (default "-999").

    Returns:
        A list of ``max_chunks`` strings.

    Raises:
        ValueError: If ``chunk_size`` is below 1 or ``max_chunks`` is negative.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if max_chunks < 0:
        raise ValueError("max_chunks must not be negative")

    words = re.findall(r'\b\w+\b', text)
    # Words past this point would be thrown away, so never build chunks for them.
    usable = min(len(words), chunk_size * max_chunks)
    chunks = [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, usable, chunk_size)
    ]
    # Pad so that every description yields the same number of columns.
    chunks.extend([pad_value] * (max_chunks - len(chunks)))
    return chunks

#Some data processing that combine the chunk columns into one dataframe
chunk_columns = ['input1', 'input2', 'input3']

# Each row gets a list of three chunk strings, padded with the "-999" sentinel.
df['split_text'] = df['text'].apply(split_into_100_words_each)
# Spread the three chunks over three columns, aligned on the original index.
df[chunk_columns] = pd.DataFrame(df['split_text'].to_list(), index=df.index)
df = df.drop(columns=["split_text", "text", "Unnamed: 0"])
# The row index identifies the job so chunk predictions can be regrouped later.
df["job_id"] = df.index

# One frame per chunk position, each exposing its text under the name 'input1'.
# df1/df2/df3 keep their names because the following cell reads them.
df1, df2, df3 = [
    df[["job_id", 'label', column]].rename(columns={column: 'input1'})
    for column in chunk_columns
]

# Stack into one row per (job, chunk) and drop the padding rows.
df = pd.concat([df1, df2, df3], axis=0)
df = df[df['input1'] != "-999"]

In [None]:

# NOTE(review): these three statements repeat the end of the previous cell.
# When the cells run in order, df3 already has its chunk column named 'input1',
# so the rename below changes nothing, and the concat + sentinel filter rebuild
# the same long-format frame from df1/df2/df3.
# Consider deleting this cell — TODO confirm nothing relies on re-running it.
df3 = df3.rename(columns={ 'input3': 'input1'})
df = pd.concat([df1, df2, df3],axis=0)
df = df[df['input1'] != "-999"]

In [None]:
def _predict_chunk(text):
    """Return the index of the largest logit the model produces for one text chunk."""
    # Token IDs and attention mask for this chunk, moved to the model's device.
    encoding = preprocessing(text, tokenizer)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference only: no gradient tracking.
    with torch.no_grad():
        output = model(input_ids, token_type_ids=None, attention_mask=attention_mask)

    # One chunk per forward pass, so the flat arg-max is the class index.
    return int(np.argmax(output.logits.cpu().numpy()))


chunk_texts = df["input1"].tolist()
# A progress bar replaces the previous one-printed-line-per-chunk output.
pred = [_predict_chunk(chunk_texts[position]) for position in trange(len(chunk_texts))]
# assign() returns a new frame, so nothing is written into a filtered slice.
df = df.assign(pred=pred)

In [None]:
# The chunk text is no longer needed once predictions are attached.
df = df.drop(columns=["input1"])

In [None]:
# Add up chunk-level predictions and labels for each job.
chunks_by_job = df.groupby('job_id')
result = chunks_by_job[['pred', "label"]].sum()

In [None]:
# Cap the per-job sums at 1: any total above 1 becomes 1, other values are left as they are.
for column in ('pred', 'label'):
    result[column] = result[column].clip(upper=1)
df = result

In [None]:
# Job-level accuracy: share of jobs whose aggregated prediction equals the label.
matches = df['pred'] == df['label']
count_same_values = matches.sum()
# Divide by the number of jobs actually evaluated rather than a hard-coded 5000:
# jobs whose description produced no words were filtered out with the "-999"
# sentinel, and the CSV may contain fewer rows than the requested limit.
matches.mean()