In [15]:
pip install transformers datasets torch

Note: you may need to restart the kernel to use updated packages.


# Load Data

In [5]:
import pandas as pd

# Load the dataset using absolute path
file_path = "/Users/audrey/H-1B Insights AI/USAforeignworkerssalarydata-1556559586172.xlsx"
df = pd.read_excel(file_path)

# Check the first few rows
df.head()

# Summary of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167278 entries, 0 to 167277
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   CASE_NUMBER                     167278 non-null  object 
 1   CASE_STATUS                     167278 non-null  object 
 2   CASE_RECEIVED_DATE              167278 non-null  object 
 3   DECISION_DATE                   167278 non-null  object 
 4   EMPLOYER_NAME                   167278 non-null  object 
 5   PREVAILING_WAGE_SUBMITTED       167278 non-null  float64
 6   PREVAILING_WAGE_SUBMITTED_UNIT  167278 non-null  object 
 7   PAID_WAGE_SUBMITTED             167278 non-null  float64
 8   PAID_WAGE_SUBMITTED_UNIT        167278 non-null  object 
 9   JOB_TITLE                       167278 non-null  object 
 10  WORK_CITY                       167275 non-null  object 
 11  EDUCATION_LEVEL_REQUIRED        11063 non-null   object 
 12  COLLEGE_MAJOR_RE

### Data Format
- CASE_NUMBER: Unique identifier for each visa application case.
- CASE_STATUS: Status of the case (e.g., certified, denied).
- EMPLOYER_NAME: The name of the employer filing the H-1B petition.
- PREVAILING_WAGE_SUBMITTED: The wage submitted by the employer as part of the application.
- PAID_WAGE_SUBMITTED: The wage paid to the worker.
- JOB_TITLE: The job title for which the H-1B visa is filed.
- WORK_CITY and WORK_STATE: The city and state where the job is located.
- VISA_CLASS: Type of visa being applied for.
- PREVAILING_WAGE_PER_YEAR and PAID_WAGE_PER_YEAR: The annualized wages (submitted and paid).


# Clean Data

In [9]:
# Create a new DataFrame for cleaned data 
df_cleaned = df.copy()

# Drop rows where key columns have missing data 
df_cleaned = df_cleaned.dropna(subset=['CASE_NUMBER', 'EMPLOYER_NAME', 'JOB_TITLE'])

# Normalize date formats to a consistent format (YYYY-MM-DD)
df_cleaned['CASE_RECEIVED_DATE'] = pd.to_datetime(df_cleaned['CASE_RECEIVED_DATE'], errors='coerce').dt.date
df_cleaned['DECISION_DATE'] = pd.to_datetime(df_cleaned['DECISION_DATE'], errors='coerce').dt.date

# Ensure numeric columns are correctly formatted as floats
numeric_columns = ['PREVAILING_WAGE_SUBMITTED', 'PAID_WAGE_SUBMITTED', 'PREVAILING_WAGE_PER_YEAR', 'PAID_WAGE_PER_YEAR']
df_cleaned[numeric_columns] = df_cleaned[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Save the cleaned data to a new Excel file to preserve the original
cleaned_file_path = "/Users/audrey/H-1B Insights AI/Cleaned_H1B_Visa_Data.xlsx"
df_cleaned.to_excel(cleaned_file_path, index=False)

# Check the cleaned data summary and first few rows
print(df_cleaned.info())
print(df_cleaned.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167278 entries, 0 to 167277
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   CASE_NUMBER                     167278 non-null  object 
 1   CASE_STATUS                     167278 non-null  object 
 2   CASE_RECEIVED_DATE              167278 non-null  object 
 3   DECISION_DATE                   167278 non-null  object 
 4   EMPLOYER_NAME                   167278 non-null  object 
 5   PREVAILING_WAGE_SUBMITTED       167278 non-null  float64
 6   PREVAILING_WAGE_SUBMITTED_UNIT  167278 non-null  object 
 7   PAID_WAGE_SUBMITTED             167278 non-null  float64
 8   PAID_WAGE_SUBMITTED_UNIT        167278 non-null  object 
 9   JOB_TITLE                       167278 non-null  object 
 10  WORK_CITY                       167275 non-null  object 
 11  EDUCATION_LEVEL_REQUIRED        11063 non-null   object 
 12  COLLEGE_MAJOR_RE

# Convert Rows to Sentences

In [10]:
# Function to convert each row into a sentence
def row_to_sentence(row):
    return (
        f"On {row['CASE_RECEIVED_DATE']}, {row['EMPLOYER_NAME']} applied for a {row['VISA_CLASS']} visa "
        f"for the position of {row['JOB_TITLE']} in {row['WORK_STATE']}. "
        f"The prevailing wage was {row['PREVAILING_WAGE_SUBMITTED']} {row['PREVAILING_WAGE_SUBMITTED_UNIT']} "
        f"and the paid wage was {row['PAID_WAGE_SUBMITTED']} {row['PAID_WAGE_SUBMITTED_UNIT']}. "
        f"The application was {row['CASE_STATUS']} on {row['DECISION_DATE']}."
    )

# Apply the function to each row
df_cleaned['Sentence'] = df_cleaned.apply(row_to_sentence, axis=1)

# Show the first few sentences
df_cleaned['Sentence'].head()

0    On 2014-03-14, ADVANCED TECHNOLOGY GROUP USA, ...
1    On 2015-03-19, SAN FRANCISCO STATE UNIVERSITY ...
2    On 2013-09-13, CAROUSEL SCHOOL applied for a H...
3    On 2014-03-28, HARLINGEN CONSOLIDATED INDEPEND...
4    On 2014-09-16, SIGNAL SCIENCES CORPORATION app...
Name: Sentence, dtype: object

In [11]:
# Set display options to show the full width of the text
pd.set_option('display.max_colwidth', None)

# Now display the first few rows of the 'Sentence' column
print(df_cleaned['Sentence'].head())

0                           On 2014-03-14, ADVANCED TECHNOLOGY GROUP USA, INC. applied for a H-1B visa for the position of SOFTWARE ENGINEER in IL. The prevailing wage was 6217100.0 year and the paid wage was 62171.0 year. The application was denied on 2014-03-21.
1    On 2015-03-19, SAN FRANCISCO STATE UNIVERSITY applied for a greencard visa for the position of Assistant Professor of Marketing in CALIFORNIA. The prevailing wage was 5067600.0 year and the paid wage was 91440.0 year. The application was denied on 2015-03-19.
2                                       On 2013-09-13, CAROUSEL SCHOOL applied for a H-1B visa for the position of SPECIAL EDUCATION TEACHER in CA. The prevailing wage was 4947000.0 year and the paid wage was 49470.0 year. The application was denied on 2013-09-23.
3              On 2014-03-28, HARLINGEN CONSOLIDATED INDEPENDENT SCHOOL DISTRICT applied for a H-1B visa for the position of SCIENCE TEACHER in TX. The prevailing wage was 251052.0 month and the paid wage 

In [13]:
# Save the sentences to a text file
sentence_file_path = "/Users/audrey/H-1B Insights AI/Sentences_H1B_Visa_Data.txt"
df_cleaned['Sentence'].to_csv(sentence_file_path, index=False, header=False)

# Print a success message
print(f"Sentences successfully saved to {sentence_file_path}")

Sentences successfully saved to /Users/audrey/H-1B Insights AI/Sentences_H1B_Visa_Data.txt


# Train GPT-2 Model

In [16]:
import pandas as pd

# Load the sentence data
sentences_file = "/Users/audrey/H-1B Insights AI/Sentences_H1B_Visa_Data.txt"
sentences = pd.read_csv(sentences_file, header=None)[0].tolist()

# Check the first few sentences
print(sentences[:5])

['On 2014-03-14, ADVANCED TECHNOLOGY GROUP USA, INC. applied for a H-1B visa for the position of SOFTWARE ENGINEER in IL. The prevailing wage was 6217100.0 year and the paid wage was 62171.0 year. The application was denied on 2014-03-21.', 'On 2015-03-19, SAN FRANCISCO STATE UNIVERSITY applied for a greencard visa for the position of Assistant Professor of Marketing in CALIFORNIA. The prevailing wage was 5067600.0 year and the paid wage was 91440.0 year. The application was denied on 2015-03-19.', 'On 2013-09-13, CAROUSEL SCHOOL applied for a H-1B visa for the position of SPECIAL EDUCATION TEACHER in CA. The prevailing wage was 4947000.0 year and the paid wage was 49470.0 year. The application was denied on 2013-09-23.', 'On 2014-03-28, HARLINGEN CONSOLIDATED INDEPENDENT SCHOOL DISTRICT applied for a H-1B visa for the position of SCIENCE TEACHER in TX. The prevailing wage was 251052.0 month and the paid wage was 43800.0 year. The application was denied on 2014-04-07.', 'On 2014-09-16,

In [29]:
from transformers import GPT2Tokenizer

# Load the pre-trained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set the pad token to the eos token (end-of-sequence token)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the sentences
inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=128)

# Check the tokenized inputs
print(inputs)

{'input_ids': tensor([[ 2202,  1946,    12,  ..., 50256, 50256, 50256],
        [ 2202,  1853,    12,  ..., 50256, 50256, 50256],
        [ 2202,  2211,    12,  ..., 50256, 50256, 50256],
        ...,
        [ 2202,  2813,    12,  ..., 50256, 50256, 50256],
        [ 2202,  2813,    12,  ..., 50256, 50256, 50256],
        [ 2202,  2813,    12,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [32]:
from torch.utils.data import Dataset, DataLoader, Subset

class TextDataset(Dataset):
    def __init__(self, inputs):
        self.input_ids = inputs['input_ids']
        self.attention_mask = inputs['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx]
        }

# Create a dataset
train_dataset = TextDataset(inputs)

# Use only a subset (first 10 examples) for faster demonstration
subset_indices = list(range(10))  # First 10 indices
train_subset = Subset(train_dataset, subset_indices)

# Create a DataLoader with the subset
train_loader = DataLoader(train_subset, batch_size=2, shuffle=True)  # Adjust batch_size as needed

# Iterate over the DataLoader
for batch in train_loader:
    print(batch)

{'input_ids': tensor([[ 2202,  1946,    12,  3070,    12,  1415,    11, 43685, 20940,  1961,
         44999,    45, 43781, 44441,  4916,    11, 19387,    13,  5625,   329,
           257,   367,    12,    16,    33, 14552,   329,   262,  2292,   286,
         47466, 36924,  8881,  1137,   287, 14639,    13,   383, 26602,  7699,
           373,  8190,  1558,  3064,    13,    15,   614,   290,   262,  3432,
          7699,   373,  8190, 27192,    13,    15,   614,    13,   383,  3586,
           373,  6699,   319,  1946,    12,  3070,    12,  2481,    13, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256],
        [ 2202,  1853,    12,  3070,    12,  1129,    11, 37376,  8782, 20940,
          1797,  8220, 35454, 49677,  9050,  5625,   329,   257,  4077,  9517,
         14552,   329,   262,  2292,   286, 15286, 

In [33]:
from transformers import GPT2LMHeadModel
import torch
from torch.optim import AdamW

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Set the model to training mode
model.train()

epochs = 1  # Reduced for faster training
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        # Move batch to device (GPU/CPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Get model output and compute loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss

        # Backpropagate the loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Total loss: {total_loss:.4f}")

Epoch 1/1
Total loss: 17.3248


### Note: This example runs for only 1 epoch for demonstration purposes.
### The total loss (e.g., 17.3248) is high because the model has only been trained for a single epoch.
### In a real-world application, you'd typically run for multiple epochs to allow the model to learn from the data.
### With more epochs and tuning, the loss would gradually decrease, leading to better model performance.

# Save trained model

In [34]:
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

# Test Model

In [42]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_gpt2')

# Set the model to evaluation mode (not training)
model.eval()

# If you have a GPU available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Example function to generate text
def generate_text(prompt):
    # Tokenize the prompt and prepare it for the model
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    # Generate output with additional settings for better text control
    outputs = model.generate(
        **inputs,
        max_length=100,                  # Limit the length to avoid overly long outputs
        num_return_sequences=1,          # Generate only one response
        pad_token_id=tokenizer.eos_token_id,  # Set pad token to the end of sequence token
        top_k=50,                        # Top-k sampling for diversity
        top_p=0.95,                      # Nucleus sampling
        temperature=0.7,                 # Controls randomness: lower values mean more deterministic
        repetition_penalty=1.2           # Penalize repetitions to reduce redundant output
    )
    
    # Decode the generated tokens back into text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Clean up the text to make it look better (remove redundant sentences)
    cleaned_text = post_process_output(generated_text)
    
    return cleaned_text

# Post-processing function to handle simple cleaning of the output
def post_process_output(text):
    # Remove repeated sentences by splitting and rejoining sentences
    sentences = text.split('. ')
    cleaned_sentences = []
    for sentence in sentences:
        if sentence not in cleaned_sentences:
            cleaned_sentences.append(sentence)
    
    return '. '.join(cleaned_sentences) + '.' if len(cleaned_sentences) > 0 else text

# Test the model with a prompt
prompt = "Can you tell me about the H-1B visa process?"
generated_text = generate_text(prompt)

# Print out the generated text
print("Prompt:", prompt)
print("Generated Response:", generated_text)

Prompt: Can you tell me about the H-1B visa process?
Generated Response: Can you tell me about the H-1B visa process?
A: The U.S.-based company that is responsible for processing applications from foreign nationals, has been working with us to ensure we are able and willing to provide a fair opportunity in order to meet our needs as well as providing an affordable solution which will allow them to stay here permanently without having their visas revoked or denied by any government agency at all!.
