### AI Project


In [1]:
!pip install datasets
!pip install evaluate
!pip install -U transformers
!pip install kagglehub



In [2]:
import kagglehub
import pandas as pd, re
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
from transformers import (AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)
from datasets import Dataset
import evaluate, numpy as np

In [3]:
# Download latest version
path = kagglehub.dataset_download("gauravduttakiit/resume-dataset")
print("Path to dataset files:", path)


Path to dataset files: /kaggle/input/resume-dataset


In [4]:
resume_csv = os.path.join(path, "UpdatedResumeDataSet.csv")
raw = pd.read_csv(resume_csv)
def scrub(text):
    text = text.lower()
    text = re.sub(r'\S+@\S+','<EMAIL>', text)           # strip emails
    text = re.sub(r'\b\d{10,}\b','<PHONE>', text)       # strip phones
    # remove obvious PII tokens you do not want the model to learn
    return text

raw["jd_clean"]      = raw["Category"].apply(scrub)
raw["resume_clean"]  = raw["Resume"].apply(scrub)

assert not raw.isnull().any().any(), "nulls sneaked in!"

raw.head()

Unnamed: 0,Category,Resume,jd_clean,resume_clean
0,Data Science,Skills * Programming Languages: Python (pandas...,data science,skills * programming languages: python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,data science,education details \r\nmay 2013 to may 2017 b.e...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",data science,"areas of interest deep learning, control syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,data science,skills â¢ r â¢ python â¢ sap hana â¢ table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",data science,"education details \r\n mca ymcaust, faridab..."


Creat

In [5]:
neg = raw.copy()
neg["jd_clean"] = np.random.permutation(neg["jd_clean"].values)
neg["label"]    = 0          # <-- mismatched
pos = raw.copy()
pos["label"]    = 1          # <-- matched

df = pd.concat([pos, neg], ignore_index=True)
df.head()

Unnamed: 0,Category,Resume,jd_clean,resume_clean,label
0,Data Science,Skills * Programming Languages: Python (pandas...,data science,skills * programming languages: python (pandas...,1
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,data science,education details \r\nmay 2013 to may 2017 b.e...,1
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",data science,"areas of interest deep learning, control syste...",1
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,data science,skills â¢ r â¢ python â¢ sap hana â¢ table...,1
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",data science,"education details \r\n mca ymcaust, faridab...",1


In [6]:
train_df, tmp_df = train_test_split(df,  test_size=0.30,
                                    stratify=df["label"],
                                    random_state=42)
val_df,   test_df = train_test_split(tmp_df, test_size=0.50,
                                     stratify=tmp_df["label"],
                                     random_state=42)

### Model Defination:

Tokenize

In [7]:
MODEL_ID = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=2,
    attention_window=256,         # Longformer专有参数，窗口大小
)

MAX_LEN = 1536
def tok_fn(batch):
    return tokenizer(batch["jd_clean"],
                     batch["resume_clean"],
                     truncation=True,
                     padding="max_length",
                     max_length=MAX_LEN)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define dateset

In [8]:
train_ds = Dataset.from_pandas(train_df)
val_ds   = Dataset.from_pandas(val_df)
test_ds  = Dataset.from_pandas(test_df)

train_ds = train_ds.map(tok_fn, batched=True).remove_columns(df.columns[:-1])
val_ds   = val_ds.map(tok_fn, batched=True).remove_columns(df.columns[:-1])
test_ds  = test_ds.map(tok_fn, batched=True).remove_columns(df.columns[:-1])
train_ds.set_format("torch")
val_ds.set_format("torch")
test_ds.set_format("torch")

Map:   0%|          | 0/1346 [00:00<?, ? examples/s]

Map:   0%|          | 0/289 [00:00<?, ? examples/s]

Map:   0%|          | 0/289 [00:00<?, ? examples/s]

In [9]:
args = TrainingArguments(
    output_dir="./checkpoints",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    logging_steps=500,
    save_steps=500,
    eval_steps=500,  # 每500步评估一次
    report_to="none",
)


metric_acc  = evaluate.load("accuracy")
metric_f1   = evaluate.load("f1")

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    acc   = metric_acc.compute(predictions=preds, references=p.label_ids)
    f1    = metric_f1.compute(predictions=preds, references=p.label_ids)
    return {"accuracy": acc["accuracy"], "f1": f1["f1"]}

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_ds,
                  eval_dataset=val_ds,
                  compute_metrics=compute_metrics)



In [10]:
trainer.train()

Initializing global attention on CLS token...


Step,Training Loss
500,0.326
1000,0.1407


TrainOutput(global_step=1275, training_loss=0.2078763640160654, metrics={'train_runtime': 2170.6939, 'train_samples_per_second': 9.301, 'train_steps_per_second': 0.587, 'total_flos': 1.989275105875968e+16, 'train_loss': 0.2078763640160654, 'epoch': 15.0})

In [11]:
test_metrics = trainer.evaluate(test_ds)
print(test_metrics)


{'eval_loss': 0.28879424929618835, 'eval_accuracy': 0.9342560553633218, 'eval_f1': 0.9377049180327869, 'eval_runtime': 8.4102, 'eval_samples_per_second': 34.363, 'eval_steps_per_second': 2.259, 'epoch': 15.0}


In [12]:
from transformers import pipeline
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

def rank_resumes(jd_text, résumé_list, top_k=5):
    jd_text = scrub(jd_text)
    scored = []
    for r in résumé_list:
        r_txt = scrub(r)
        pred = pipe(jd_text + tokenizer.sep_token + r_txt,
                    truncation=True, max_length=512)[0]
        scored.append((pred["score"], r))
    return sorted(scored, reverse=True)[:top_k]


Device set to use cuda:0
