In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
from types import SimpleNamespace
from pathlib import Path
from datetime import datetime
import math
import os
import re
import random

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import log_loss,f1_score
from sklearn.model_selection import train_test_split


from fastai.imports import *
from torch.utils.data import DataLoader
import warnings,transformers,logging,torch
from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer
import datasets
from datasets import load_dataset, Dataset, DatasetDict

In [6]:
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
set_seed(42)

In [7]:
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

In [8]:
INPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/input/'
OUTPUT_DIR = '/content/drive/MyDrive/Competitions/Signate/Student Cup 2022/output/'
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'Submission')
OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'Model')

In [9]:
model_nm = 'microsoft/deberta-v3-small'

In [10]:
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
submission_df = pd.read_csv(os.path.join(INPUT_DIR, 'submit_sample.csv'),header=None)
display(train.head())
print(train.shape)

Unnamed: 0,id,description,jobflag
0,0,<li>Develop cutting-edge web applications that perform superbly across all platforms.</li><li>Work in a highly collaborative environment with cross-functional teams on projects ranging from a few weeks to a few months in length.</li><li>Maintain high standard of quality as you creatively and strategically problem solve throughout the product delivery process.</li><li>Be able to effectively communicate your work with both technical and non-technical peers</li><li>Be excited about new web technologies/techniques</li><li>Build solid front-end architectures that integrate easily with other sys...,3
1,1,"<li> Designs and develops high quality, scalable and efficient solutions and products on schedule </li><li> Engages with system users and business analysts to identify system enhancements and/or new applications to meet business needs. </li><li> Actively contributes to the development of solutions and ideas that add value. </li><li> Proactively performs extensive system testing to ensure that the systems work efficiently and are developed following the applicable development methodology. </li><li> Provides support during meetings as required; prepares and takes dictation, minutes, agendas,...",3
2,2,"<li>Functions as a point person for Network Strategy work. Represents Network Strategy at internal and external stakeholder meetings. Serves as a resource for functional areas in responding to Network Strategy initiatives (identifying standard or historical responses) and coordinates response with other departments as necessary.</li><li>Communicates effectively and serves as a point of contact for status updates and overall project status. Coordinates activity with functional areas, lines of business, and markets to ensure schedule is met.</li><li>Monitors network activities to ensure the ...",4
3,3,"<li> Work on the technical design, development, release and deployment of cloud-based infrastructure and applications </li><li> Working with, and supporting, the development team with application configuration for deployment, monitoring and other automation </li><li> Provide operational management and support of Linux and Windows servers and containers, including server hardening, patching, network security and log management to deliver web application and service stacks </li><li> Mentor and collaborate with other team members</li><li> Provide on-call/out of hours support, as part of a rot...",3
4,4,"<li>Quantify the resources required for a task/project related to an Enterprise Resource Platform (ERP) implementation</li><li>Familiarize yourself with integration involving Sales, CRM, eCommerce, supply chain, manufacturing, inventory, POS, accounting, and so much more</li>",4


(1516, 3)


In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_nm)
#tokz.model_max_length = 512
sep = tokenizer.sep_token
sep

'[SEP]'

In [12]:
tokenizer.all_special_tokens

['[CLS]', '[SEP]', '[UNK]', '[PAD]', '[MASK]']

In [13]:
def remove_tag(x):
    p = re.compile(r"<[^>]*?>")
    return p.sub('',x)

def cleaning(texts):
    clean_texts = []
    for text in texts:
        # htmlタグを削除
        text = remove_tag(text)
        #アルファベット以外をスペースに置き換え
        #clean_punc = re.sub(r'[^a-zA-Z]', ' ', text)
        clean_texts.append(text)
    return clean_texts



from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

train['description'] = cleaning(train['description'])
test['description'] = cleaning(test['description'])
train['inputs'] = train['description'].apply(lambda x : resolve_encodings_and_normalize(x))
test['inputs'] = test['description'].apply(lambda x : resolve_encodings_and_normalize(x))
train = train.rename(columns = {"jobflag": "label"})
train["label"] = train["label"].apply(lambda x : 0 if x == 4 else x)
train

Unnamed: 0,id,description,label,inputs
0,0,Develop cutting-edge web applications that perform superbly across all platforms.Work in a highly collaborative environment with cross-functional teams on projects ranging from a few weeks to a few months in length.Maintain high standard of quality as you creatively and strategically problem solve throughout the product delivery process.Be able to effectively communicate your work with both technical and non-technical peersBe excited about new web technologies/techniquesBuild solid front-end architectures that integrate easily with other systems and technologiesWorking closely with other d...,3,Develop cutting-edge web applications that perform superbly across all platforms.Work in a highly collaborative environment with cross-functional teams on projects ranging from a few weeks to a few months in length.Maintain high standard of quality as you creatively and strategically problem solve throughout the product delivery process.Be able to effectively communicate your work with both technical and non-technical peersBe excited about new web technologies/techniquesBuild solid front-end architectures that integrate easily with other systems and technologiesWorking closely with other d...
1,1,"Designs and develops high quality, scalable and efficient solutions and products on schedule Engages with system users and business analysts to identify system enhancements and/or new applications to meet business needs. Actively contributes to the development of solutions and ideas that add value. Proactively performs extensive system testing to ensure that the systems work efficiently and are developed following the applicable development methodology. Provides support during meetings as required; prepares and takes dictation, minutes, agendas, notices and manages mailings of meeting...",3,"Designs and develops high quality, scalable and efficient solutions and products on schedule Engages with system users and business analysts to identify system enhancements and/or new applications to meet business needs. Actively contributes to the development of solutions and ideas that add value. Proactively performs extensive system testing to ensure that the systems work efficiently and are developed following the applicable development methodology. Provides support during meetings as required; prepares and takes dictation, minutes, agendas, notices and manages mailings of meeting..."
2,2,"Functions as a point person for Network Strategy work. Represents Network Strategy at internal and external stakeholder meetings. Serves as a resource for functional areas in responding to Network Strategy initiatives (identifying standard or historical responses) and coordinates response with other departments as necessary.Communicates effectively and serves as a point of contact for status updates and overall project status. Coordinates activity with functional areas, lines of business, and markets to ensure schedule is met.Monitors network activities to ensure the quality and timeliness...",0,"Functions as a point person for Network Strategy work. Represents Network Strategy at internal and external stakeholder meetings. Serves as a resource for functional areas in responding to Network Strategy initiatives (identifying standard or historical responses) and coordinates response with other departments as necessary.Communicates effectively and serves as a point of contact for status updates and overall project status. Coordinates activity with functional areas, lines of business, and markets to ensure schedule is met.Monitors network activities to ensure the quality and timeliness..."
3,3,"Work on the technical design, development, release and deployment of cloud-based infrastructure and applications Working with, and supporting, the development team with application configuration for deployment, monitoring and other automation Provide operational management and support of Linux and Windows servers and containers, including server hardening, patching, network security and log management to deliver web application and service stacks Mentor and collaborate with other team members Provide on-call/out of hours support, as part of a rota with other team members",3,"Work on the technical design, development, release and deployment of cloud-based infrastructure and applications Working with, and supporting, the development team with application configuration for deployment, monitoring and other automation Provide operational management and support of Linux and Windows servers and containers, including server hardening, patching, network security and log management to deliver web application and service stacks Mentor and collaborate with other team members Provide on-call/out of hours support, as part of a rota with other team members"
4,4,"Quantify the resources required for a task/project related to an Enterprise Resource Platform (ERP) implementationFamiliarize yourself with integration involving Sales, CRM, eCommerce, supply chain, manufacturing, inventory, POS, accounting, and so much more",0,"Quantify the resources required for a task/project related to an Enterprise Resource Platform (ERP) implementationFamiliarize yourself with integration involving Sales, CRM, eCommerce, supply chain, manufacturing, inventory, POS, accounting, and so much more"
...,...,...,...,...
1511,1511,"Support detailed reporting, statistical analyses, modeling and forecasting for Teladoc’s clinical service and clinical operations teams.Collaborate closely with cross-functional stakeholders, scope and solve problems they face, and help decision-makers on the team form educated, practical strategies and recommendations.Leverage a broad set of technical skills and strong business understanding to creatively solve complex problems with strategic impact.Solid communication and stakeholder management skills plus ability to deliver complex projects in time.",1,"Support detailed reporting, statistical analyses, modeling and forecasting for Teladoc\u2019s clinical service and clinical operations teams.Collaborate closely with cross-functional stakeholders, scope and solve problems they face, and help decision-makers on the team form educated, practical strategies and recommendations.Leverage a broad set of technical skills and strong business understanding to creatively solve complex problems with strategic impact.Solid communication and stakeholder management skills plus ability to deliver complex projects in time."
1512,1512,"Collaborate with teams to support the ML technical roadmap.Offer support and troubleshooting assistance for the ML pipeline, while continuously improving stability, monitoring and alerting along the way.Collaborate on managing the AWS stack which comprises all ML resources.Collaborate on managing ML infrastructure costs.",2,"Collaborate with teams to support the ML technical roadmap.Offer support and troubleshooting assistance for the ML pipeline, while continuously improving stability, monitoring and alerting along the way.Collaborate on managing the AWS stack which comprises all ML resources.Collaborate on managing ML infrastructure costs."
1513,1513,"Work with executives and other business leaders to identify opportunities for improvement. Establish KPIs to measure the effectiveness of business decisions. Meets scheduled on time reporting requirements. Maintains a professional and courteous attitude with internal and external customer regardless of customer response. Work with a team of analysts and other associates to process information. Create presentations and reports based on recommendations and findings. Responds quickly to ad-hoc requests. Applies strong communication skills: Effectively listens, probes situations, analyzes nee...",1,"Work with executives and other business leaders to identify opportunities for improvement. Establish KPIs to measure the effectiveness of business decisions. Meets scheduled on time reporting requirements. Maintains a professional and courteous attitude with internal and external customer regardless of customer response. Work with a team of analysts and other associates to process information. Create presentations and reports based on recommendations and findings. Responds quickly to ad-hoc requests. Applies strong communication skills: Effectively listens, probes situations, analyzes nee..."
1514,1514,"Leading design ideation sessions to ensure we are building scalable, maintainable, and durable systemsDriving observability practices to increase our abilities to debug, monitor, and alert Embracing modern DevOps practices as we move to more containerized applications that are continuously deployed",3,"Leading design ideation sessions to ensure we are building scalable, maintainable, and durable systemsDriving observability practices to increase our abilities to debug, monitor, and alert Embracing modern DevOps practices as we move to more containerized applications that are continuously deployed"


In [14]:
ds = Dataset.from_pandas(train)

In [15]:
def tok_func(x): return tokenizer(x["inputs"], truncation=True)

In [16]:
tok_func(ds[0])

{'input_ids': [1, 16323, 2947, 271, 9676, 967, 1567, 272, 2091, 33618, 679, 305, 3938, 260, 22625, 267, 266, 1344, 7641, 1192, 275, 1943, 271, 21414, 1737, 277, 1205, 5170, 292, 266, 477, 1033, 264, 266, 477, 740, 267, 2231, 260, 112619, 459, 1264, 265, 607, 283, 274, 21665, 263, 16154, 735, 3634, 1075, 262, 714, 1779, 568, 260, 10374, 526, 264, 2864, 4105, 290, 374, 275, 462, 2064, 263, 745, 271, 27119, 7295, 10374, 2199, 314, 353, 967, 2737, 320, 80222, 268, 36717, 2176, 831, 271, 3308, 30805, 272, 7424, 1166, 275, 340, 1050, 263, 2737, 42212, 3390, 275, 340, 9990, 287, 17345, 271, 3308, 261, 21373, 261, 2169, 261, 24589, 285, 20330, 266, 4197, 1251, 264, 2353, 263, 266, 986, 1211, 264, 1729, 320, 558, 311, 2620, 280, 268, 451, 374, 886, 261, 500, 1943, 271, 51515, 263, 1943, 271, 22028, 5309, 263, 876, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [17]:
tok_ds = ds.map(tok_func, batched=True, remove_columns=("id","inputs","description"))

  0%|          | 0/2 [00:00<?, ?ba/s]

In [18]:
tok_ds[0].keys()

dict_keys(['label', 'input_ids', 'token_type_ids', 'attention_mask'])

In [19]:
def get_dds(df, train=True):
    ds = Dataset.from_pandas(df)
    to_remove = ["id","inputs","description"]
    tok_ds = ds.map(tok_func, batched=True, remove_columns=to_remove)
    if train:
        return DatasetDict({"train":tok_ds.select(trn_idxs), "test": tok_ds.select(val_idxs)})
    else: 
        return tok_ds

In [20]:
lr,bs = 8e-5,16
wd,epochs = 0.01,5

In [21]:
from sklearn.metrics import log_loss
import torch.nn.functional as F
def score(preds): return {'f1_score': f1_score(preds.label_ids, np.argmax(F.softmax(torch.Tensor(preds.predictions)),axis=1),average='macro')}  

In [22]:
def get_trainer(dds):
    args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
        evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
        num_train_epochs=epochs, weight_decay=wd, report_to='none')
    model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=4)
    return Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                   tokenizer=tokenizer, compute_metrics=score)

In [23]:
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=2022)

In [24]:
preds = []
for fold ,(trn_idxs, val_idxs) in enumerate(skf.split(train,train.label)):
  print("="*20 , f" fold {fold} ", "="*20)
  dds = DatasetDict({"train":tok_ds.select(trn_idxs),
             "test": tok_ds.select(val_idxs)})
  trainer = get_trainer(dds)
  trainer.train()
  test_ds = get_dds(test,train=False)
  pred = F.softmax(torch.Tensor(trainer.predict(test_ds).predictions)).numpy().astype(float)
  preds.append(pred)



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.928094,0.474558
2,No log,0.865292,0.503822
3,No log,0.812988,0.668722
4,No log,0.842402,0.681061
5,No log,0.84564,0.669969


  0%|          | 0/2 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.931842,0.385633
2,No log,0.768275,0.539802
3,No log,0.717236,0.657507
4,No log,0.699617,0.720954
5,No log,0.745761,0.719936


  0%|          | 0/2 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.856099,0.524196
2,No log,0.994571,0.549109
3,No log,0.778968,0.561082
4,No log,0.683958,0.730012
5,No log,0.713403,0.702438


  0%|          | 0/2 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.910776,0.515821
2,No log,0.701477,0.590919
3,No log,0.678265,0.649397
4,No log,0.716754,0.657726
5,No log,0.713478,0.662761


  0%|          | 0/2 [00:00<?, ?ba/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.981985,0.481216
2,No log,0.785602,0.532705
3,No log,0.716682,0.642572
4,No log,0.748177,0.66212
5,No log,0.787696,0.655285


  0%|          | 0/2 [00:00<?, ?ba/s]

In [25]:
sub = submission_df.copy()
sub.columns = ["id","label"]
for i in range(5):
  sub[f"fold{i+1}"] = np.argmax(preds[i],axis=1)

In [26]:
sub["label"] = sub.loc[:,"fold1":"fold5"].mode(axis=1)[0]
sub["label"] = sub["label"].astype("int")
sub["label"] = sub["label"].apply(lambda x : 4 if x==0 else x)
sub[["id","label"]].to_csv(os.path.join(OUTPUT_SUB_DIR,"submission07.csv"),index=False,header=False)
sub[["id","label"]]

Unnamed: 0,id,label
0,1516,1
1,1517,4
2,1518,3
3,1519,4
4,1520,3
...,...,...
1512,3028,3
1513,3029,1
1514,3030,3
1515,3031,1


In [27]:
sub

Unnamed: 0,id,label,fold1,fold2,fold3,fold4,fold5
0,1516,1,1,1,1,1,1
1,1517,4,0,0,0,0,0
2,1518,3,3,3,3,3,3
3,1519,4,0,0,0,0,0
4,1520,3,3,3,2,3,3
...,...,...,...,...,...,...,...
1512,3028,3,3,3,0,3,3
1513,3029,1,1,1,1,1,1
1514,3030,3,3,3,3,3,3
1515,3031,1,1,1,1,0,1
