# About this notebook
- [Luke](https://arxiv.org/pdf/2010.01057v1.pdf)-base starter notebook
- [Training notebook](https://www.kaggle.com/yasufuminakama/jigsaw4-luke-base-starter-train)
- Approach References
    - https://www.kaggle.com/c/jigsaw-toxic-severity-rating/discussion/286471
    - https://www.kaggle.com/debarshichanda/pytorch-w-b-jigsaw-starter
    - https://www.kaggle.com/debarshichanda/0-816-jigsaw-inference
    - Thanks for sharing @debarshichanda

# Directory settings

In [None]:
# ====================================================
# Directory settings
# ====================================================
import os

# Directory that receives the artifacts written by this notebook (e.g. the log file).
OUTPUT_DIR = './'
# exist_ok=True makes the call idempotent and removes the check-then-create race
# of the previous `if not os.path.exists(...)` guard.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# CFG

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    # Inference configuration shared by the tokenizer, dataset, model and fold loop.
    # Directory holding the tokenizer, the pickled model config and the per-fold weights.
    model_dir='../input/luke-0860/jigsaw-server-ruddit-luke/'
    # DataLoader worker processes.
    num_workers=4
    # Hugging Face model identifier; also used (with '/' -> '-') in the weight file names.
    model="studio-ousia/luke-base"
    # DataLoader batch size.
    batch_size=128
    # Dropout probability applied before the final linear layer.
    fc_dropout=0.
    # Name of the text column read by the dataset.
    text="text"
    # NOTE(review): not referenced in the visible code — presumably the training target column.
    target="target"
    # Output dimension of the final linear layer.
    target_size=1
    # Number of leading / trailing tokens kept when a sequence exceeds max_len.
    head=32
    tail=32
    # NOTE(review): not referenced in the visible code; seed_everything is called with a literal 42.
    seed=42
    # Number of fold checkpoints averaged at inference time.
    n_fold=5


# Fixed padded sequence length: head tokens plus tail tokens.
CFG.max_len = CFG.head + CFG.tail

# Library

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import sys
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# Replace the preinstalled transformers stack with pinned versions installed from
# local wheel files (`--no-index` means pip never contacts a package index).
# NOTE(review): the exit codes returned by os.system are ignored, so a failed
# uninstall/copy/install only surfaces later, at the `import transformers` below.
os.system('pip uninstall -q transformers -y')
os.system('pip uninstall -q tokenizers -y')
os.system('pip uninstall -q huggingface_hub -y')

# tokenizers 0.10.3 (CPython 3.7 wheel)
os.system('mkdir -p /tmp/pip/cache-tokenizers/')
os.system('cp ../input/tokenizers-0103/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl /tmp/pip/cache-tokenizers/')
os.system('pip install -q --no-index --find-links /tmp/pip/cache-tokenizers/ tokenizers')

# huggingface_hub 0.0.8
os.system('mkdir -p /tmp/pip/cache-huggingface-hub/')
os.system('cp ../input/huggingface-hub-008/huggingface_hub-0.0.8-py3-none-any.whl /tmp/pip/cache-huggingface-hub/')
os.system('pip install -q --no-index --find-links /tmp/pip/cache-huggingface-hub/ huggingface_hub')

# transformers 4.7.0
os.system('mkdir -p /tmp/pip/cache-transformers/')
os.system('cp ../input/transformers-470/transformers-4.7.0-py3-none-any.whl /tmp/pip/cache-transformers/')
os.system('pip install -q --no-index --find-links /tmp/pip/cache-transformers/ transformers')

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import LukeTokenizer, LukeModel, LukeConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(df):
    """Return the fraction of rows where the less-toxic prediction is strictly lower.

    Args:
        df: DataFrame with numeric ``less_toxic_pred`` and ``more_toxic_pred`` columns.

    Returns:
        Share of rows (a float in [0, 1]) with ``less_toxic_pred < more_toxic_pred``.

    Raises:
        ZeroDivisionError: if ``df`` has no rows.
    """
    correctly_ordered = df['less_toxic_pred'] < df['more_toxic_pred']
    n_correct = int(correctly_ordered.sum())
    return n_correct / len(df)


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return this module's logger, writing bare messages to stderr and a log file.

    The logger is looked up by ``__name__``, so every call returns the same
    object. Handlers are attached only when an equivalent one is not already
    present; re-running the notebook cell therefore no longer duplicates
    every log line.

    Args:
        filename: Log file path without extension; ``.log`` is appended.

    Returns:
        The configured ``logging.Logger`` at INFO level.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    from os.path import abspath

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # FileHandler stores its target as an absolute path in `baseFilename`.
    log_path = abspath(f"{filename}.log")

    # FileHandler subclasses StreamHandler, hence the exact type check here.
    if not any(type(h) is StreamHandler for h in logger.handlers):
        handler1 = StreamHandler()
        handler1.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler1)

    if not any(isinstance(h, FileHandler) and h.baseFilename == log_path
               for h in logger.handlers):
        handler2 = FileHandler(filename=f"{filename}.log")
        handler2.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler2)

    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN kernels.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

# Data Loading

In [None]:
# ====================================================
# Data Loading
# ====================================================
# Comments to be scored by the LUKE model, and the submission template filled in at the end.
test = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
submission = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')
print(test.shape, submission.shape)
# `display` is provided by the IPython/Jupyter runtime.
display(test.head())
display(submission.head())

# tokenizer

In [None]:
# Load the tokenizer saved next to the model weights and attach it to CFG so
# prepare_input can reach it through the config object.
CFG.tokenizer = LukeTokenizer.from_pretrained(CFG.model_dir+'tokenizer/')

# Dataset

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(text, cfg):
    """Tokenize ``text`` into fixed-length ``torch.long`` tensors.

    Two modes, selected by ``cfg.tail``:

    * ``cfg.tail == 0``: the tokenizer truncates and pads to ``cfg.max_len``.
    * otherwise: sequences longer than ``cfg.max_len`` keep their first
      ``cfg.head`` and last ``cfg.tail`` tokens; shorter sequences are padded
      on the right (``pad_token_id`` for ``input_ids``, 0 for every other
      field such as the attention mask).

    Args:
        text: Raw input string.
        cfg: Config exposing ``tokenizer``, ``max_len``, ``head`` and ``tail``.
            ``head + tail`` is expected not to exceed ``max_len``.

    Returns:
        The tokenizer's output mapping, with every value replaced by a 1-D
        ``torch.long`` tensor of length ``cfg.max_len``.
    """
    if cfg.tail == 0:
        inputs = cfg.tokenizer.encode_plus(text,
                                           return_tensors=None,
                                           add_special_tokens=True,
                                           max_length=cfg.max_len,
                                           # documented replacement for the deprecated
                                           # pad_to_max_length=True
                                           padding='max_length',
                                           truncation=True)
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    inputs = cfg.tokenizer.encode_plus(text,
                                       return_tensors=None,
                                       add_special_tokens=True,
                                       truncation=True)
    for k, v in inputs.items():
        v = list(v)
        if len(v) > cfg.max_len:
            # Keep the beginning and the end of an over-long sequence.
            v = v[:cfg.head] + v[-cfg.tail:]
        fill_value = cfg.tokenizer.pad_token_id if k == 'input_ids' else 0
        new_v = np.full(cfg.max_len, fill_value, dtype=np.int64)
        # Use the length *after* truncation; the pre-truncation length only
        # worked by accident when head + tail == max_len.
        new_v[:len(v)] = v
        inputs[k] = torch.tensor(new_v, dtype=torch.long)
    return inputs


class TestDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item for inference."""

    def __init__(self, cfg, df):
        """Store the config and the text column (``cfg.text``) with NaN replaced by "none"."""
        self.cfg = cfg
        text_column = df[cfg.text]
        self.text = text_column.fillna("none").values

    def __len__(self):
        """Number of comments."""
        return len(self.text)

    def __getitem__(self, item):
        """Return the tokenized inputs for the comment at position ``item``."""
        raw_text = str(self.text[item])
        return prepare_input(raw_text, self.cfg)

# Model

In [None]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """LUKE encoder with mean pooling over the sequence and a linear head.

    Attribute names (``model``, ``fc_dropout``, ``fc``) define the state-dict
    keys and must stay in sync with the saved checkpoints.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build the encoder and head.

        Args:
            cfg: Config exposing ``model``, ``fc_dropout`` and ``target_size``.
            config_path: Optional path to a ``torch.save``-d model config; when
                omitted the config is fetched via ``LukeConfig.from_pretrained``.
            pretrained: Load pretrained encoder weights when true; otherwise the
                encoder is only constructed from the config.
        """
        super().__init__()
        self.cfg = cfg
        if config_path is not None:
            # NOTE(review): torch.load unpickles the file — only load trusted configs.
            self.config = torch.load(config_path)
        else:
            self.config = LukeConfig.from_pretrained(cfg.model, output_hidden_states=True)
        if not pretrained:
            self.model = LukeModel(self.config)
        else:
            self.model = LukeModel.from_pretrained(cfg.model, config=self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)

    def feature(self, inputs):
        """Return the encoder's first output averaged over dimension 1.

        The mean is taken over every position; no attention mask is applied.
        """
        encoder_outputs = self.model(**inputs)
        token_states = encoder_outputs[0]
        return token_states.mean(dim=1)

    def forward(self, inputs):
        """Return the head output for a batch of tokenized inputs."""
        pooled = self.feature(inputs)
        return self.fc(self.fc_dropout(pooled))

# inference

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over ``test_loader`` and return sigmoid-activated outputs.

    Args:
        test_loader: Iterable of batches, each a mapping of name -> tensor.
        model: Module called as ``model(batch)``.
        device: Device the model and every batch tensor are moved to.

    Returns:
        NumPy array with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    for batch in progress:
        # Move tensors in place so the batch keeps its original mapping type.
        for key, tensor in list(batch.items()):
            batch[key] = tensor.to(device)
        with torch.no_grad():
            logits = model(batch)
        batch_outputs.append(torch.sigmoid(logits).cpu().numpy())
    return np.concatenate(batch_outputs)

In [None]:
# Build the inference loader once; it is reused for every fold.
test_dataset = TestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    shuffle=False,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,
)
config_path = CFG.model_dir + "config.pth"

# One prediction array per fold checkpoint.
predictions = []
for fold in range(CFG.n_fold):
    model = CustomModel(CFG, config_path=config_path, pretrained=False)
    # Weights are loaded onto the CPU first; inference_fn moves the model to `device`.
    state = torch.load(
        CFG.model_dir + f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
        map_location=torch.device('cpu'),
    )
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions.append(prediction)
    # Release the model before the next fold is constructed.
    del model, state
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Average the per-fold predictions, then drop the singleton axis 1 to get a 1-D score vector.
fold_mean = np.mean(predictions, axis=0)
predictions1 = np.squeeze(fold_mean, axis=1)
del fold_mean
del predictions

# tfidf

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import time
import scipy.optimize as optimize
import lightgbm as lgb
from bs4 import BeautifulSoup


from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import rankdata

from collections import defaultdict
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from tqdm import tqdm

import time
import re 
import scipy
from scipy import sparse
import gc 
from IPython.display import display, HTML
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Fold_type == 2 selects the "*_s_" sampled-fold dictionaries in the ensemble cell.
Fold_type=2
# When True, the training frames are cut to 400 rows and different sampling settings are used.
is_test=False
# Passed as `translate` when sampling the jc data (translation augmentation).
translate_aug=False
# When True, ridge_cv also fits a Ridge model on FastText sentence vectors.
fast_vec=True
# Remember to change the translate value inside kdict as well.

# NOTE(review): fold_num_k is not referenced in the visible code.
fold_num_k=5
# Number of sampled folds per dataset.
fold_num_s=1

# Multiplier applied to the FastText-Ridge predictions before the TF-IDF ones are added.
fast_factor=1

In [None]:

# Alternative dataset selections kept for reference:
# data_names=["jc_","rud_","jcc1_","jcc2_","jc_fr_","jc_de_","jc_es_"]
# data_names=["jc_","rud_"]
# Prefixes of the training datasets used in the ensemble.
data_names=["jc_","rud_","jcc1_","jcc2_"]

# Per-dataset ensemble weights, indexed in the same order as data_names
# (only the first len(data_names) entries are read by the ensemble loop).
jc_num,jc_rate,rud_rate=3,4,1
factor_data=[jc_rate/jc_num,rud_rate,jc_rate/jc_num,jc_rate/jc_num,jc_rate/jc_num,jc_rate/jc_num,jc_rate/jc_num]

# Settings for translation augmentation; can be ignored.
translate_data=["jc_s_"]
translate_language=["fr_text","de_text","es_text"]
# Maps a dataset prefix to the cleaning variant applied to validation/test text.
clean_data={"jcc1_s_":1,"jcc2_s_":2}

# Model selection: options are "ridge" and "gbm"; `factor` gives the matching weights.
# Repeated experiments with gbm gave poor results, so it was dropped.
model_choice=["ridge"]
factor=[1]


# Root directory of the input datasets.
system_path=r"../input"
# One output directory per supported model type, created in the working directory.
model_all=["ridge","gbm"]
for model_name in model_all:
    # exist_ok=True replaces the os.listdir() membership check: idempotent,
    # no directory scan and no check-then-create race.
    os.makedirs(model_name, exist_ok=True)
out_path=r"./"

# DATA

In [1]:
# Data from the first Jigsaw competition: Toxic Comment Classification Challenge.

jc_path=os.path.join(system_path,"jigsaw-toxic-comment-classification-challenge")
jc_trans_path=os.path.join(system_path,"jc-trans")
# Ruddit data.
run_path=os.path.join(system_path,"ruddit-jigsaw-dataset/Dataset")
# Data from the second Jigsaw competition (unintended bias in toxicity classification).
juc_path=os.path.join(system_path,"jigsaw-unintended-bias-in-toxicity-classification")

# Data of the current competition, used for validation.
jts_path=os.path.join(system_path,"jigsaw-toxic-severity-rating")

# Per-model output directories.
gbm_save_path=os.path.join(out_path,"gbm")
ridge_save_path=os.path.join(out_path,"ridge")

In [None]:
# Validation set (less_toxic / more_toxic comment pairs) and test set (comments to score).
df_val = pd.read_csv(os.path.join(jts_path,"validation_data.csv"))

df_test = pd.read_csv(os.path.join(jts_path,"comments_to_score.csv"))

In [None]:
# Data from the first competition; every label column holds 0/1.
features = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]

jc_train_df = pd.read_csv(os.path.join(jc_path,"train.csv"))
jc_test_df = pd.read_csv(os.path.join(jc_path,"test.csv"))
temp_df = pd.read_csv(os.path.join(jc_path,"test_labels.csv"))

jc_test_df = jc_test_df.merge(temp_df, on="id")
# Drop test rows that were not used for scoring (labelled -1) ...
jc_test_df = jc_test_df.query("toxic != -1")
# ... and keep only the test rows carrying at least one toxic label.
jc_test_df = jc_test_df[jc_test_df[features].sum(axis=1) > 0]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with default arguments is the equivalent call.
jc_df = pd.concat([jc_train_df, jc_test_df])

# jc_df=jc_train_df

# Flag the comments that show any kind of toxic behaviour.
jc_df["toxic_subtype_sum"] = jc_df[features].sum(axis=1)
jc_df["toxic_behaviour"] = jc_df["toxic_subtype_sum"].map(lambda x: x > 0)

tot_toxic_behaviour = jc_df["toxic_behaviour"].sum()
print(f'comments with toxic behaviour:{tot_toxic_behaviour}')
jc_df = jc_df.reset_index(drop=True)

In [None]:
# Preprocessing of the first-competition data.
# Earlier label weights, kept for reference:
# toxic = 1.0
# severe_toxic = 2.0
# obscene = 1.0
# threat = 1.0
# insult = 1.0
# identity_hate = 2.0
# Weight applied to each 0/1 label column before the columns are summed into `y`.
cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5,
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}
PSEUDO_LABEL_WEIGHT = 0.033
toxic_labels = list(cat_mtpl)


def create_train (df):
    """Build the regression training frame from the 0/1 label columns.

    Each label column is multiplied by its weight in ``cat_mtpl`` and the
    weighted columns are summed into the target ``y``.

    Args:
        df: DataFrame with ``id``, ``comment_text`` and the six label columns.
            It is not modified; the scaling happens on a copy so that calling
            the function twice on the same frame cannot double-scale labels.

    Returns:
        New DataFrame with columns ``id``, ``text`` (renamed from
        ``comment_text``), ``y`` and the six weighted label columns.
    """
    df = df.copy()
    for category, weight in cat_mtpl.items():
        df[category] = df[category] * weight
    df['y'] = df.loc[:, toxic_labels].sum(axis=1)

    df = df[["id",'comment_text', 'y', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].rename(columns={'comment_text': 'text'})
    return df
        
# Replace the raw frame with the weighted training frame (id, text, y, label columns).
jc_df = create_train (jc_df)
# Bare expression: shown as the notebook cell's output.
jc_df.shape

In [None]:
# Ruddit data.
rud_df = pd.read_csv(os.path.join(run_path,"ruddit_with_text.csv"))

print(f"rud_df:{rud_df.shape}")
# Scores at or below zero become 0.0; positive scores are kept unchanged.
rud_df['y'] = rud_df['offensiveness_score'].map(lambda x: 0.0 if x <=0 else x)
# rud_df['y'] = rud_df['offensiveness_score']

rud_df = rud_df[['txt', 'y']].rename(columns={'txt': 'text'})
# Drop rows whose text is exactly the "[deleted]" placeholder.
dele_flag="[deleted]"
rud_df=rud_df.query("text!=@dele_flag")

In [None]:
#clean data
def clean1(text):
    """Normalize punctuation spacing and collapse character repetitions.

    The substitutions run in order:
      1. surround newlines with spaces;
      2. put spaces around a single ``/ ! ? .`` sitting between two words;
      3. shorten runs of four or more ``* ! ? '`` to three;
      4. surround runs of ``* ! ? '`` with spaces;
      5. shorten a letter repeated three or more times at a word end to two;
      6. shorten a letter repeated four or more times inside a word to three;
      7. collapse multiple spaces.

    Args:
        text: Raw comment string.

    Returns:
        The cleaned string with leading/trailing whitespace removed.
    """
    substitutions = (
        (r'\n', r' \n '),
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
        (r'([*!?\']+)', r' \1 '),
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        (r'[ ]{2,}', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()


def clean2(text):
    """Reduce ``text`` to space-separated ASCII letters and digits.

    Steps, in order: remove URLs, strip HTML markup via BeautifulSoup,
    remove characters in the emoji/symbol ranges, replace every remaining
    character that is not a letter or digit with a space, collapse runs of
    spaces, and trim both ends.

    Args:
        text: Text piece to be cleaned.

    Returns:
        The cleaned string.
    """
    # Website links.
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # HTML tags: keep only the text content.
    text = BeautifulSoup(text, 'lxml').get_text()

    # Emoji and pictographic symbols.
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Special characters become spaces, then whitespace is normalized.
    text = re.sub(r"[^a-zA-Z\d]", " ", text)
    text = re.sub(' +', ' ', text)
    return text.strip()
def clean3(text):
    """Apply ``clean1`` followed by ``clean2``.

    The previous body was a verbatim copy of both functions' statements;
    delegating keeps the three cleaners from drifting apart.

    Args:
        text: Raw comment string.

    Returns:
        ``clean2(clean1(text))``.
    """
    return clean2(clean1(text))

# Registers `progress_apply` on pandas objects; required by clean_df below.
tqdm.pandas()


# Clean the text column of a data frame.
def clean_df(df,clean_type):
    """Return a copy of ``df`` whose ``text`` column went through a cleaner.

    Args:
        df: DataFrame with a ``text`` column; it is left untouched.
        clean_type: 1, 2 or 3 selects ``clean1``, ``clean2`` or ``clean3``;
            any other value returns the copy unchanged.

    Returns:
        The cleaned copy.
    """
    cleaned = df.copy()
    for code, cleaner in ((1, clean1), (2, clean2), (3, clean3)):
        if clean_type == code:
            cleaned['text'] = cleaned['text'].progress_apply(cleaner)
            break
    return cleaned
# Two cleaned variants of the jc data: clean1 (punctuation/repetition) and clean2 (alphanumeric only).
jcc1_df=clean_df(jc_df,1)
jcc2_df=clean_df(jc_df,2)
# jcc3_df=clean_df(jc_df,3)

In [2]:
def create_fold_sample(df,n_folds=3,frac_1=0.8,frac_1_factor=1.5,select_num=0,balance=False,translate=False):
    """Draw ``n_folds`` training samples from ``df``.

    Rows are "positive" unless ``y < select_num``.

    * ``balance`` falsy: each fold holds a ``frac_1`` sample of the positives
      (seeded per fold) plus an undersample of the remaining rows whose size
      is ``int(n_positive * frac_1 * frac_1_factor)``. The undersample uses a
      fixed seed, so it is the same in every fold.
    * ``balance`` truthy: each fold is a plain ``frac_1`` sample of ``df``.

    Args:
        df: DataFrame with a numeric ``y`` column.
        n_folds: Number of samples to draw.
        frac_1: Fraction of rows (or of positives) kept per fold.
        frac_1_factor: Ratio of undersampled rows to sampled positives.
        select_num: Threshold on ``y`` separating positives from the rest.
        balance: See above (bool).
        translate: When true, the positive count used for sizing the
            undersample is multiplied by 4 (bool).

    Returns:
        Dict mapping fold index to the sampled DataFrame.
    """
    # `~` is the boolean complement; unary minus on a boolean mask is rejected
    # by NumPy and only happens to work on a pandas Series.
    is_positive = ~(df['y'] < select_num)
    n_positive = int(is_positive.sum())
    if translate:
        n_positive = 4 * n_positive

    df_dict = {}
    if not balance:
        below_threshold = df[df['y'] < select_num]
        # Never request more rows than exist: sampling without replacement
        # would raise ValueError otherwise.
        n_undersample = min(int(n_positive * frac_1 * frac_1_factor), len(below_threshold))
        df_y0_undersample = below_threshold.sample(n=n_undersample, random_state=201)

    for fld in range(n_folds):
        fold_seed = 10 * (fld + 1)
        if not balance:
            positives = df[is_positive].sample(frac=frac_1, random_state=fold_seed)
            tmp_df = pd.concat([positives, df_y0_undersample])
        else:
            tmp_df = df.sample(frac=frac_1, random_state=fold_seed)
        df_dict[fld] = tmp_df

    return df_dict

# Smoke-test mode: keep only the first 400 rows of every training frame.
# The previous code sliced `jcc_df`, which is never defined; the cleaned
# frames are `jcc1_df` and `jcc2_df`.
if is_test:
    jc_df = jc_df[0:400]
    rud_df = rud_df[0:400]
    jcc1_df = jcc1_df[0:400]
    jcc2_df = jcc2_df[0:400]

In [3]:
# Build the per-dataset fold dictionaries consumed by the ensemble cell.
if is_test == True:
    # Smoke-test settings: plain 80% samples of every frame.
    jc_df_sdict = create_fold_sample(
        jc_df, n_folds=fold_num_s, frac_1=0.8, frac_1_factor=1.5,
        select_num=0.001, balance=True, translate=translate_aug)
    rud_df_sdict = create_fold_sample(
        rud_df, n_folds=fold_num_s, frac_1=0.8, frac_1_factor=1.5,
        select_num=0.5, balance=True)
    jcc1_df_sdict = create_fold_sample(
        jcc1_df, n_folds=fold_num_s, frac_1=0.8, frac_1_factor=1.5,
        select_num=0.001, balance=True)
    jcc2_df_sdict = create_fold_sample(
        jcc2_df, n_folds=fold_num_s, frac_1=0.8, frac_1_factor=1.5,
        select_num=0.001, balance=True)
elif is_test == False:
    # Full run: all positives plus an undersample for the jc frames,
    # the complete ruddit frame.
    jc_df_sdict = create_fold_sample(
        jc_df, n_folds=fold_num_s, frac_1=1, frac_1_factor=1.5,
        select_num=0.001, balance=False, translate=translate_aug)
    rud_df_sdict = create_fold_sample(
        rud_df, n_folds=fold_num_s, frac_1=1, frac_1_factor=1,
        select_num=0.5, balance=True)
    jcc1_df_sdict = create_fold_sample(
        jcc1_df, n_folds=fold_num_s, frac_1=1, frac_1_factor=1.5,
        select_num=0.001, balance=False)
    jcc2_df_sdict = create_fold_sample(
        jcc2_df, n_folds=fold_num_s, frac_1=1, frac_1_factor=1.5,
        select_num=0.001, balance=False)

    

In [None]:
from scipy.sparse import hstack
from scipy import sparse
from gensim.models import KeyedVectors, FastText

# FastText model loaded from disk; `fmodel.wv` is used by vectorizer() to embed tokens.
fmodel = FastText.load('../input/jigsaw-regression-based-data/FastText-jigsaw-256D/Jigsaw-Fasttext-Word-Embeddings-256D.bin')
def splitter(text):
    """Split ``text`` on single spaces.

    Consecutive spaces yield empty-string tokens, and an empty input yields
    ``['']``.

    Args:
        text: String to tokenize.

    Returns:
        List of tokens.
    """
    return text.split(' ')

def vectorizer(text):
    """Embed ``text`` as the mean of its tokens' FastText vectors.

    Args:
        text: String, tokenized with ``splitter``.

    Returns:
        1-D array: the token vectors from ``fmodel.wv`` averaged over axis 0.
    """
    token_vectors = fmodel.wv[splitter(text)]
    sentence_vector = np.mean(token_vectors, axis=0)
    # Flatten to 1-D (same result as reshape(1, -1) followed by squeezing axis 0).
    return sentence_vector.reshape(-1)
def text2fasttextarray(text):
    """Embed every string in ``text`` with ``vectorizer``.

    Args:
        text: Iterable of strings (e.g. a pandas Series of comments).

    Returns:
        2-D ``numpy.ndarray`` with one row per input string. A plain ndarray
        is returned instead of ``numpy.matrix``: NumPy no longer recommends
        the matrix class and recent scikit-learn versions reject it as
        estimator input.
    """
    return np.asarray([vectorizer(t) for t in text])

# RIDGE

In [None]:
import joblib
def _clean_eval_texts(df_val, df_test, clean_prm):
    """Return the (less_toxic, more_toxic, test) text Series, cleaned as ``clean_prm`` selects."""
    less_toxic = df_val['less_toxic']
    more_toxic = df_val['more_toxic']
    test_text = df_test['text']
    for code, cleaner in ((1, clean1), (2, clean2), (3, clean3)):
        if clean_prm == code:
            return less_toxic.map(cleaner), more_toxic.map(cleaner), test_text.map(cleaner)
    # Any other value: use the raw text.
    return less_toxic, more_toxic, test_text


def ridge_cv(df_dic,n_folds,model_pre="jc_k_ridge_",df_val=df_val,clean_prm=0,translate=False):
    """Fit one TF-IDF Ridge model per fold and score validation pairs and test comments.

    For every fold a character n-gram TF-IDF vectorizer and a Ridge regressor
    are fitted on the fold's ``text`` / ``y``. When the module-level flag
    ``fast_vec`` is set, a second Ridge model is fitted on FastText sentence
    vectors and its predictions, multiplied by ``fast_factor``, are added to
    the TF-IDF predictions.

    Args:
        df_dic: Mapping fold index -> training DataFrame (``text``, ``y``; plus
            ``fr_text``/``es_text``/``de_text`` when ``translate`` is true).
        n_folds: Number of folds read from ``df_dic``.
        model_pre: Name prefix for persisted models; currently unused because
            nothing is saved, kept for interface compatibility.
        df_val: Validation DataFrame with ``less_toxic`` and ``more_toxic`` columns.
        clean_prm: 1, 2 or 3 cleans the validation/test text with ``clean1``,
            ``clean2`` or ``clean3``; any other value leaves it raw.
        translate: When true, the translated columns are appended to the
            training text.

    Returns:
        Tuple ``(p1, p2, pv)``: fold-averaged predictions for the less-toxic
        comments, the more-toxic comments, and the test comments (read from
        the module-level ``df_test``).
    """
    val_preds_arr1 = np.zeros((df_val.shape[0], n_folds))
    val_preds_arr2 = np.zeros((df_val.shape[0], n_folds))
    test_preds_arr = np.zeros((df_test.shape[0], n_folds))

    # Evaluation text and its FastText embedding do not depend on the fold,
    # so both are prepared once instead of once per fold.
    text_less, text_more, text_test = _clean_eval_texts(df_val, df_test, clean_prm)
    if fast_vec:
        X_less_toxic_f = text2fasttextarray(text_less)
        X_more_toxic_f = text2fasttextarray(text_more)
        X_test_f = text2fasttextarray(text_test)

    for fld in tqdm(range(n_folds)):
        df = df_dic[fld]
        if not translate:
            text = df["text"]
            y = df["y"]
        else:
            # Rows without a French translation contribute only their original text.
            trans_df = df.dropna(axis=0, subset=["fr_text"])
            text = pd.concat([df["text"], trans_df["fr_text"], trans_df["es_text"], trans_df["de_text"]])
            y = pd.concat([df["y"], trans_df["y"], trans_df["y"], trans_df["y"]])

        vec = TfidfVectorizer(analyzer='char_wb', max_df=0.5, min_df=3, ngram_range=(3, 5))
        X = vec.fit_transform(text)
        model = Ridge(alpha=0.5)
        model.fit(X, y)

        if fast_vec:
            model_f = Ridge(alpha=0.5)
            model_f.fit(text2fasttextarray(text), y)
            val_preds_arr1[:, fld] = model_f.predict(X_less_toxic_f) * fast_factor
            val_preds_arr2[:, fld] = model_f.predict(X_more_toxic_f) * fast_factor
            test_preds_arr[:, fld] = model_f.predict(X_test_f) * fast_factor

        # TF-IDF predictions are added on top of the (optional) FastText ones.
        val_preds_arr1[:, fld] += model.predict(vec.transform(text_less))
        val_preds_arr2[:, fld] += model.predict(vec.transform(text_more))
        test_preds_arr[:, fld] += model.predict(vec.transform(text_test))

    p1 = val_preds_arr1.mean(axis=1)
    p2 = val_preds_arr2.mean(axis=1)
    pv = test_preds_arr.mean(axis=1)
    print(f'Validation Accuracy is { np.round((p1 < p2).mean() * 100,2)}')
    return p1, p2, pv

In [None]:
def lightgbm_cv(df_dic,n_folds,model_pre="jc_k_gbm_",df_val=df_val,clean_prm=False,translate=False):
    """Placeholder for a LightGBM counterpart of ``ridge_cv``.

    The signature mirrors ``ridge_cv`` so it can sit in the model dispatch
    table, but no implementation exists. The former ``return 0`` could not be
    unpacked into the ``(p1, p2, pv)`` triple callers expect and failed with
    an unrelated TypeError; the failure is now explicit.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError(
        "lightgbm_cv is a stub; select 'ridge' in model_choice or implement "
        "it to return (p1, p2, pv) like ridge_cv"
    )

In [4]:
# Prediction stores keyed by dataset prefix ("jc_s_") and by model prefix
# ("jc_s_ridge_"). defaultdict() without a factory behaves like a plain dict.
p1=defaultdict()
p2=defaultdict()
pv=defaultdict()

val_data=df_val

# Dispatch table: model name -> cross-validation function.
func_dict={"ridge":ridge_cv,"gbm":lightgbm_cv}
# func_dict.get(x)

# NOTE(review): pre_names, name2dict and fold_num are only defined when
# Fold_type == 2; any other value makes the loop below fail with NameError.
if Fold_type==2:
    pre_names=[ data_name+"s_" for data_name in data_names]
    name2dict={"jc_s_":jc_df_sdict,"rud_s_":rud_df_sdict,
               "jcc1_s_":jcc1_df_sdict,"jcc2_s_":jcc2_df_sdict}
    fold_num=fold_num_s

# Weighted sums over datasets: validation less/more-toxic scores and test scores.
p1_ensenmble = np.zeros((val_data.shape[0]))
p2_ensenmble = np.zeros((val_data.shape[0]))
score=np.zeros((df_test.shape[0]))

for d_index,pre_name in enumerate(pre_names):
    # Naming convention, e.g. for the jc data with ridge:
    #   pre_name "jc_s_" -> model_pre "jc_s_ridge_" -> model "jc_s_ridge_{fold}",
    #   vectorizer "jc_s_ridge_vec_{fold}"
    # Reset the per-dataset options before looking them up.
    clean_prm=False
    translate=False
    
    p1[pre_name],p2[pre_name]=np.zeros((val_data.shape[0])),np.zeros((val_data.shape[0]))
    pv[pre_name]=np.zeros((df_test.shape[0]))
    # Datasets listed in clean_data get the matching cleaning on validation/test text.
    if pre_name in clean_data.keys():
        clean_prm=clean_data[pre_name]

    # Accumulate the weighted predictions of every selected model for this dataset.
    for index,model_name in enumerate(model_choice):
        cv_func=func_dict.get(model_name)
        model_pre=pre_name+model_name+"_"
        p1[model_pre],p2[model_pre],pv[model_pre]=cv_func(name2dict[pre_name],n_folds=fold_num,df_val=val_data,
                                                        model_pre=model_pre,clean_prm=clean_prm,translate=translate)

        p1[pre_name]= p1[pre_name]+ p1[model_pre]*factor[index]
        p2[pre_name]= p2[pre_name]+ p2[model_pre]*factor[index]
        pv[pre_name]= pv[pre_name]+ pv[model_pre]*factor[index]

    # Divide by the largest validation prediction of this dataset before applying
    # its weight, so datasets with different score ranges can be summed.
    # NOTE(review): a zero or negative kmax would break this normalization.
    kmax=max(p1[pre_name].max(),p2[pre_name].max())
    # Save the per-dataset validation predictions next to the validation pairs.
    result=df_val.copy()
    result["less_value"]=p1[pre_name]
    result["more_value"]=p2[pre_name]
    result.to_csv(f"{pre_name}.csv")
    p1_ensenmble=p1_ensenmble+factor_data[d_index]*p1[pre_name]/kmax
    p2_ensenmble=p2_ensenmble+factor_data[d_index]*p2[pre_name]/kmax
    score=score+factor_data[d_index]*pv[pre_name]/kmax

print(f' Validation Accuracy is { np.round((p1_ensenmble < p2_ensenmble).mean() * 100,4)}') 
# Test scores of the TF-IDF / FastText ridge ensemble.
predictions2=score

# ENSEMBLE

In [None]:

from sklearn import preprocessing
# Standardize both prediction vectors with sklearn's `preprocessing.scale`
# so they are on a comparable scale before blending.
predictions1=preprocessing.scale(predictions1)
predictions2=preprocessing.scale(predictions2)

# predictions1 = (predictions1-predictions1.min())/(predictions1.max()-predictions1.min())
# predictions2=(predictions2-predictions2.min())/(predictions2.max()-predictions2.min())

In [None]:
# Weighted blend of the LUKE scores (predictions1) and the ridge scores (predictions2).
# The weights do not sum to 1; the submission cell converts the result to ranks.
predictions=0.67*predictions1+0.4*predictions2

# submission

In [None]:
# Write the blended scores into the submission template.
submission['score'] = predictions 
# Convert scores to ranks; method='first' breaks ties by order of appearance,
# so every comment receives a distinct rank.
submission['score'] = submission['score'].rank(method='first')
submission[['comment_id', 'score']].to_csv('submission.csv', index=False)