<a href="https://colab.research.google.com/github/bonnefco/P8/blob/main/P8_FORK_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Ensemble of models

1. feedback_deberta_large_LB0.619 / Score: 0.619
> https://www.kaggle.com/code/brandonhu0215/feedback-deberta-large-lb0-619

2. RoBerta-base Inference v2.0 / Score: 0.649
> https://www.kaggle.com/code/arvissu/roberta-base-inference-v2-0

3. Ensemble Learning 
> https://www.kaggle.com/code/renokan/fork-ensemble-deberta-roberta


#### If these notebooks are helpful, please upvote the original versions linked above.


# 1. Imports, definitions, settings and data loading

In [None]:
import gc
import os
import pickle
import glob

from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

import numpy as np
import pandas as pd

from tqdm import tqdm

import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.nn import Parameter
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModel, AutoTokenizer, AutoConfig

import warnings
warnings.simplefilter('ignore')

In [None]:
def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: substitute the failing span with its UTF-8 bytes.

    Returns the replacement bytes together with the index at which the codec
    should resume (the end of the failing span).
    """
    failing_span = error.object[error.start : error.end]
    resume_at = error.end
    return failing_span.encode("utf-8"), resume_at


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the failing byte span as cp1252 instead.

    Returns the replacement text together with the index at which the codec
    should resume (the end of the failing span).
    """
    failing_bytes = error.object[error.start : error.end]
    resume_at = error.end
    return failing_bytes.decode("cp1252"), resume_at

# Register both handlers under the names passed as `errors=` in
# resolve_encodings_and_normalize().
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)


def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed UTF-8 / cp1252 text, then pass it through ``unidecode``.

    The text is round-tripped through bytes using the two custom codec error
    handlers registered in this notebook, so spans that fail under one encoding
    are reinterpreted with the other instead of raising.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_decode = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_decode.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")

    return unidecode(repaired)


def fetch_essay(essay_id: str, txt_dir: str):
    """Read the full text of one essay from the competition directory.

    Args:
        essay_id: Essay identifier; the file read is ``<essay_id>.txt``.
        txt_dir: Name appended to ``COMP_DIR`` to form the essay folder
            (the notebook passes ``'train'`` or ``'test'``).

    Returns:
        The whole file content as one string.
    """
    essay_path = os.path.join(COMP_DIR + txt_dir, essay_id + '.txt')

    # The context manager closes the handle as soon as the text is read; the
    # bare open(...).read() left one open file per essay until garbage collection.
    with open(essay_path, 'r') as essay_file:
        essay_text = essay_file.read()

    return essay_text


def prepare_input(cfg, text, text_2=None):
    """Tokenize ``text`` (optionally paired with ``text_2``) into long tensors.

    ``cfg.tokenizer`` is called with padding to ``cfg.max_len`` and truncation
    enabled; every field of its result is then converted in place to a
    ``torch.long`` tensor and the same container is returned.
    """
    encoded = cfg.tokenizer(
        text,
        text_2,
        padding="max_length",
        add_special_tokens=True,
        max_length=cfg.max_len,
        truncation=True,
    )

    # Snapshot the keys first so the container can be updated while looping.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)

    return encoded


def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` and return probabilities.

    Args:
        test_loader: Sized iterable of batches; each batch is a mapping whose
            values are tensors.
        model: Module called as ``model(inputs)`` that returns logits.
        device: Device the model and each batch are moved to.

    Returns:
        NumPy array of the per-batch softmax outputs concatenated along axis 0.
    """
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))

    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)

        with torch.no_grad():
            output = model(inputs)
            # Name the class axis explicitly: calling softmax without `dim` is
            # deprecated and relies on an implicitly chosen axis. Both models in
            # this notebook return 2-D (batch, 3) logits, for which the last
            # axis is the one that was used implicitly.
            probabilities = F.softmax(output, dim=-1)

        preds.append(probabilities.to('cpu').numpy())

    return np.concatenate(preds)


def show_gradient(df, n_row=None):
    """Style the first rows of ``df`` with a row-wise background gradient.

    An ``all_mean`` column (the mean across each row) is appended before
    styling. ``n_row`` falls back to 5 when it is missing or falsy. The colormap
    is read from the notebook-level ``cm`` variable.
    """
    row_count = n_row or 5
    preview = df.head(row_count).assign(all_mean=lambda frame: frame.mean(axis=1))

    return preview.style.background_gradient(cmap=cm, axis=1)

In [None]:
# Display settings.
pd.set_option('display.precision', 4)
# Colormap read by show_gradient().
cm = sns.light_palette('green', as_cmap=True)
# NOTE(review): props_param and N_ROW are not referenced in the visible cells.
props_param = "color:white; font-weight:bold; background-color:green;"

N_ROW = 10

# Root folder of the competition data; the trailing slash matters because
# paths are built by string concatenation.
COMP_DIR = "../input/feedback-prize-effectiveness/"
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Paths of the test set and of the submission template.
test_path = COMP_DIR + "test.csv"
submission_path = COMP_DIR + "sample_submission.csv"

# Kept unmodified; later cells work on copies.
test_origin = pd.read_csv(test_path)
submission_origin = pd.read_csv(submission_path)

In [None]:
# Preview the first rows of the test set.
test_origin.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim


# 2. Check unidecode(text)

```
def resolve_encodings_and_normalize(text: str) -> str:
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    
    text = unidecode(text)
```

In [None]:
# Build the path from COMP_DIR, like every other data path in the notebook,
# instead of repeating the competition folder as a second literal.
data_path = COMP_DIR + "train.csv"
cols_list = ['essay_id', 'discourse_text']
# Row labels of the training rows used to check the text normalization.
idxs_list = [49, 80, 945, 947, 1870]

temp = pd.read_csv(data_path, usecols=cols_list).loc[idxs_list, :]
temp

Unnamed: 0,essay_id,discourse_text
49,0158970BC5D2,"Often times throughout middle school, my teach..."
80,01AFC67DF935,President Obama has done nothing to improve ou...
945,11B9AC1814C8,"The article says, ¨... humans have sent numero..."
947,11B9AC1814C8,"However, the article says that this can only g..."
1870,22E8627A3CB9,Many people are able to tell when a peer is an...


In [None]:
# Normalized discourse text next to the raw one, for comparison.
temp['discourse_text_UPD'] = temp['discourse_text'].apply(resolve_encodings_and_normalize)
# Full essay of each sampled row, read from the 'train' folder, then normalized.
temp['essay_text'] = temp['essay_id'].transform(fetch_essay, txt_dir='train')
temp['essay_text_UPD'] = temp['essay_text'].apply(resolve_encodings_and_normalize)

In [None]:
# Walk over the sampled rows and pick out the raw and normalized discourse text.
# NOTE(review): as written the loop only assigns local names; nothing is
# displayed or stored, and `n` is unused. Presumably a comparison/print step
# was removed; confirm whether this cell is still needed.
for n, row in enumerate(temp.iterrows()):
    indx, data = row
    disc_text = data.discourse_text
    disc_text_upd = data.discourse_text_UPD

# 3. Extract predictions

## 3.1 DeBERTa

In [None]:
class TestDataset(Dataset):
    """Dataset that tokenizes the ``text`` column of a frame on access."""

    def __init__(self, cfg, df):
        # cfg supplies the tokenizer and max_len read by prepare_input().
        self.cfg = cfg
        self.text = df['text'].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Single-segment input: the whole sample is already one string.
        return prepare_input(self.cfg, self.text[item])

class CustomModel(nn.Module):
    """Transformer backbone with a 3-way linear head and multi-sample dropout.

    ``forward`` takes the backbone output at sequence position 0, passes it
    through five dropout layers (p = 0.1 to 0.5) that share one linear head,
    and averages the five logit tensors.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build the backbone, the LSTM and the classification head.

        Args:
            cfg: Settings object; ``cfg.model`` is the model name used when no
                config file is given or when pretrained weights are requested.
            config_path: Optional path of a saved config, loaded with
                ``torch.load``.
            pretrained: When true the backbone is created with
                ``AutoModel.from_pretrained``; otherwise from the config alone.
        """
        super().__init__()
        self.cfg = cfg
        
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            # NOTE(security): torch.load unpickles the file; only use trusted files.
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        
        # Not referenced in forward(); presumably kept so that the module's
        # parameter names match the saved checkpoints (TODO confirm).
        self.bilstm = nn.LSTM(self.config.hidden_size, (self.config.hidden_size) // 2, num_layers=2, 
                              dropout=self.config.hidden_dropout_prob, batch_first=True,
                              bidirectional=True)
        
        # self.dropout = nn.Dropout(0.2)
        # Five dropout rates applied in parallel in forward().
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        
        # Single linear layer mapping the hidden size to the 3 classes.
        self.output = nn.Sequential(
            nn.Linear(self.config.hidden_size, 3)  # self.cfg.target_size
        )
                
    def _init_weights(self, module):
        """Initialize one sub-module according to its type.

        Linear and embedding weights are drawn from a normal distribution with
        std ``config.initializer_range``; biases and the padding embedding row
        are zeroed; LayerNorm is reset to weight 1 / bias 0.
        NOTE(review): not called anywhere in the visible source.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Return logits for a batch given as a mapping of backbone keyword inputs."""
        # First backbone output, restricted to sequence position 0.
        sequence_output = self.model(**inputs)[0][:, 0, :]

        # Same head applied after each dropout rate, then averaged.
        logits1 = self.output(self.dropout1(sequence_output))
        logits2 = self.output(self.dropout2(sequence_output))
        logits3 = self.output(self.dropout3(sequence_output))
        logits4 = self.output(self.dropout4(sequence_output))
        logits5 = self.output(self.dropout5(sequence_output))
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5

        return logits

In [None]:
class CFG:
    # Settings of the DeBERTa branch; attributes are read as class attributes.
    # Folder holding the saved config, tokenizer and fold checkpoints.
    path = "../input/feedback-deberta-large-051/"
    config_path = path+'config.pth'
    # Model name; also used to build the checkpoint file names.
    model = "microsoft/deberta-large"
    num_workers = 2
    batch_size = 16
    max_len = 512
    # NOTE(review): seed is not read anywhere in the visible cells.
    seed = 42
    n_fold = 4
    
# Tokenizer stored next to the checkpoints; read by prepare_input() via cfg.tokenizer.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path + 'tokenizer')

In [None]:
# Work on a copy so test_origin stays untouched.
df = test_origin.copy()
SEP = CFG.tokenizer.sep_token

# Normalize both the discourse text and the full essay text.
df['discourse_text'] = df['discourse_text'].apply(resolve_encodings_and_normalize)
df['essay_text'] = df['essay_id'].transform(fetch_essay, txt_dir='test')
df['essay_text'] = df['essay_text'].apply(resolve_encodings_and_normalize)
# Model input: "<type> <discourse text><SEP><essay text>" as one string.
df['text'] = df['discourse_type'] + ' ' + df['discourse_text'] + SEP + df['essay_text']

df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,essay_text,text
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead,Making choices in life can be very difficult. ...,Lead Making choices in life can be very diffic...
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position,Making choices in life can be very difficult. ...,Position Seeking multiple opinions can help a ...
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim,Making choices in life can be very difficult. ...,Claim it can decrease stress levels [SEP]Makin...
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim,Making choices in life can be very difficult. ...,Claim a great chance to learn something new [S...
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim,Making choices in life can be very difficult. ...,Claim can be very helpful and beneficial. [SEP...


In [None]:
# shuffle=False and drop_last=False keep one prediction per test row, in order.
test_dataset = TestDataset(CFG, df)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers,
                         pin_memory=True, drop_last=False)

In [None]:
# One probability array per fold checkpoint.
deberta_predictions = []

for fold in range(CFG.n_fold):
    # Architecture is built from the saved config; weights come from the checkpoint.
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # Checkpoint name: model name with '/' replaced by '-', plus the fold index.
    # Loaded on CPU first; inference_fn moves the model to DEVICE.
    state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, DEVICE)
    
    deberta_predictions.append(prediction)
    
    # Release the fold's model before loading the next one.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()

100%|██████████| 1/1 [00:02<00:00,  2.64s/it]
100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
100%|██████████| 1/1 [00:01<00:00,  1.43s/it]
100%|██████████| 1/1 [00:01<00:00,  1.34s/it]


In [None]:
# Split each fold's probability matrix by column:
# 0 -> ineffective, 1 -> adequate, 2 -> effective.
deb_ineffective = [fold_probs[:, 0] for fold_probs in deberta_predictions]
deb_adequate = [fold_probs[:, 1] for fold_probs in deberta_predictions]
deb_effective = [fold_probs[:, 2] for fold_probs in deberta_predictions]

In [None]:
# Turn each per-fold list into a frame with one row per sample and one column
# per fold. The names are rebound, so run this cell only once per kernel.
deb_adequate, deb_effective, deb_ineffective = (
    pd.DataFrame(per_fold).T
    for per_fold in (deb_adequate, deb_effective, deb_ineffective)
)

## 3.2 RoBERTa

In [None]:
class TestDataset(Dataset):
    """Dataset that tokenizes (discourse, essay) text pairs on access.

    Redefines the ``TestDataset`` name used by the DeBERTa section.
    """

    def __init__(self, cfg, df):
        # cfg supplies the tokenizer and max_len read by prepare_input().
        self.cfg = cfg
        self.discourse, self.essay = df['discourse'].values, df['essay'].values

    def __len__(self):
        return len(self.discourse)

    def __getitem__(self, item):
        # Two-segment input: discourse first, essay second.
        return prepare_input(self.cfg, self.discourse[item], self.essay[item])
        
class FeedBackModel(nn.Module):
    """Backbone loaded from ``model_path`` followed by a 768 -> 3 linear head.

    NOTE(review): this class is never instantiated in the visible cells;
    presumably it must be defined so the pickled fold models can be loaded.
    """

    def __init__(self, model_path):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_path)
        self.linear = nn.Linear(768, 3)

    def forward(self, inputs):
        # First backbone output, restricted to sequence position 0.
        backbone_output = self.model(**inputs)[0]
        first_token_state = backbone_output[:, 0, :]

        return self.linear(first_token_state)

In [None]:
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load this from a trusted source.
# The context manager closes the file once the object is loaded; the previous
# bare open(...) left the handle open.
with open("../input/feedback-roberta-ep1/roberta_modellist_ep2.pkl", "rb") as model_file:
    # Indexed by fold in the inference loop; each item is called as a model.
    model_list = pickle.load(model_file)

class CFG:
    # Settings of the RoBERTa branch. Redefines the CFG name used by the
    # DeBERTa section, so the earlier settings are no longer reachable.
    # Folder the tokenizer is loaded from.
    path = "../input/roberta-base/"
    n_fold = 5
    # Batch size (named `batch` here, `batch_size` in the DeBERTa settings).
    batch = 16
    max_len = 512
    num_workers = 2
    
# Read by prepare_input() via cfg.tokenizer.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path)

In [None]:
# Fresh copy of the test set; `df` is rebound here, replacing the DeBERTa frame.
df = test_origin.copy()

# First segment: lower-cased, stripped "<type> <discourse text>".
txt_sep = " "
df['discourse'] = df['discourse_type'].str.lower().str.strip() + txt_sep \
                + df['discourse_text'].str.lower().str.strip()

# Second segment: lower-cased, stripped essay text. Unlike the DeBERTa inputs,
# no call to resolve_encodings_and_normalize is made here.
df['essay'] = df['essay_id'].transform(fetch_essay, txt_dir='test').str.lower().str.strip()
df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse,essay
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead,lead making choices in life can be very diffic...,making choices in life can be very difficult. ...
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position,position seeking multiple opinions can help a ...,making choices in life can be very difficult. ...
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim,claim it can decrease stress levels,making choices in life can be very difficult. ...
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim,claim a great chance to learn something new,making choices in life can be very difficult. ...
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim,claim can be very helpful and beneficial.,making choices in life can be very difficult. ...


In [None]:
# shuffle=False and drop_last=False keep one prediction per test row, in order.
test_dataset = TestDataset(CFG, df)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch,
                         shuffle=False, num_workers=CFG.num_workers,
                         pin_memory=True, drop_last=False)

In [None]:
# One probability array per fold model.
roberta_predicts = []
for i in range(CFG.n_fold):
    model = model_list[i]

    prediction = inference_fn(test_loader, model, DEVICE)
    roberta_predicts.append(prediction)

    # inference_fn moved this model to DEVICE, and model_list still holds a
    # reference to it, so `del model` alone cannot release its memory. Drop the
    # list's reference too; otherwise every fold's model stays on the device
    # until the whole list is deleted.
    model_list[i] = None
    del model, prediction
    # Collect first so the freed tensors can actually be returned by empty_cache().
    gc.collect()
    torch.cuda.empty_cache()

del model_list
gc.collect()

100%|██████████| 1/1 [00:00<00:00,  1.39it/s]
100%|██████████| 1/1 [00:00<00:00,  1.61it/s]
100%|██████████| 1/1 [00:00<00:00,  1.62it/s]
100%|██████████| 1/1 [00:00<00:00,  1.68it/s]
100%|██████████| 1/1 [00:00<00:00,  1.64it/s]


0

In [None]:
# Split each fold's probability matrix by column:
# 0 -> ineffective, 1 -> adequate, 2 -> effective.
rob_ineffective = [fold_probs[:, 0] for fold_probs in roberta_predicts]
rob_adequate = [fold_probs[:, 1] for fold_probs in roberta_predicts]
rob_effective = [fold_probs[:, 2] for fold_probs in roberta_predicts]

In [None]:
# Turn each per-fold list into a frame with one row per sample and one column
# per fold. The names are rebound, so run this cell only once per kernel.
rob_ineffective, rob_adequate, rob_effective = (
    pd.DataFrame(per_fold).T
    for per_fold in (rob_ineffective, rob_adequate, rob_effective)
)

# 4. Create submission

In [None]:
level_names = ['deberta', 'roberta']

# One frame per class, with a top column level naming the model family and the
# fold index below it.
ineffective_, adequate_, effective_ = (
    pd.concat(list(model_pair), keys=level_names, axis=1)
    for model_pair in (
        (deb_ineffective, rob_ineffective),
        (deb_adequate, rob_adequate),
        (deb_effective, rob_effective),
    )
)

In [None]:
# Per-fold "effective" probabilities of both model families, side by side.
effective_

Unnamed: 0_level_0,deberta,deberta,deberta,deberta,roberta,roberta,roberta,roberta,roberta
Unnamed: 0_level_1,0,1,2,3,0,1,2,3,4
0,0.7356,0.6535,0.5426,0.7304,0.6763,0.3151,0.4963,0.5036,0.3095
1,0.0751,0.0679,0.1186,0.0613,0.2098,0.0775,0.1647,0.4651,0.0716
2,0.2572,0.1327,0.3419,0.1444,0.1218,0.3984,0.328,0.663,0.3686
3,0.2438,0.1749,0.3226,0.2025,0.2906,0.4764,0.4813,0.7316,0.2415
4,0.3722,0.2085,0.2839,0.3968,0.2377,0.3935,0.3081,0.7647,0.3664
5,0.6856,0.5745,0.5444,0.6297,0.6155,0.5099,0.6627,0.8036,0.5353
6,0.7861,0.6904,0.762,0.8096,0.5916,0.4909,0.7046,0.7744,0.4546
7,0.4695,0.2386,0.3393,0.2042,0.4077,0.4634,0.6043,0.8302,0.3845
8,0.6978,0.4881,0.3636,0.59,0.5397,0.5754,0.6081,0.8121,0.7404
9,0.5313,0.3137,0.4576,0.3424,0.0936,0.1487,0.194,0.2859,0.309


In [None]:
# Start from the submission template.
submission = submission_origin.copy()

# Blend weights, applied positionally to the columns built from level_names.
w_ = [.75, .25]  # ['deberta', 'roberta']
d_ = [('Ineffective', ineffective_),
      ('Adequate', adequate_),
      ('Effective', effective_)]

# For each class: average the folds of each model family, weight the two
# family means by w_, and sum them into the submission column.
# NOTE: this loop rebinds the notebook-level names `df` and `x`.
for x in d_:
    col_name, df = x
    submission[col_name] = pd.DataFrame(
        {col: df[col].mean(axis=1) for col in level_names}
    ).mul(w_).sum(axis=1)    

# Wisdom of the crowd

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
import ast
from scipy import stats

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.data.path.append('/kaggle/input/corporafolder')

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score, precision_score, recall_score, f1_score, classification_report, plot_roc_curve
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve

import joblib
import ast
import time

from sklearn.linear_model import LogisticRegression

In [None]:
# Training data for the classical models; the first line is the header row.
df_effective_args = pd.read_csv(
    '../input/train-light-csv/train_light.csv',
    encoding="utf-8",
    header=0,
)

In [None]:
lemmatizer = WordNetLemmatizer()

def create_lemmatization(list_of_words_tokenized) :
  """Lemmatize each token and keep the lemmas longer than one character."""
  lemmas = (lemmatizer.lemmatize(token) for token in list_of_words_tokenized)
  return [lemma for lemma in lemmas if len(lemma) > 1]


def create_token(string):
  """Lower-case ``string``, split it on ``\\w+`` runs and drop 1-character tokens."""
  words = nltk.RegexpTokenizer(r'\w+').tokenize(string.lower())
  return [word for word in words if len(word) > 1]

# Token list per discourse, then its lemmatized version.
df_effective_args['discourse_text_tokenized'] = df_effective_args['discourse_text'].apply(create_token)
df_effective_args['discourse_text_lemmatized'] = df_effective_args['discourse_text_tokenized'].apply(create_lemmatization)



In [None]:
# Flatten every lemmatized token list into one list of words.
list_all_words = [j for i in df_effective_args['discourse_text_lemmatized'].tolist() for j in i]
# Frequency of each word over the whole corpus.
dictionnaire_des_frequences = nltk.FreqDist(list_all_words)

In [None]:
# Number of most frequent corpus words treated as additional stop words.
nombre_de_mot_selectionne = 25
# most_common() is evaluated once (it used to be recomputed on every loop
# iteration). Iterating over its result also works when the corpus has fewer
# distinct words than requested, where indexing raised IndexError.
stop_words_corpus = [
  mot for mot, _ in dictionnaire_des_frequences.most_common(nombre_de_mot_selectionne)
]
stop_words_nltk = list(set(stopwords.words('english')))
# Union of the NLTK English stop words and the corpus-specific ones.
stop_words_merged = list(set(stop_words_nltk+stop_words_corpus))

In [None]:
def delete_stop_words_in_corpus(x, list_of_stop_words = stop_words_merged) :
  """Return a new list with the tokens of ``x`` that are not stop words.

  Args:
    x: Tokens of one text; it is not modified.
    list_of_stop_words: Words to remove (every occurrence is dropped).

  Returns:
    The remaining tokens, in their original order.
  """
  # A set gives constant-time lookups; the previous `while element in y:
  # y.remove(element)` loop rescanned the token list for every stop word.
  stop_set = set(list_of_stop_words)
  return [token for token in x if token not in stop_set]

df_effective_args['no_stop_words'] = df_effective_args['discourse_text_lemmatized'].transform(delete_stop_words_in_corpus)

# Flattened word list after stop-word removal (rebinds list_all_words).
list_all_words = [j for i in df_effective_args['no_stop_words'].tolist() for j in i]

In [None]:
def delete_y_words(nb_words_to_save, liste_words):
  """Return the ``nb_words_to_save`` most frequent words of ``liste_words``.

  Despite its name, the function returns the words to keep, most frequent first.
  If there are fewer distinct words than requested, all of them are returned.
  """
  frequences = nltk.FreqDist(liste_words)
  # most_common() yields at most nb_words_to_save pairs; iterating over them
  # avoids the IndexError raised by indexing up to nb_words_to_save when the
  # vocabulary is smaller.
  return [mot for mot, _ in frequences.most_common(nb_words_to_save)]

# Vocabulary size kept for the classical models.
nb_words_to_save = 2000
liste_words_to_keep = delete_y_words(nb_words_to_save, list_all_words)

In [None]:
def delete_if_not_in(liste, liste_words_to_keep = liste_words_to_keep):
  """Return the tokens of ``liste`` that belong to ``liste_words_to_keep``.

  Order and duplicates of ``liste`` are preserved.
  """
  # Look words up in a set: testing membership against the list scanned up to
  # its full length for every token.
  if isinstance(liste_words_to_keep, (set, frozenset)):
    words_to_keep = liste_words_to_keep
  else:
    words_to_keep = set(liste_words_to_keep)

  return [mot for mot in liste if mot in words_to_keep]

df_effective_args['only_'+str(nb_words_to_save)+'_tokens'] = df_effective_args['no_stop_words'].apply(delete_if_not_in)

In [None]:
def counter_len_in_text(string):
  """Return ``len(string)``; applied below to token lists, so it counts tokens."""
  return len(string)

# Number of kept tokens per discourse.
df_effective_args['Longueur_texte'] = df_effective_args['only_'+str(nb_words_to_save)+'_tokens'].apply(counter_len_in_text)

In [None]:
# Keep discourses with fewer than 150 kept tokens. The explicit .copy() makes
# the result an independent frame, so adding columns to it later does not
# operate on a slice of the unfiltered data (SettingWithCopyWarning).
df_effective_args = df_effective_args[df_effective_args['Longueur_texte']<150].copy()

In [None]:
# Prepend the discourse type, as a one-item list, to the kept tokens.
df_effective_args['type_and_tokens'] = df_effective_args['discourse_type'].map(lambda i: [i]) + df_effective_args['only_'+str(nb_words_to_save)+'_tokens']

In [None]:
# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(df_effective_args['type_and_tokens'], df_effective_args['discourse_effectiveness'], test_size=0.2, random_state=1)

In [None]:
list_words = X_train.tolist()
# Join each token list back into one space-separated document.
list_corpus = [' '.join(element) for element in list_words]
tfidftvecto =  TfidfVectorizer()
# fit() returns the vectorizer itself, so tfidf_model is the fitted vectorizer.
tfidf_model = tfidftvecto.fit(list_corpus)
tfidf_tokens = tfidftvecto.get_feature_names_out()
# The vectorizer is already fitted: transform() is enough. fit_transform()
# learned the vocabulary and IDF weights a second time on the same corpus.
X_train_encoded = pd.DataFrame(data = tfidf_model.transform(list_corpus).toarray(),columns = tfidf_tokens)

In [None]:
list_words_test = X_test.tolist()
# One space-separated document per hold-out sample, encoded with the
# vectorizer fitted on the training split.
list_corpus_test = [' '.join(tokens) for tokens in list_words_test]
X_test_encoded = pd.DataFrame(
  data = tfidf_model.transform(list_corpus_test).toarray(),
  columns = tfidf_tokens,
)

In [None]:
# One indicator column per class, learned from the training labels.
lb = LabelBinarizer()
df_labels_train = pd.DataFrame(lb.fit_transform(y_train), columns=lb.classes_)
# Encode the hold-out labels with the binarizer fitted on the training labels.
# Refitting on y_test could change lb.classes_ (and the column layout) if the
# two splits did not contain the same classes, and lb is reused later to name
# the predicted-probability columns.
df_labels_test = pd.DataFrame(lb.transform(y_test), columns=lb.classes_)

In [None]:
# Integer and float columns of the encoded training frame, fed to the scaler.
numerical_features = list(X_train_encoded.select_dtypes(['int']).columns)+list(X_train_encoded.select_dtypes(['float']).columns)
numerical_pipeline = make_pipeline(StandardScaler())

# Fixed seed (same value as the train/test split) so the fitted forest, its
# report and the blended probabilities are reproducible across runs.
R_F = RandomForestClassifier(random_state=1)
# One binary classifier per label column.
multilabel_classifier = OneVsRestClassifier(R_F)
preprocessor = make_column_transformer((numerical_pipeline, numerical_features))
model_R_F = make_pipeline(preprocessor, multilabel_classifier)

start_time_fit = time.time()
model_R_F.fit(X_train_encoded, df_labels_train)

elapsed_time_fit = time.time() - start_time_fit #time

start_time_predict = time.time()
df_labels_pred = model_R_F.predict(X_test_encoded)

elapsed_time_predict = time.time() - start_time_predict #time
print(elapsed_time_fit, elapsed_time_predict) #time

print(classification_report(df_labels_test, df_labels_pred))

263.056866645813 2.595913887023926
              precision    recall  f1-score   support

           0       0.66      0.82      0.73      4161
           1       0.74      0.34      0.47      1882
           2       0.59      0.11      0.18      1280

   micro avg       0.67      0.57      0.62      7323
   macro avg       0.66      0.42      0.46      7323
weighted avg       0.67      0.57      0.57      7323
 samples avg       0.57      0.57      0.57      7323



In [None]:
# Integer and float columns of the encoded training frame, fed to the scaler.
numerical_features = list(X_train_encoded.select_dtypes(['int']).columns)+list(X_train_encoded.select_dtypes(['float']).columns)
numerical_pipeline = make_pipeline(StandardScaler())

# Fixed seed (same value as the train/test split) so the fitted model, its
# report and the blended probabilities are reproducible across runs.
GB_C = GradientBoostingClassifier(random_state=1)
# One binary classifier per label column.
multilabel_classifier = OneVsRestClassifier(GB_C)
preprocessor = make_column_transformer((numerical_pipeline, numerical_features))
model_GB_C = make_pipeline(preprocessor, multilabel_classifier)

start_time_fit = time.time()
model_GB_C.fit(X_train_encoded, df_labels_train)

elapsed_time_fit = time.time() - start_time_fit #time

start_time_predict = time.time()
df_labels_pred = model_GB_C.predict(X_test_encoded)

elapsed_time_predict = time.time() - start_time_predict #time
print(elapsed_time_fit, elapsed_time_predict) #time

print(classification_report(df_labels_test, df_labels_pred))

711.1832120418549 0.441908597946167
              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4161
           1       0.79      0.27      0.40      1882
           2       0.71      0.06      0.10      1280

   micro avg       0.64      0.59      0.62      7323
   macro avg       0.71      0.41      0.41      7323
weighted avg       0.68      0.59      0.54      7323
 samples avg       0.59      0.59      0.59      7323



In [None]:
# Integer and float columns of the encoded training frame, fed to the scaler.
numerical_features = list(X_train_encoded.select_dtypes(['int']).columns)+list(X_train_encoded.select_dtypes(['float']).columns)
numerical_pipeline = make_pipeline(StandardScaler())

# Same pipeline as the two previous cells, with logistic regression as the
# per-label binary classifier.
L_R = LogisticRegression(max_iter= 1500)
multilabel_classifier = OneVsRestClassifier(L_R)
preprocessor = make_column_transformer((numerical_pipeline, numerical_features))
model_L_R = make_pipeline(preprocessor, multilabel_classifier)

start_time_fit = time.time()
model_L_R.fit(X_train_encoded, df_labels_train)

elapsed_time_fit = time.time() - start_time_fit #time

start_time_predict = time.time()
df_labels_pred = model_L_R.predict(X_test_encoded)

elapsed_time_predict = time.time() - start_time_predict #time
# Fit and predict durations, in seconds.
print(elapsed_time_fit, elapsed_time_predict) #time

print(classification_report(df_labels_test, df_labels_pred))

50.16052508354187 0.45408129692077637
              precision    recall  f1-score   support

           0       0.65      0.76      0.70      4161
           1       0.67      0.49      0.57      1882
           2       0.46      0.23      0.31      1280

   micro avg       0.64      0.60      0.62      7323
   macro avg       0.59      0.49      0.53      7323
weighted avg       0.62      0.60      0.60      7323
 samples avg       0.57      0.60      0.58      7323



In [None]:
def pipeline_from_dataset_to_tokens(df, nb_words_to_save):
  """Apply the training-time text preparation to ``df`` and return it.

  Columns are added to ``df`` in place: tokens, lemmas, tokens without stop
  words, tokens restricted to the kept vocabulary, their count, and the
  discourse type prepended to those tokens. ``nb_words_to_save`` only names the
  kept-vocabulary column. The frame's shape is printed before returning.
  Unlike the training data, no length filter is applied.
  """
  kept_tokens_col = 'only_'+str(nb_words_to_save)+'_tokens'

  df['discourse_text_tokenized'] = df['discourse_text'].apply(create_token)
  df['discourse_text_lemmatized'] = df['discourse_text_tokenized'].apply(create_lemmatization)
  df['no_stop_words'] = df['discourse_text_lemmatized'].transform(delete_stop_words_in_corpus)
  df[kept_tokens_col] = df['no_stop_words'].apply(delete_if_not_in)
  df['Longueur_texte'] = df[kept_tokens_col].apply(counter_len_in_text)

  type_as_list = df['discourse_type'].map(lambda discourse_type: [discourse_type])
  df['type_and_tokens'] = type_as_list + df[kept_tokens_col]

  print(df.shape)
  return df

In [None]:
# Test set, read again from disk for the classical models.
df_needed_to_predict = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")
# The pipeline adds its columns to df_needed_to_predict in place and returns it.
df_needed_to_predict_take_token = pipeline_from_dataset_to_tokens(df_needed_to_predict, nb_words_to_save = nb_words_to_save)
X_to_predict = df_needed_to_predict_take_token['type_and_tokens']

(10, 10)


In [None]:
list_words_test = X_to_predict.tolist()
# One space-separated document per test row, encoded with the vectorizer
# fitted on the training split. Rebinds list_words_test / list_corpus_test.
list_corpus_test = [' '.join(tokens) for tokens in list_words_test]
X_to_predict_encoded = pd.DataFrame(
  data = tfidf_model.transform(list_corpus_test).toarray(),
  columns = tfidf_tokens,
)

In [None]:
def _normalize_rows(proba):
  """Scale each row of ``proba`` to sum to 1; all-zero rows become uniform."""
  row_sums = proba.sum(axis=1, keepdims=True)
  has_mass = row_sums > 0
  # Divide by 1 where the sum is 0 so no NaN/inf is produced; those rows are
  # replaced by the uniform distribution just below.
  safe_sums = np.where(has_mass, row_sums, 1.0)
  return np.where(has_mass, proba / safe_sums, 1.0 / proba.shape[1])

# The one-vs-rest probabilities of a row need not sum to 1, so each model's
# output is normalized row-wise before averaging. A row where every binary
# classifier returns 0 would otherwise divide by zero and put NaN in the
# submission.
proba_R_F = model_R_F.predict_proba(X_to_predict_encoded)
proba_R_F_normalized = _normalize_rows(proba_R_F)

proba_L_R = model_L_R.predict_proba(X_to_predict_encoded)
proba_L_R_normalized = _normalize_rows(proba_L_R)

proba_GB = model_GB_C.predict_proba(X_to_predict_encoded)
proba_GB_normalized = _normalize_rows(proba_GB)

# Unweighted mean of the three classical models.
combinaison_all = (proba_GB_normalized + proba_L_R_normalized + proba_R_F_normalized)/3

In [None]:
# Name the probability columns after the binarizer's classes.
df_predicted = pd.DataFrame(combinaison_all, columns = list(lb.classes_))
df_predicted['discourse_id'] = df_needed_to_predict['discourse_id']
# Reorder to: id, Ineffective, Adequate, Effective.
df_predicted = df_predicted[['discourse_id' ,'Ineffective', 'Adequate', 'Effective']]

Weighting: scale the classical-model probabilities before blending them into the submission

In [None]:
# Scale the classical-model probabilities by 0.05.
# NOTE(review): this overwrites df_predicted in place, so re-running the cell
# applies the factor again; run it once per kernel.
df_predicted[['Ineffective', 'Adequate', 'Effective']] = df_predicted[['Ineffective', 'Adequate', 'Effective']]*0.05

In [None]:
# Add the weighted classical probabilities to the transformer blend, aligned on
# the row index of both frames.
# NOTE(review): the blend weights are 0.75 + 0.25, so adding a 0.05-weighted
# term makes rows sum to about 1.05; confirm the scorer rescales rows.
# The cell also overwrites `submission`, so re-running it adds the term again.
submission[['Ineffective', 'Adequate', 'Effective']] = df_predicted[['Ineffective', 'Adequate', 'Effective']] + submission[['Ineffective', 'Adequate', 'Effective']]

Submission

In [None]:
# Write the final blend without the index column.
submission.to_csv('submission.csv',index=False)