In [None]:
import io
import os
import gc
import re
import pickle
import random
import termcolor
import warnings
import shutil
from collections import Counter
from functools import partial
from datetime import datetime
from dataclasses import dataclass
from pathlib import Path
from typing import List

import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import  accuracy_score

import lightgbm as lgbm

In [None]:
!pip install -q git+https://github.com/facebookresearch/fastText.git
!pip install -q texthero

  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.4MB 12.0MB/s 
[K     |████████████████████████████████| 245kB 36.5MB/s 
[?25h  Building wheel for nltk (setup.py) ... [?25l[?25hdone


In [None]:
import fasttext
import texthero
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Combined English + French stopword vocabulary (English entries first).
stopwords_list = [
    *stopwords.words('english'),
    *stopwords.words('french'),
]

# Config

In [None]:
# Run-wide settings.
DEBUG = True
SEED = 42
NOW = datetime.now().strftime("%m%d")  # zero-padded month + day of this run

# Create the SAVE_PATH directory up front. parents=True so that a missing
# parent directory does not abort the cell with FileNotFoundError.
SAVE_PATH = Path('/content/lightning-logs/')
SAVE_PATH.mkdir(parents=True, exist_ok=True)

NUM_WORKERS = os.cpu_count()

print("DEBUG:\t", DEBUG)
print("SAVE_PATH:\t", SAVE_PATH)
print("NUM_WORKERS:\t", NUM_WORKERS)

DEBUG:	 True
SAVE_PATH:	 /content/lightning-logs
NUM_WORKERS:	 2


In [None]:
# NOTE: the Config dataclass below is commented out; no cell shown here uses it.
# @dataclass
# class Config:
#     max_seq_len:int = 192
#     num_fold:int = 5

#     lr:float = 3e-5
#     batch_size: int = 128
#     num_epoch:int = 10
#     max_grad_norm:float = 1.0
#     gradient_accumulation_steps: int = 1
#     warmup_steps: int = 0
#     weight_decay: float = 0.0
#     adam_beta1: float = 0.9
#     adam_beta2: float = 0.999
#     adam_epsilon: float= 1e-8
#     max_grad_norm: float = 1.0

#     save_top_k:int = 1

#     def __post_init__(self):
#         pass


# cfg = Config()
# cfg

# Helper

In [None]:
def save_pkl(dir, name, obj):
    """Pickle ``obj`` to the file ``dir / name``, creating ``dir`` if needed.

    Args:
        dir: Target directory (``str`` or ``pathlib.Path``). Missing parent
            directories are created as well.
        name: File name inside ``dir``.
        obj: Any picklable object.
    """
    # Coerce so plain strings work too. The parameter name shadows the builtin
    # ``dir`` but is kept so existing keyword callers keep working.
    target_dir = Path(dir)
    # parents=True: without it a nested, not-yet-existing directory raises
    # FileNotFoundError.
    target_dir.mkdir(parents=True, exist_ok=True)
    with open(target_dir / name, 'wb') as f:
        pickle.dump(obj, f)

def load_pkl(dir, name):
    """Read back and return the object pickled in the file ``dir / name``.

    ``dir`` must support the ``/`` operator (e.g. a ``pathlib.Path``).
    Only use on trusted files: unpickling can execute arbitrary code.
    """
    pickle_path = dir / name
    with open(pickle_path, 'rb') as stream:
        loaded = pickle.load(stream)
    return loaded

def set_seed(seed=42):
    """Seed the global RNGs used by this notebook.

    Exports ``PYTHONHASHSEED`` and seeds both Python's ``random`` module and
    NumPy's legacy global generator with the same value.

    Args:
        seed: Seed applied to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

In [None]:
# Apply the configured seed to every RNG that set_seed covers.
set_seed(seed=SEED)

# Load Data

In [None]:
# Read the train and test CSVs (in that order) into DataFrames.
train_df, test_df = map(
    pd.read_csv,
    ("../input/Train.csv", "../input/Test.csv"),
)

In [None]:
# Integer ids are assigned in order of first appearance of each label in train_df.
ID2LABEL = dict(enumerate(train_df['label'].unique()))
LABEL2ID = {label: idx for idx, label in ID2LABEL.items()}

# Integer-encoded copy of the label column.
train_df['label_ids'] = train_df['label'].map(LABEL2ID)

In [None]:
# Display the frame to sanity-check the text, label and label_ids columns.
train_df

Unnamed: 0,ID,text,label,label_ids
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,-1,0
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1,0
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1,0
3,U0TTYY8,ak slouma,1,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,-1,0
...,...,...,...,...
69995,ZRSR7TZ,pff bayna beli kbira f wejhakk yakhiii rouhi r...,-1,0
69996,QNQVEIH,aman lmara jeya zidou t3am9ou fel a7deeth akth...,-1,0
69997,LJ2K9MD,winha nakhtabha hhhhh,-1,0
69998,5RZ1T7I,fachel enta w houwa,-1,0


In [None]:
# Integer class ids as a plain NumPy array, aligned row-for-row with train_df.
train_targets = train_df.loc[:, 'label_ids'].values

In [None]:
# Lower-cased texts of train followed by test, stacked into a single Series.
all_texts = pd.concat(
    [frame['text'].str.lower() for frame in (train_df, test_df)]
)

In [None]:
# Strip stopwords, then normalise whitespace.
# texthero documents `stopwords` as a set; passing one also makes each token's
# membership check constant-time instead of a scan over the whole list.
all_texts = texthero.remove_stopwords(all_texts, set(stopwords_list))
all_texts = texthero.remove_whitespace(all_texts)

# Train Fasttext

In [None]:
# Dump one cleaned text per line; this file is the fastText training corpus.
# Explicit UTF-8 so the file does not depend on the platform's default encoding.
with open("/content/data.txt", "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in all_texts)

In [None]:
%%time
fattext_model = fasttext.train_unsupervised("/content/data.txt", model='skipgram', dim=300, wordNgrams=2, epoch=10)

CPU times: user 5min 45s, sys: 860 ms, total: 5min 45s
Wall time: 5min 47s


In [None]:
# Embed every text with the trained model, then stack the vectors row-wise.
sentence_vectors = []
for text in tqdm(all_texts):
    sentence_vectors.append(fattext_model.get_sentence_vector(text))
all_features = np.vstack(sentence_vectors)

HBox(children=(FloatProgress(value=0.0, max=100000.0), HTML(value='')))




In [None]:
# Rows were stacked train-first, so the first len(train_df) rows are train.
n_train = len(train_df)
train_features, test_features = all_features[:n_train], all_features[n_train:]

# Make CV folds

In [None]:
# Materialise the (train_idx, val_idx) pairs of a 5-fold stratified split on label_ids.
skf = StratifiedKFold(n_splits=5)
cv = list(skf.split(train_df, train_df['label_ids']))

In [None]:
# Keyword arguments forwarded to lgbm.LGBMClassifier for every fold.
LGBM_PARAMS = dict(
    n_estimators=10000,
    max_depth=6,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=3,
    random_state=42,
)

In [None]:
%%time
val_scores = []
val_preds = np.zeros((len(train_df), len(ID2LABEL)), dtype="float32")
test_preds = np.zeros((5, len(test_df), len(ID2LABEL)), dtype="float32")

for fold in range(5):
    print('='*30)
    print(f'======fold: {fold} start======')

    trn_idx, val_idx = cv[fold]

    trn_features, val_features = train_features[trn_idx], train_features[val_idx]
    trn_targets, val_targets = train_targets[trn_idx], train_targets[val_idx]

    model = lgbm.LGBMClassifier(**LGBM_PARAMS)
    
    model.fit(
        X=trn_features,
        y=trn_targets,
        eval_set=[(val_features, val_targets)],
        early_stopping_rounds=50,
        verbose=50,
    )

    val_pred = model.predict(val_features)
    score = accuracy_score(val_targets, val_pred)

    print(f"score {score:.4f}")

    val_preds[val_idx] = model.predict_proba(val_features)
    test_preds[fold] = model.predict_proba(test_features)

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's multi_logloss: 0.503554
[100]	valid_0's multi_logloss: 0.482186
[150]	valid_0's multi_logloss: 0.474796
[200]	valid_0's multi_logloss: 0.471721
[250]	valid_0's multi_logloss: 0.470496
[300]	valid_0's multi_logloss: 0.470539
Early stopping, best iteration is:
[258]	valid_0's multi_logloss: 0.470413
score 0.8005
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's multi_logloss: 0.508428
[100]	valid_0's multi_logloss: 0.486094
[150]	valid_0's multi_logloss: 0.480544
[200]	valid_0's multi_logloss: 0.478113
[250]	valid_0's multi_logloss: 0.477114
[300]	valid_0's multi_logloss: 0.476873
Early stopping, best iteration is:
[270]	valid_0's multi_logloss: 0.476495
score 0.7982
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's multi_logloss: 0.513115
[100]	valid_0's multi_logloss: 0.494126
[150]	valid_0's multi_logloss: 0.488353
[200]	valid_0's multi_logloss: 0.485559
[

In [None]:
# Overall out-of-fold accuracy: most probable class per row vs. the true ids.
oof_pred = np.argmax(val_preds, axis=-1)
oof_score = accuracy_score(train_targets, oof_pred)
print(f"all oof score {oof_score:.4f}")

all oof score 0.8025


In [None]:
# Persist the out-of-fold and per-fold test probabilities as pickles.
output_dir = Path("../output")
for pkl_name, preds in (
    ("93-fasttext-lightgbm-val_preds.pkl", val_preds),
    ("93-fasttext-lightgbm-test_preds.pkl", test_preds),
):
    save_pkl(output_dir, pkl_name, preds)