In [None]:
import os

# Every relative path in this notebook ("../input", "../output") is resolved
# from this directory. The default is the original Colab/Drive location; set the
# ZINDI_NOTEBOOK_DIR environment variable to run the notebook from another checkout
# without editing the cell.
NOTEBOOK_DIR = os.environ.get(
    "ZINDI_NOTEBOOK_DIR",
    "/content/drive/MyDrive/Colab Notebooks/zindi/sentiment-analysis-for-tunisian/notebook",
)
os.chdir(NOTEBOOK_DIR)

In [None]:
import io
import re
import os
import gc
import pickle
import random
import termcolor
import warnings
import shutil
import math
from functools import partial
from datetime import datetime
from dataclasses import dataclass
from pathlib import Path
from typing import List

from sklearn.metrics import accuracy_score
from sklearn.linear_model import  LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

import pandas as pd
import numpy as np

import lightgbm as lgbm

In [None]:
# Raw competition tables, read relative to the notebook's working directory.
train_df = pd.read_csv("../input/Train.csv")
test_df = pd.read_csv("../input/Test.csv")

# Integer ids are handed out in the order labels first appear in the train set;
# the two dicts are exact inverses of each other.
ID2LABEL = dict(enumerate(train_df['label'].unique()))
LABEL2ID = {label: idx for idx, label in ID2LABEL.items()}

# Encoded targets, kept both as a column and as a plain array for fold indexing.
train_df['label_ids'] = train_df['label'].map(LABEL2ID)
train_targets = train_df['label_ids'].to_numpy()

In [None]:
def save_pkl(dir, name, obj):
    """Pickle ``obj`` to ``dir / name``, creating ``dir`` if necessary.

    Args:
        dir: Target directory, as a ``pathlib.Path`` or a plain string.
        name: File name inside ``dir``.
        obj: Any picklable object.
    """
    # Coerce so that string paths work with the `/` operator below.
    dir = Path(dir)
    # parents=True: a missing intermediate directory must not abort the save.
    dir.mkdir(parents=True, exist_ok=True)
    with open(dir / name, 'wb') as f:
        pickle.dump(obj, f)

def load_pkl(dir, name):
    """Load and return the pickled object stored at ``dir / name``.

    Args:
        dir: Directory containing the file, as a ``pathlib.Path`` or a string.
        name: File name inside ``dir``.

    Returns:
        Whatever object was pickled into the file.

    Note:
        Unpickling can execute arbitrary code; only point this at files
        produced by your own runs.
    """
    # Coerce so that string paths are accepted, mirroring save_pkl's inputs.
    with open(Path(dir) / name, 'rb') as f:
        return pickle.load(f)

In [None]:
# Test-set predictions of every base model, all read from the same directory.
output_dir = Path("../output")

# These five pickles are averaged over their leading axis right after loading.
arbic_bert_test_preds = load_pkl(output_dir, "29-arabic-base-domain-adapt_test_preds.pkl").mean(axis=0)
bert_test_preds = load_pkl(output_dir, "28-bert-base-domain-adapt_test_preds.pkl").mean(axis=0)
lstm_test_preds = load_pkl(output_dir, "91-sentencepiece-lstm_test_preds.pkl").mean(axis=0)
fasttext_test_preds = load_pkl(output_dir, "93-fasttext-lightgbm-test_preds.pkl").mean(axis=0)
catboos_test_preds = load_pkl(output_dir, "94-catboost-test_preds.pkl").mean(axis=0)

# Used as stored, without averaging.
# NOTE(review): presumably already reduced over folds upstream - confirm.
multinomial_test_preds = load_pkl(output_dir, "95-tfidf_test-pred.pkl")

In [None]:
# Validation predictions of every base model, loaded as stored (no reduction).
output_dir = Path("../output")

arbic_bert_val_preds = load_pkl(output_dir, "29-arabic-base-domain-adapt_val_preds.pkl")
bert_val_preds = load_pkl(output_dir, "28-bert-base-domain-adapt_val_preds.pkl")
lstm_val_preds = load_pkl(output_dir, "91-sentencepiece-lstm_val_preds.pkl")
fasttext_val_preds = load_pkl(output_dir, "93-fasttext-lightgbm-val_preds.pkl")
catboost_val_preds = load_pkl(output_dir, "94-catboost-val_preds.pkl")
multinomial_val_preds = load_pkl(output_dir, "95-tfidf_val-pred.pkl")

In [None]:
# Accuracy of each base model's validation predictions against the train labels,
# printed one per line in this fixed order.
for base_val_preds in (
    arbic_bert_val_preds,
    bert_val_preds,
    lstm_val_preds,
    fasttext_val_preds,
    catboost_val_preds,
    multinomial_val_preds,
):
    print(accuracy_score(train_df['label_ids'], np.argmax(base_val_preds, axis=-1)))

0.8099285714285714
0.8055428571428571
0.7993428571428571
0.8025142857142857
0.7893428571428571
0.7951142857142857


In [None]:
# Character count of each training text, stored as a new column for the
# long-text analysis in the following cells.
train_df['length'] = train_df['text'].str.len()

In [None]:
# Display the training rows whose text is longer than 500 characters.
train_df[train_df['length']>500]

Unnamed: 0,ID,text,label,label_ids,length
19,UE0MZZ9,siédet raéies el joumhouria ama ba3ed ena cheb...,-1,0,520
39,A0FK974,samhini ya baia kif nekteb cava kif nab3etnal9...,-1,0,1321
437,ACB5U9Q,brabi n7eb ngoul 7aja barek lilli ya7kiw 3ala ...,1,1,605
604,LYB0JZU,oui el bled temchi en cas yssaybouha el fousse...,-1,0,511
726,7Z63IRB,sélem alaykom brabi eni mowatén men soussa mch...,0,2,968
...,...,...,...,...,...
68229,HWEZ6F3,syassiyn mta3 tounis mafikom 7ata wa7ed sada9 ...,-1,0,920
68560,VNBRG21,9adech fisa3 tet9an3o awel haja houa mayefheme...,-1,0,517
68931,FX8VSHA,chbihhh hava mrith wala chbih ma3ach t9aren za...,-1,0,680
69823,XRWKW1E,ena de ma part berjouleya ma 3jebtnich fiha sl...,-1,0,553


In [None]:
# Label distribution among the texts longer than 500 characters.
train_df[train_df['length']>500]['label'].value_counts()

-1    196
 1     68
 0      5
Name: label, dtype: int64

In [None]:
# Row POSITIONS (not index labels) of texts longer than 500 characters.
# Downstream cells use this list with `.iloc` and to index plain NumPy arrays,
# both of which are positional; taking labels from `.index` is only correct
# while train_df keeps its default 0..n-1 index.
long_text_index = np.flatnonzero((train_df['length'] > 500).to_numpy()).tolist()

In [None]:
# Sanity check: encoded-target counts for the long-text rows, to compare with
# the raw-label counts shown by the value_counts cell.
np.unique(train_targets[long_text_index], return_counts=True)

(array([0, 1, 2]), array([196,  68,   5]))

In [None]:
# Accuracy of each base model restricted to the long-text rows,
# printed one per line in this fixed order.
long_text_labels = train_df.iloc[long_text_index]['label_ids']
for base_val_preds in (
    arbic_bert_val_preds,
    bert_val_preds,
    lstm_val_preds,
    fasttext_val_preds,
    catboost_val_preds,
    multinomial_val_preds,
):
    print(accuracy_score(long_text_labels, np.argmax(base_val_preds[long_text_index], axis=-1)))

0.7732342007434945
0.7546468401486989
0.7509293680297398
0.7397769516728625
0.7397769516728625
0.7360594795539034


In [None]:
# Stack the base models' validation predictions along a new leading axis
# (one entry per model) so they can be averaged element-wise.
base_val_preds = (
    arbic_bert_val_preds,
    bert_val_preds,
    lstm_val_preds,
    fasttext_val_preds,
    catboost_val_preds,
    multinomial_val_preds,
)
simple_mean_stack = np.stack(base_val_preds, axis=0)

In [None]:
# Baseline ensemble: unweighted mean over the model axis, then arg-max per row.
simple_mean_preds = simple_mean_stack.mean(axis=0)
print(accuracy_score(train_df['label_ids'], simple_mean_preds.argmax(axis=-1)))

0.8410428571428571


In [None]:
# Level-2 feature matrices: the base models' predictions placed side by side.
# The model order MUST be identical for train and test, because the stackers
# learn one weight/split per column position.
train_features = np.hstack([
    arbic_bert_val_preds,
    bert_val_preds,
    lstm_val_preds,
    fasttext_val_preds,
    catboost_val_preds,
    multinomial_val_preds,
])

test_features = np.hstack([
    arbic_bert_test_preds,
    bert_test_preds,
    lstm_test_preds,
    fasttext_test_preds,
    catboos_test_preds,
    multinomial_test_preds,
])

# Fail here, rather than deep inside model fitting, if a prediction file has an
# unexpected shape or a base model was added to only one of the two lists.
assert train_features.shape[0] == len(train_df), "train features are not row-aligned with train_df"
assert test_features.shape[0] == len(test_df), "test features are not row-aligned with test_df"
assert train_features.shape[1] == test_features.shape[1], "train/test feature column counts differ"

In [None]:
# Materialise the (train_idx, val_idx) pairs once so both stackers below are
# trained and evaluated on exactly the same folds. No shuffle argument is passed.
splitter = StratifiedKFold(n_splits=5)
cv = list(splitter.split(train_df, train_df['label_ids']))

In [None]:
# Level-2 stacker #1: logistic regression trained on the base-model predictions.
# For every fold the model is fit on the training part, scored on the held-out
# part, and used to predict the held-out rows and the full test set.
n_folds = len(cv)  # derive from the splits instead of repeating a literal

val_scores = []  # per-fold accuracy, one entry appended per fold
val_linear_preds = np.zeros((len(train_df), len(ID2LABEL)), dtype="float32")
test_linear_preds = np.zeros((n_folds, len(test_df), len(ID2LABEL)), dtype="float32")

for fold, (trn_idx, val_idx) in enumerate(cv):
    print('='*30)
    print(f'======fold: {fold} start======')

    trn_features, val_features = train_features[trn_idx], train_features[val_idx]
    trn_targets, val_targets = train_targets[trn_idx], train_targets[val_idx]

    model = LogisticRegression(max_iter=1000, C=0.1)

    model.fit(X=trn_features, y=trn_targets)

    val_pred = model.predict(val_features)

    score = accuracy_score(val_targets, val_pred)
    val_scores.append(score)
    print(f"score {score:.4f}")

    # Held-out rows are written exactly once across folds -> out-of-fold matrix.
    val_linear_preds[val_idx] = model.predict_proba(val_features)
    # One test prediction per fold; averaged over the fold axis in a later cell.
    test_linear_preds[fold] = model.predict_proba(test_features)

print(f"all oof score {accuracy_score(train_targets, np.argmax(val_linear_preds, axis=-1)):.4f}")

score 0.8461
score 0.8426
score 0.8362
score 0.8485
score 0.8496
all oof score 0.8446


In [None]:
# Average the per-fold test predictions into a single (n_test, n_labels) matrix.
# The ndim guard makes the cell idempotent: without it, running the cell a
# second time would average over the SAMPLE axis and silently destroy the
# predictions.
if test_linear_preds.ndim == 3:
    test_linear_preds = test_linear_preds.mean(axis=0)

In [None]:
# Level-2 stacker #2: LightGBM trained on the base-model predictions, using the
# same folds as the logistic-regression stacker.
n_folds = len(cv)  # derive from the splits instead of repeating a literal

val_scores = []  # per-fold accuracy, one entry appended per fold
val_tree_preds = np.zeros((len(train_df), len(ID2LABEL)), dtype="float32")
test_tree_preds = np.zeros((n_folds, len(test_df), len(ID2LABEL)), dtype="float32")

for fold, (trn_idx, val_idx) in enumerate(cv):
    print('='*30)
    print(f'======fold: {fold} start======')

    trn_features, val_features = train_features[trn_idx], train_features[val_idx]
    trn_targets, val_targets = train_targets[trn_idx], train_targets[val_idx]

    # Large tree budget; the effective number of trees is set by early stopping.
    model = lgbm.LGBMClassifier(n_estimators=10000)

    # NOTE(review): the held-out fold is also the early-stopping set, so the
    # fold/OOF scores printed below are slightly optimistic.
    # NOTE(review): `early_stopping_rounds` and `verbose` were removed from
    # `fit` in LightGBM 4.0; on newer versions pass
    # callbacks=[lgbm.early_stopping(50), lgbm.log_evaluation(50)] instead.
    model.fit(
        X=trn_features,
        y=trn_targets,
        eval_set=[(val_features, val_targets)],
        early_stopping_rounds=50,
        verbose=50,
    )

    val_pred = model.predict(val_features)
    score = accuracy_score(val_targets, val_pred)
    val_scores.append(score)

    print(f"score {score:.4f}")
    # Held-out rows are written exactly once across folds -> out-of-fold matrix.
    val_tree_preds[val_idx] = model.predict_proba(val_features)
    # One test prediction per fold; averaged over the fold axis in a later cell.
    test_tree_preds[fold] = model.predict_proba(test_features)

print(f"all oof score {accuracy_score(train_targets, np.argmax(val_tree_preds, axis=-1)):.4f}")

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's multi_logloss: 0.380855
[100]	valid_0's multi_logloss: 0.379462
[150]	valid_0's multi_logloss: 0.379772
Early stopping, best iteration is:
[120]	valid_0's multi_logloss: 0.379281
score 0.8474
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's multi_logloss: 0.387332
[100]	valid_0's multi_logloss: 0.388121
Early stopping, best iteration is:
[72]	valid_0's multi_logloss: 0.387116
score 0.8410
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's multi_logloss: 0.399725
[100]	valid_0's multi_logloss: 0.40014
Early stopping, best iteration is:
[68]	valid_0's multi_logloss: 0.399193
score 0.8357
Training until validation scores don't improve for 50 rounds.
[50]	valid_0's multi_logloss: 0.375166
[100]	valid_0's multi_logloss: 0.373549
[150]	valid_0's multi_logloss: 0.374042
Early stopping, best iteration is:
[115]	valid_0's multi_logloss: 0.373515
score 0.8500
Trainin

In [None]:
# Average the per-fold test predictions into a single (n_test, n_labels) matrix.
# The ndim guard makes the cell idempotent: without it, running the cell a
# second time would average over the SAMPLE axis and silently destroy the
# predictions.
if test_tree_preds.ndim == 3:
    test_tree_preds = test_tree_preds.mean(axis=0)

In [None]:
# Equal-weight blend of the two stackers' out-of-fold probability matrices.
val_preds = (val_linear_preds + val_tree_preds) / 2

In [None]:
# Accuracy of the blended predictions over the whole training set.
print(f"all oof score {accuracy_score(train_targets, np.argmax(val_preds, axis=-1)):.4f}")

all oof score 0.8455


In [None]:
# Same equal-weight blend as for the validation predictions, applied to the
# two stackers' test predictions.
test_preds = (test_linear_preds + test_tree_preds) / 2

In [None]:
# Read IDs as text so purely numeric IDs are not parsed as integers (which would
# drop leading zeros and make the string padding below fail).
sub_df = pd.read_csv('../input/SampleSubmission.csv', dtype={'ID': str})
# Left-pad every ID with '0' up to 7 characters (vectorised string accessor).
sub_df['ID'] = sub_df['ID'].str.rjust(7, '0')
# Predicted class id per test row; translated to the original labels in the next cell.
# NOTE(review): assumes test_preds rows are in SampleSubmission order - confirm.
sub_df['label'] = np.argmax(test_preds, axis=-1)

In [None]:
# Translate predicted class ids back to the original labels.
# Recomputed from test_preds rather than remapping the column in place: an
# in-place remap is not idempotent, and re-running it would push already
# translated labels through ID2LABEL again, yielding wrong labels or NaN.
predicted_ids = pd.Series(np.argmax(test_preds, axis=-1))
sub_df['label'] = predicted_ids.map(ID2LABEL).to_numpy()

In [None]:
# Distribution of predicted labels in the submission, as a quick sanity check.
sub_df['label'].value_counts()

-1    14896
 1    14863
 0      241
Name: label, dtype: int64

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
sub_df.to_csv('../output/00-09-ensemble.csv', index=False)