In [1]:
import os

import pandas as pd

from sklearn.dummy import DummyClassifier
from utils import save_submission

In [2]:
DATA_PATH = './data'
train_file = 'train.csv'
test_file = 'test.csv'

trn = pd.read_csv(os.path.join(DATA_PATH, train_file))
tst = pd.read_csv(os.path.join(DATA_PATH, test_file))

print(f'\n\nTRAIN INFO:\n')
trn.info()
print(f'\n\nTEST INFO:\n')
tst.info()



TRAIN INFO:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   comment_id      3817 non-null   int64 
 1   sentence_pos    3817 non-null   int64 
 2   reply_to        3817 non-null   int64 
 3   sentence        3817 non-null   object
 4   racial_target   3817 non-null   int64 
 5   other_target    3817 non-null   int64 
 6   implicit        3817 non-null   int64 
 7   stereotype      3817 non-null   int64 
 8   xenophobia      3817 non-null   int64 
 9   suffering       3817 non-null   int64 
 10  economic        3817 non-null   int64 
 11  migration       3817 non-null   int64 
 12  culture         3817 non-null   int64 
 13  benefits        3817 non-null   int64 
 14  health          3817 non-null   int64 
 15  security        3817 non-null   int64 
 16  dehumanisation  3817 non-null   int64 
 17  others          3817 non-null   int64

## TASK 1: Stereotype detection (binary classification)

In [3]:
target_col = 'stereotype'
feat_cols = 'sentence'
results = tst[['comment_id', 'sentence_pos']]

cls = DummyClassifier(strategy='stratified')
cls.fit(trn[feat_cols].values, trn[target_col].values)
results[target_col] = cls.predict(tst[feat_cols]).tolist()

save_submission(results_df=results,
                team_name='example-team',
                task_number=1,
                results_path='./results/',
                attempt=2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results[target_col] = cls.predict(tst[feat_cols]).tolist()


## TASK 2: Stereotype hierarchical classification (binary + multi-label classification)

In [4]:
t1_target_col = 'stereotype'
t2_target_cols = ['xenophobia', 'suffering', 'economic', 'migration', 'culture',
                  'benefits', 'health', 'security', 'dehumanisation', 'others']
feat_cols = 'sentence'
results = tst[['comment_id', 'sentence_pos']]

cls = DummyClassifier(strategy='stratified')
cls.fit(trn[feat_cols].values, trn[t1_target_col].values)
results[t1_target_col] = cls.predict(tst[feat_cols]).tolist()

t2_trn = trn[trn[t1_target_col] == 1]
tst_mask = results[t1_target_col].astype(bool)
t2_tst = tst[tst_mask]
for c in t2_target_cols:
    results[c] = 0
    cls = DummyClassifier(strategy='stratified')
    cls.fit(t2_trn[feat_cols].values, t2_trn[c].values)
    results.loc[tst_mask, c] = cls.predict(t2_tst[feat_cols]).tolist()

save_submission(results_df=results,
                team_name='example-team',
                task_number=2,
                results_path='./results/',
                attempt=2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results[t1_target_col] = cls.predict(tst[feat_cols]).tolist()
