## Data Preparation and Training

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Load the pre-computed feature tables for the train, dev and test splits.
# NOTE(review): read_pickle unpickles arbitrary Python objects, so these files
# must come from a trusted source.
df_train, df_dev, df_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "../../data/all_features_with_finetuned_train.pkl",
        "../../data/all_features_with_finetuned_dev.pkl",
        "../../data/all_features_with_finetuned_test.pkl",
    )
)

# Columns eligible for scaling: everything stored as float64 or int64 in the
# training frame, minus the target, the row identifier and the RoBERTa
# prediction (a later cell compares that column against the literals 0 and 1).
numeric_cols = [
    name for name in df_train.columns if df_train[name].dtype in ("float64", "int64")
]
for excluded in ("label", "id", "y_pred_roberta"):
    # list.remove raises ValueError if an expected column is missing.
    numeric_cols.remove(excluded)

# Standardise one column at a time: the scaler is fitted on the training split
# only, and that same fitted transform is then applied to all three splits.
for col in numeric_cols:
    scaler = StandardScaler().fit(df_train[col].to_numpy().reshape(-1, 1))
    for frame in (df_train, df_dev, df_test):
        frame[col] = scaler.transform(frame[col].to_numpy().reshape(-1, 1))

# Model inputs: numeric columns only (this removes text and other object
# columns), without the row identifier and without the target.
#
# `drop` returns a new frame, so the result of select_dtypes is never mutated
# in place. `errors="ignore"` implements "drop id if present".
X_train = (
    df_train.select_dtypes(include=np.number)
    .drop(columns=["id"], errors="ignore")
    .drop(columns=["label"])
)
X_dev = (
    df_dev.select_dtypes(include=np.number)
    .drop(columns=["id"], errors="ignore")
    .drop(columns=["label"])
)
# Only the identifier is removed from the test features. The presence check
# must be made against X_test itself: testing df_test (as before) raised a
# KeyError whenever "id" existed in df_test but was not numeric, because
# select_dtypes had already removed it from X_test.
X_test = df_test.select_dtypes(include=np.number).drop(
    columns=["id"], errors="ignore"
)

# Targets for the two labelled splits.
y_train = df_train["label"]
y_dev = df_dev["label"]


# Hand-picked feature columns, assembled from three groups. The concatenation
# order below is the column order of every frame that is indexed with `keep`.

# Columns whose names start with "ratio_" (part-of-speech and stem ratios).
_ratio_cols = [
    "ratio_ADJ_NOUN",
    "ratio_VERB_word_count",
    "ratio_NOUN_word_count",
    "ratio_ADJ_word_count",
    "ratio_ADV_word_count",
    "ratio_PRON_word_count",
    "ratio_DET_word_count",
    "ratio_ADP_word_count",
    "ratio_NUM_word_count",
    "ratio_CONJ_word_count",
    "ratio_negative_stem_word_count",
    "ratio_cefrj_stem_word_count",
    "ratio_a1_stem_word_count",
    "ratio_a2_stem_word_count",
    "ratio_b1_stem_word_count",
    "ratio_b2_stem_word_count",
    "ratio_c1_stem_word_count",
    "ratio_c2_stem_word_count",
]

# Word-frequency columns.
_frequency_cols = [
    "mean_log_freq_content_words",
    "num_freq1_words",
    "prop_freq_content_words",
    "prop_unfreq_content_words",
]

# Everything else, in the original order.
_other_cols = [
    "X_pca_0",
    "X_umap_0",
    "NEGATIVE",
    "POSITIVE",
    "anger",
    "avg_passive_constructions",
    "avg_word_length",
    "disgust",
    "fear",
    "formal",
    "informal",
    "joy",
    "max_depth",
    "mean_depth",
    "n_negation_words",
    "n_punctuation",
    "n_unique_words",
    "n_vowels",
    "n_words",
    "neutral",
    "non_toxic",
    "ratio_top10_content_words",
    "readability",
    "sadness",
    "surprise",
    "toxic",
    "verb_noun_ratio",
    "X_umap_jac_0",
    "logits_0_roberta",
]

keep = _ratio_cols + _frequency_cols + _other_cols

# First filter: restrict every split to the hand-picked columns in `keep`
# (per the original note, chosen so that the features are not inter-correlated).
X_train_keep, X_dev_keep, X_test_keep = (
    split[keep] for split in (X_train, X_dev, X_test)
)

# Second filter: discard features that are only weakly correlated with the label.
# Pearson correlations are computed on the training split, without the id column.
mydf = df_train.drop(columns=["id"])
df_corr1 = mydf.corr(method="pearson", numeric_only=True).round(2)

# Absolute feature/label correlations, strongest first.
features_label_corr = df_corr1["label"].abs().sort_values(ascending=False)

# Names of the features whose |correlation| with the label is below 0.1.
# The matrix is rounded to two decimals *before* this comparison, so a value
# that rounds up to 0.1 is not treated as low. A NaN correlation compares
# False here and is therefore not listed as low either.
low_corr = features_label_corr.loc[features_label_corr < 0.1].index.tolist()

# `keep` minus the weakly correlated features, in the original `keep` order.
weak = set(low_corr)
keep_high = [name for name in keep if name not in weak]

# Restrict every split to the surviving columns.
X_train_keep_high, X_dev_keep_high, X_test_keep_high = (
    split[keep_high] for split in (X_train, X_dev, X_test)
)

In [None]:
# Partition every split by the value of the "y_pred_roberta" column:
# suffix 1 -> rows where it equals 1, suffix 0 -> rows where it equals 0.
# Rows holding any other value end up in neither partition.
bool_rob1 = X_train["y_pred_roberta"].eq(1)
bool_rob0 = X_train["y_pred_roberta"].eq(0)
bool_dev_rob1 = X_dev["y_pred_roberta"].eq(1)
bool_dev_rob0 = X_dev["y_pred_roberta"].eq(0)
bool_test_rob1 = X_test["y_pred_roberta"].eq(1)
bool_test_rob0 = X_test["y_pred_roberta"].eq(0)

# Feature frames (correlation-filtered columns) for each partition.
X_train1, X_train0 = X_train_keep_high.loc[bool_rob1], X_train_keep_high.loc[bool_rob0]
X_dev1, X_dev0 = X_dev_keep_high.loc[bool_dev_rob1], X_dev_keep_high.loc[bool_dev_rob0]
X_test1, X_test0 = (
    X_test_keep_high.loc[bool_test_rob1],
    X_test_keep_high.loc[bool_test_rob0],
)

# Matching targets for the labelled splits.
y_train1, y_train0 = y_train.loc[bool_rob1], y_train.loc[bool_rob0]
y_dev1, y_dev0 = y_dev.loc[bool_dev_rob1], y_dev.loc[bool_dev_rob0]

# The test split has no targets here; keep its ids so that predictions can be
# matched back to rows when the submission is assembled.
y_test_id1 = df_test.loc[bool_test_rob1, "id"]
y_test_id0 = df_test.loc[bool_test_rob0, "id"]


# Shapes of the full frame and of its two partitions, one line per split.
for full, part1, part0 in (
    (X_train, X_train1, X_train0),
    (X_dev, X_dev1, X_dev0),
    (X_test, X_test1, X_test0),
):
    print(full.shape, part1.shape, part0.shape)
print()
print(y_test_id0.head(5), y_test_id1.head(5))

In [None]:
# Merge train and dev within each RoBERTa partition and shuffle the rows.
# Seeding the global NumPy RNG and drawing the partition-1 permutation before
# the partition-0 permutation keeps the row order reproducible.
np.random.seed(42)
shuffler1 = np.random.permutation(len(X_train1) + len(X_dev1))
shuffler0 = np.random.permutation(len(X_train0) + len(X_dev0))

# Features and targets of a partition share one permutation so rows stay aligned.
X_train_dev1 = pd.concat([X_train1, X_dev1]).iloc[shuffler1]
y_train_dev1 = pd.concat([y_train1, y_dev1]).iloc[shuffler1]
X_train_dev0 = pd.concat([X_train0, X_dev0]).iloc[shuffler0]
y_train_dev0 = pd.concat([y_train0, y_dev0]).iloc[shuffler0]

In [None]:
# # save the shuffled train+dev frames and the test frames (one per RoBERTa partition) as pickle files
# X_train_dev1.to_pickle("../../data/submission/X_train_dev1_submission.pkl")
# X_train_dev0.to_pickle("../../data/submission/X_train_dev0_submission.pkl")
# X_test1.to_pickle("../../data/submission/X_test1_submission.pkl")
# X_test0.to_pickle("../../data/submission/X_test0_submission.pkl")

## Prediction

In [None]:
# Classifier: one multi-layer perceptron per RoBERTa partition.
from sklearn.neural_network import MLPClassifier


def _fit_mlp(features, labels):
    """Fit and return an MLPClassifier with random_state=1 and max_iter=300.

    All other hyperparameters are left at the scikit-learn defaults.
    """
    model = MLPClassifier(random_state=1, max_iter=300)
    return model.fit(features, labels)


# Each model is trained on the shuffled train+dev rows of its own partition ...
clf1 = _fit_mlp(X_train_dev1, y_train_dev1)
clf0 = _fit_mlp(X_train_dev0, y_train_dev0)

# ... and predicts only the test rows of that same partition.
y_pred1 = clf1.predict(X_test1)
y_pred0 = clf0.predict(X_test0)

In [None]:
# Pair each partition's predictions with the ids of the rows they belong to.
# Column order is "label" then "id"; ids are converted to plain Python ints.
subm_df1 = pd.DataFrame(
    {"label": y_pred1, "id": [int(test_id) for test_id in y_test_id1]}
)
subm_df0 = pd.DataFrame(
    {"label": y_pred0, "id": [int(test_id) for test_id in y_test_id0]}
)

# Stack both partitions and order the rows by id.
submission_df = pd.concat([subm_df1, subm_df0]).sort_values(by=["id"])
submission_df.head()

In [None]:
# Number of submission rows assigned to each predicted label.
label_counts = submission_df["label"].value_counts()
label_counts

In [None]:
# Write the predictions to a CSV file in the working directory, without the
# DataFrame index.
submission_path = "submission_MLP_standard_noCorr_withFinetuned_two_models.csv"
submission_df.to_csv(submission_path, index=False)