In [65]:
from imblearn.over_sampling import SMOTE, ADASYN
import pandas as pd
import pickle
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import math

In [10]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load 'all_dataset.pkl' if it comes from a trusted source.
with open('all_dataset.pkl', 'rb') as f:
    data = pickle.load(f)

# dropna() returns a new object and leaves the original untouched, so the
# result has to be rebound; otherwise every later cell keeps working on the
# rows that still contain missing values.
data = data.dropna()
data

Unnamed: 0,mins,teacher_emotion,teacher_energy,teacher_sentiment,teacher_language,teacher_objectivity,teacher_positivity,student_emotion
0,1,annoyed,energetic,neither,b2,objective,neutral,sad
1,1,annoyed,energetic,neither,b2,objective,neutral,neutral
2,1,annoyed,energetic,neither,b2,objective,neutral,neutral
3,1,annoyed,energetic,neither,b2,objective,neutral,fearful
4,1,annoyed,energetic,neither,b2,objective,neutral,fearful
...,...,...,...,...,...,...,...,...
3522,59,neutral,monotonic,encouraging,b2,objective,positive,fearful
3523,59,neutral,monotonic,encouraging,b2,objective,positive,fearful
3524,59,neutral,monotonic,encouraging,b2,subjective,positive,happy
3525,59,neutral,monotonic,informative,b2,subjective,positive,neutral


In [43]:
# Single encoder instance shared by encode(); it is re-fitted for every column.
enc = OneHotEncoder(handle_unknown="ignore")

# Categorical teacher attributes used as model inputs.
columns_x = [
    "teacher_emotion",
    "teacher_energy",
    "teacher_sentiment",
    "teacher_language",
    "teacher_objectivity",
    "teacher_positivity",
]
# Target column.
columns_y = ["student_emotion"]

In [44]:
# Features: every column except the target.
X = data.drop(columns="student_emotion")
# Target kept as a one-column DataFrame (not a Series) so encode() can index it by name.
y = data.loc[:, ["student_emotion"]]

In [45]:
def fix_label(name, labels):
    """Prefix every label with ``name`` to build one-hot column names.

    Args:
        name: Column name used as the prefix.
        labels: Iterable of category values (NumPy array, list, tuple, ...).

    Returns:
        list[str]: One ``"{name}_{label}"`` string per entry of ``labels``,
        in the same order.
    """
    # Plain iteration works for arrays and for any other iterable, so there is
    # no need to depend on ``labels.shape``.
    return [f"{name}_{label}" for label in labels]

In [46]:
def encode(data, columns):
    """One-hot encode each listed column of ``data`` independently.

    The module-level ``enc`` encoder is re-fitted on every column in turn, so
    after the call it stays fitted on the last column of ``columns``.

    Args:
        data: DataFrame containing the columns to encode.
        columns: Iterable of column names; their blocks of indicator columns
            are laid out left to right in this order.

    Returns:
        tuple: ``(values, names)`` where ``values`` is the dense 2-D NumPy
        array of indicators and ``names`` is the pandas Index of
        ``"{column}_{category}"`` labels for its columns.
    """
    frames = []
    for name in columns:
        sub_data = data[[name]]
        data_fit = enc.fit_transform(sub_data.values).toarray()
        # The encoder lays its output columns out in the order of
        # ``enc.categories_`` (sorted), which generally differs from the
        # order-of-appearance returned by ``Series.unique()``.  Labelling from
        # the fitted categories keeps every name on the right indicator column.
        labels = fix_label(name, enc.categories_[0])
        frames.append(pd.DataFrame(data_fit, columns=labels))
    # Concatenate once instead of growing a frame inside the loop; an empty
    # ``columns`` still yields an empty frame as before.
    base = pd.concat(frames, axis=1) if frames else pd.DataFrame()
    return base.to_numpy(), base.columns

In [47]:
# Dense one-hot matrices plus their column labels, for features and target.
data_x, x_columns = encode(data=X, columns=columns_x)
data_y, y_columns = encode(data=y, columns=columns_y)

In [70]:
# One seed shared by both over-samplers so the resampling is repeatable.
seed = 42

# NOTE(review): the two samplers use different strategies ('auto' vs
# 'minority') — confirm this asymmetry is intended.
smote = SMOTE(random_state=seed, sampling_strategy='auto')
ada = ADASYN(sampling_strategy='minority', random_state=seed)

In [49]:
# Shapes of the encoded feature and target matrices, one per line.
print(data_x.shape, data_y.shape, sep="\n")

(51322, 27)
(51322, 7)


In [68]:
# SMOTE over-sampling of the encoded data.  Note that X and y are rebound here:
# from this point on they hold the resampled arrays, not the original frames.
X, y = smote.fit_resample(X=data_x, y=data_y)

In [72]:
# ADASYN over-sampling of the same encoded data, kept under separate names.
XX, yy = ada.fit_resample(X=data_x, y=data_y)

In [74]:
# Shapes of the SMOTE-resampled features and targets, one per line.
print(X.shape, y.shape, sep="\n")

(144487, 27)
(144487, 7)


In [76]:
from sklearn.model_selection import train_test_split as tts

# 75/25 split of the ADASYN-resampled arrays (XX, yy).
# NOTE(review): the data is over-sampled before it is split, so synthetic
# samples derived from training rows can end up in the test portion; consider
# splitting first and resampling only the training part.
X_train, X_test, y_train, y_test = tts(
    XX,
    yy,
    test_size=0.25,
    random_state=42,
)


In [61]:
# The shared X_train / X_test / y_train / y_test names are produced by the
# split of the ADASYN arrays (tts(XX, yy, ...)), so writing them here would
# store ADASYN data under *_smote.npy names.  Split the SMOTE-resampled X, y
# into their own variables and persist those instead.
X_train_smote, X_test_smote, y_train_smote, y_test_smote = tts(
    X, y, test_size=0.25, random_state=42
)

smote_outputs = {
    "train_data_X_smote.npy": X_train_smote,
    "train_data_y_smote.npy": y_train_smote,
    "test_data_y_smote.npy": y_test_smote,
    "test_data_X_smote.npy": X_test_smote,
}
for filename, array in smote_outputs.items():
    with open(filename, "wb") as f:
        np.save(f, array)

In [77]:
# Persist the current train/test split under the ADASYN file names.
adasyn_outputs = (
    ("train_data_X_adasyn.npy", X_train),
    ("train_data_y_adasyn.npy", y_train),
    ("test_data_y_adasyn.npy", y_test),
    ("test_data_X_adasyn.npy", X_test),
)
for filename, array in adasyn_outputs:
    with open(filename, "wb") as f:
        np.save(f, array)