In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
# Read the raw corpus; the first CSV column is used as the row index.
df_raw = pd.read_csv('sentiment-40k.csv', index_col=0)

# Keep every row whose label is neither 1 nor 3.
keep_mask = df_raw['labels'].ne(1) & df_raw['labels'].ne(3)
df = df_raw[keep_mask]

# Class distribution after filtering.
print(df['labels'].value_counts())

0    13993
2     5348
5     4950
4     3167
Name: labels, dtype: int64


In [8]:
# Relabel: translate the source dataset's integer label ids into this
# project's integer ids. The lookup goes through the emotion name so the
# meaning of every id stays visible.

# Source-dataset id -> emotion name.
source_id2emotion = {
    2: 'sad',
    4: 'anger',
    5: 'joy',
    0: 'non',
}

# Emotion name -> numeric id used by this project.
# The name `int2label` is kept because the original cell left this dict bound to it.
int2label = {
    'sad': 0,
    'anger': 1,
    'joy': 2,
    'non': 3,
}

# Compose both steps into a single source-id -> project-id lookup.
source_id2project_id = {
    source_id: int2label[emotion]
    for source_id, emotion in source_id2emotion.items()
}

# Fail loudly on ids the lookup does not cover. This also catches an
# accidental re-run of this cell: once relabeled, the column contains ids
# (1 and 3) that are not source ids, so the cell raises instead of silently
# remapping already-remapped labels.
unmapped_ids = set(df['labels'].unique()) - set(source_id2project_id)
if unmapped_ids:
    raise ValueError(
        f"labels column contains ids with no mapping: {unmapped_ids}; "
        "was this cell already run, or is the input unfiltered?"
    )

# Build a new frame rather than calling an in-place method on a column
# selection: `df["labels"].replace(..., inplace=True)` is chained assignment,
# which warns on recent pandas and does not update `df` under Copy-on-Write.
df = df.assign(labels=df['labels'].map(source_id2project_id))

# Class distribution after relabeling.
print(df['labels'].value_counts())

3    13993
0     5348
2     4950
1     3167
Name: labels, dtype: int64


In [9]:
# Balance the dataset by down-sampling every class to the size of the
# smallest one.
min_len = int(df['labels'].value_counts().min())

# Sample each class with the same fixed seed, then concatenate once.
# Iterating over the groups (sorted by label) avoids hard-coding the label
# range, and a single concat avoids seeding the result with an empty
# DataFrame and re-copying it on every iteration. Each group keeps its rows
# in their original order, so the seeded sample draws the same rows as
# sampling `df[df['labels'] == label]`.
df_balanced = pd.concat(
    [
        class_rows.sample(n=min_len, random_state=0)
        for _, class_rows in df.groupby('labels')
    ]
)

# Every class must now have exactly `min_len` rows.
assert (df_balanced['labels'].value_counts() == min_len).all(), "classes are not balanced"

# check
print(df_balanced['labels'].value_counts())

0    3167
1    3167
2    3167
3    3167
Name: labels, dtype: int64


In [10]:
# Stratified, seeded 80/20 split of the balanced frame so both parts keep
# the same class proportions.
df_train, df_test = train_test_split(
    df_balanced,
    test_size=0.2,
    shuffle=True,
    random_state=0,
    stratify=df_balanced['labels'],
)

# check: per-class counts of the training part, then of the test part
for split_frame in (df_train, df_test):
    print(split_frame['labels'].value_counts())

2    2534
3    2534
1    2533
0    2533
Name: labels, dtype: int64
1    634
0    634
3    633
2    633
Name: labels, dtype: int64


In [11]:
# Write each split to its own CSV file (row index included, as by default).
for csv_path, split_frame in (
    ('sentiment-40k_train.csv', df_train),
    ('sentiment-40k_test.csv', df_test),
):
    split_frame.to_csv(csv_path)