# Emotion Dataset

In [16]:
import math
import pickle
import time

import google_trans_new
import numpy as np
import pandas as pd
from google_trans_new import google_translator
from sklearn.model_selection import train_test_split

## Part One: Clean data
### Load the pickle file and remove extraneous emotions

In [2]:
# External Twitter Dataset
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open('merged_training.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# Drop the rows labelled 'surprise' or 'love' (extra emotions)
extra_emotions = ['surprise', 'love']
data = data[~data.emotions.isin(extra_emotions)]

### Balance the dataset

In [11]:
# Assess the dataset: rows available per emotion.
# (Original author's note: the minority emotion is 'fear'.)
emotion_counts = data.emotions.value_counts()
min_len = 2500 # data.emotions.value_counts().fear # fear count

# Balance the dataset: draw the same number of rows (min_len) from each
# emotion so every class is equally represented.
emotions = ['sadness','anger','joy','fear']

# Fail early with a readable message if a class cannot supply min_len rows;
# DataFrame.sample would otherwise raise a less specific ValueError.
too_small = [e for e in emotions if emotion_counts.get(e, 0) < min_len]
if too_small:
    raise ValueError(
        f"cannot sample {min_len} rows per class; too few rows for: {too_small}"
    )

# Build the balanced frame with a single concat instead of growing a
# DataFrame inside the loop (which re-copies all rows on every iteration).
# random_state=0 keeps the per-class sample reproducible.
df_balanced = pd.concat(
    [
        data[data['emotions'] == emotion].sample(min_len, random_state=0)
        for emotion in emotions
    ]
)

df_balanced = df_balanced.rename(columns={'emotions':'labels'})


### Map labels to numeric form

In [12]:
# Map labels to numeric
label2int = {
    "sadness": 0,
    "anger": 1,
    "joy": 2,
    "fear": 3,
}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: in-place mutation of df[col] is chained assignment, which
# warns in recent pandas and does not update the frame under Copy-on-Write.
# replace() leaves already-numeric values untouched, so re-running this cell
# is harmless; astype pins the integer dtype and fails loudly if any label
# was not covered by the mapping.
df_balanced["labels"] = df_balanced["labels"].replace(label2int).astype("int64")


### Train-Test Split

In [13]:
# Stratified 80/20 split: stratifying on the label column keeps the class
# proportions the same in both partitions; random_state fixes the shuffle.
df_train, df_test = train_test_split(
    df_balanced,
    test_size=0.2,
    shuffle=True,
    random_state=0,
    stratify=df_balanced['labels'],
)

# Sanity check: print the label distribution (in percent) of each partition
for split in (df_train, df_test):
    print(split['labels'].value_counts(normalize=True)*100)

1    25.0
0    25.0
2    25.0
3    25.0
Name: labels, dtype: float64
1    25.0
0    25.0
3    25.0
2    25.0
Name: labels, dtype: float64


### Export

In [14]:
# Export To CSV
# Shuffle the full dataset with a fixed seed (matching random_state=0 used for
# sampling and splitting above) so the exported row order -- and the order in
# which rows are later translated -- is reproducible across runs.
df_balanced = df_balanced.sample(frac=1, random_state=0).reset_index(drop=True)

# NOTE: the full export keeps the (freshly reset) index as its first column,
# unlike the train/test exports below which pass index=False.
df_balanced.to_csv('twitter_full.csv')
df_train.to_csv('twitter_train.csv', index=False)
df_test.to_csv('twitter_test.csv', index=False)


## Part Two: Translate Dataset into Mandarin - Google Translate via the `google_trans_new` package

In [17]:
# Translate every English text in df_balanced to Simplified Chinese ('zh-cn').
translator = google_translator()

BATCH_SIZE = 500      # rows per progress message
MAX_ATTEMPTS = 5      # tries per text before giving up
BASE_DELAY_S = 2.0    # first back-off delay in seconds; doubles after each failure

text_en = list(df_balanced['text'])
translated = []  # plain list: appending is O(1), unlike np.append which re-copies

# Derive the batch boundaries from the data instead of hard-coding the row
# count, so the loop still covers every row if the dataset size changes.
for start in range(0, len(text_en), BATCH_SIZE):
    stop = min(start + BATCH_SIZE, len(text_en))
    for text in text_en[start:stop]:
        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                translated.append(translator.translate(text=text, lang_tgt='zh-cn'))
                break
            except Exception:
                # A previous run of this cell aborted with "429 (Too Many
                # Requests)". Retry with exponential back-off and re-raise
                # once the attempts are exhausted so failures are not hidden.
                # NOTE(review): narrow this to the library's own error class
                # once its import path is confirmed.
                if attempt == MAX_ATTEMPTS:
                    raise
                time.sleep(BASE_DELAY_S * 2 ** (attempt - 1))
    print(f"translated rows {start} to {stop}")

text_translated = np.array(translated)



google_new_transError: 429 (Too Many Requests) from TTS API. Probable cause: Unknown

In [None]:
# Assemble the translated dataset: the translated text becomes the 'text'
# column, and the labels are taken from df_balanced (aligned on the row index).
df_zh = pd.DataFrame(text_translated, columns=['text']).assign(
    labels=df_balanced['labels']
)

# Write the result to disk (the index is written as the first column)
df_zh.to_csv('twitter_emotions_zh.csv')