# Emotion Dataset

In [None]:
!pip3 install google_trans_new
!mkdir -p 'drive/MyDrive/translate'

In [2]:
import pickle
import google_trans_new
import pandas as pd
import math
import numpy as np

from sklearn.model_selection import train_test_split
from google_trans_new import google_translator

## Part One: Clean data
### Load the pickle file and remove extraneous emotions

In [2]:
# External Twitter Dataset
with open('merged_training.pkl', 'rb') as f:
    data = pickle.load(f)

# Remove surprise and love (extra emotions)
data = data.loc[(data.emotions!='surprise') & (data.emotions!='love')]

### Balance the dataset

In [24]:
# Assess the dataset
len(data)
data.emotions.value_counts() # minority emotion is 'fear'
min_len = data.emotions.value_counts().fear # fear count

# Balance the dataset
df_balanced = pd.DataFrame()
emotions = ['sadness','anger','joy','fear']
for emotion in emotions:
    df = data[data['emotions']==emotion]
    df = df.sample(min_len, random_state=0)
    df_balanced = pd.concat([df_balanced, df])

df_balanced = df_balanced.rename(columns={'emotions':'labels'})


### Map labels to numeric form

In [25]:
# Map labels to numeric 
label2int = {
  "sadness": int(0),
  "joy": int(2),
  "anger": int(1),
  "fear": int(3),
}

df_balanced["labels"].replace(label2int,inplace=True)


### Train-Test Split

In [26]:
# Train Test Split
df_train, df_test = train_test_split(df_balanced, test_size=0.2, shuffle=True, random_state=0, stratify=df_balanced['labels'])

# Check distribution: approx equal
print(df_train['labels'].value_counts(normalize=True)*100)
print(df_test['labels'].value_counts(normalize=True)*100)

2    25.000327
3    25.000327
1    24.999673
0    24.999673
Name: labels, dtype: float64
1    25.00131
0    25.00131
2    24.99869
3    24.99869
Name: labels, dtype: float64


### Export

In [27]:
# Export To CSV
df_balanced = df_balanced.sample(frac=1).reset_index(drop=True) # shuffle the dataset
df_balanced.to_csv('twitter_full.csv')
df_train.to_csv('twitter_train.csv', index=False)
df_test.to_csv('twitter_test.csv', index=False)


## Part Two: Translate Dataset into Mandarin - Google Cloud Translation API

In [35]:
# translate only ~10000 datasets
# extract a balanced set using train-test-split
_, df_extract = train_test_split(df_balanced, test_size=0.053, shuffle=True, random_state=0, stratify=df_balanced['labels'])

# Check distribution: approx equal
print(df_extract['labels'].head(10000).value_counts(normalize=True)*100)

# export to csv
df_extract.to_csv('twitter_extracted.csv', index=False)

1    25.04
0    25.00
2    24.99
3    24.97
Name: labels, dtype: float64


In [37]:
translator = google_translator()

df = pd.read_csv('twitter_extracted.csv')
text_en = df['text']
text_translated = np.array([])

# Need to run in batches and sessions due to API Limits
for i in range(9000,10000,500):
    translations = np.array([translator.translate(text=text, lang_tgt='zh-cn') for text in text_en[i:i+500]])
    print(f"translated rows {i} to {i+500}")
    text_translated = np.append(text_translated,translations)



translated rows 9000 to 9500
translated rows 9500 to 10000


In [38]:
# check length is correct
print(len(text_translated))

1000


In [40]:
# export chunk to csv
df_zh = pd.DataFrame()
df_zh['text']=text_translated
df_zh['labels']=df['labels'].iloc[9000:10000]

df_zh.to_csv('twitter_emotions_zh10k.csv')

In [3]:
# combine all datasets
df_zh3k = pd.read_csv('twitter_emotions_zh3000.csv')
df_zh6k = pd.read_csv('twitter_emotions_zh6000.csv')
df_zh9k = pd.read_csv('twitter_emotions_zh9000.csv')
df_zh10k = pd.read_csv('twitter_emotions_zh10k.csv')
df_en = pd.read_csv('twitter_extracted.csv').head(10000)

# concatenate
df_concat = pd.concat([df_zh3k,df_zh6k,df_zh9k,df_zh10k,df_en])
df_concat = df_concat.sample(frac=1) # shuffle the dataset

df_concat.to_csv('twitter_emotions_enzh.csv')

In [4]:
# Check distribution: approx equal
print(df_concat['labels'].value_counts(normalize=True)*100)


1    25.04
0    25.00
2    24.99
3    24.97
Name: labels, dtype: float64
