In [1]:
import pandas as pd
# Load the raw tweets and the annotator votes. Both files are read with the
# same options: every column as a string, backslash as the escape character.
tweets_df, votes_df = (
    pd.read_csv(csv_path, encoding='utf8', dtype=str, escapechar='\\')
    for csv_path in ('tweets.csv', 'votes-1.csv')
)

In [2]:
emotions_list = ['Joy', 'Trust', 'Fear', 'Surprise', 'Sadness', 'Disgust', 'Anger', 'Anticipation', 'Neutral']

# Keep only the votes whose `user` is 'cltse', restricted to the tweet id and
# the raw emotion-code string, and add one indicator column per emotion
# (all initialised to 0).
# Selecting rows and columns in a single .loc call and adding the columns via
# .assign yields a brand-new frame, so no column is ever written onto a slice
# of votes_df (which would trigger SettingWithCopyWarning).
gordon_df = (
    votes_df
    .loc[votes_df.user == 'cltse', ['tweets_id', 'emotions']]
    .assign(**dict.fromkeys(emotions_list, 0))
)

gordon_df.head()

Unnamed: 0,tweets_id,emotions,Joy,Trust,Fear,Surprise,Sadness,Disgust,Anger,Anticipation,Neutral
1,11451,[99],0,0,0,0,0,0,0,0,0
2,5351,[99],0,0,0,0,0,0,0,0,0
4,15852,"[4,17]",0,0,0,0,0,0,0,0,0
20,390,[999],0,0,0,0,0,0,0,0,0
21,2386,[99],0,0,0,0,0,0,0,0,0


In [3]:
# Maps a vote code (as a string) to its emotion column name.
# Codes come in consecutive pairs that share one emotion: 1-2 -> Joy,
# 3-4 -> Trust, ... 15-16 -> Anticipation. Code 99 stands alone for Neutral.
# Any code not listed here (e.g. 17 or 999 in the sample rows) has no entry.
emotion_dic = {
    str(code): emotion
    for first_code, emotion in (
        (1, 'Joy'),
        (3, 'Trust'),
        (5, 'Fear'),
        (7, 'Surprise'),
        (9, 'Sadness'),
        (11, 'Disgust'),
        (13, 'Anger'),
        (15, 'Anticipation'),
    )
    for code in (first_code, first_code + 1)
}
emotion_dic['99'] = 'Neutral'

def separateEmotion(row):
    """Set the indicator column of every emotion voted for in this row to 1.

    Parameters
    ----------
    row : pandas.Series
        One vote row. ``row.emotions`` is a bracketed, comma-separated list of
        codes such as ``"[4,17]"``; the row must already contain one column per
        emotion name found in ``emotion_dic``.

    Returns
    -------
    pandas.Series
        The same row, with ``row[<emotion>] = 1`` for every code that has an
        entry in ``emotion_dic``. Codes without an entry are ignored.
    """
    raw_codes = row.emotions
    # A missing cell is not a string (e.g. NaN); there is nothing to flag then.
    if not isinstance(raw_codes, str):
        return row

    # Remove the surrounding brackets, then tolerate spaces around each code
    # so that "[4, 99]" is handled the same way as "[4,99]".
    for code in raw_codes.strip().strip('[]').split(','):
        emotion = emotion_dic.get(code.strip())
        if emotion is not None:
            row[emotion] = 1
    return row

# Decode the emotion codes row by row into the per-emotion 0/1 columns.
gordon_df = gordon_df.apply(separateEmotion, axis='columns')
gordon_df.head()

Unnamed: 0,tweets_id,emotions,Joy,Trust,Fear,Surprise,Sadness,Disgust,Anger,Anticipation,Neutral
1,11451,[99],0,0,0,0,0,0,0,0,1
2,5351,[99],0,0,0,0,0,0,0,0,1
4,15852,"[4,17]",0,1,0,0,0,0,0,0,0
20,390,[999],0,0,0,0,0,0,0,0,0
21,2386,[99],0,0,0,0,0,0,0,0,1


In [4]:
# Attach the tweet text to each vote by matching votes' `tweets_id` against
# tweets' `id`. A left join keeps every vote row, including any whose tweet
# is not found. Only `text` is needed downstream, so `quoted_text` is not
# merged in.
gordon_df = gordon_df.merge(tweets_df[['id', 'text']], how='left', left_on='tweets_id', right_on='id')

# Keep the text followed by the emotion indicator columns, in the order
# given by emotions_list; ids and the raw code string are no longer needed.
gordon_df = gordon_df[['text'] + emotions_list]
gordon_df.head()

Unnamed: 0,text,Joy,Trust,Fear,Surprise,Sadness,Disgust,Anger,Anticipation,Neutral
0,.@LSHTM Director Peter Piot discusses the orig...,0,0,0,0,0,0,0,0,1
1,What Kids Want To Know About Coronavirus: An O...,0,0,0,0,0,0,0,0,1
2,Due to the recent cases with Coronavirus th...,0,1,0,0,0,0,0,0,0
3,@TrollNouvelles @JeanMessiha La Russie est un ...,0,0,0,0,0,0,0,0,0
4,RT @thejournal_ie: Northern Ireland's Departme...,0,0,0,0,0,0,0,0,1


In [5]:
def _strip_text_pattern(df, pattern):
    """Delete every regex match of `pattern` from ``df['text']`` and log timing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. The column is overwritten in place.
    pattern : str
        Regular expression whose matches are replaced by the empty string.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in.
    """
    import time

    started = time.perf_counter()
    # All three public filters print this same label.
    print("  -> filterUrl()", end='')

    # regex=True must be explicit: since pandas 2.0 `Series.str.replace`
    # treats the pattern as a literal string by default, which would leave
    # every tweet unchanged.
    # `.astype(str)` also turns a missing text (NaN) into the string 'nan'.
    df['text'] = df['text'].str.replace(pattern, '', regex=True).astype(str)

    processed = len(df.index)
    elapsed = time.perf_counter() - started
    print(" - Processed: {:,} | Time: {:,.3f} sec".format(processed, elapsed))

    return df


def filterUrl(df):
    """Remove a leading retweet marker (``RT @user:``) from ``df['text']``.

    Despite the name, no URL is touched. Mutates and returns `df`.
    """
    return _strip_text_pattern(df, r'^RT @\w+:')


def filterUrl2(df):
    """Remove every ``@mention`` from ``df['text']``. Mutates and returns `df`."""
    return _strip_text_pattern(df, r'@\w+')


def filterUrl3(df):
    """Remove every ``#hashtag`` from ``df['text']``. Mutates and returns `df`."""
    return _strip_text_pattern(df, r'#\w+')


import demoji

def checkNonEngAndEmoji(row):
    """Strip emoji from the row's ``text`` field.

    ``demoji.replace`` is called with an empty replacement string and its
    result is written back to ``row['text']``. Despite the function name, no
    language check is performed here.

    Returns the (modified) row so it can be used with ``DataFrame.apply``.
    """
    row['text'] = demoji.replace(row['text'], '')
    return row


In [6]:
# Text clean-up, applied in order: retweet prefix, @mentions, #hashtags,
# then emoji (row by row).
gordon_df = (
    gordon_df
    .pipe(filterUrl)
    .pipe(filterUrl2)
    .pipe(filterUrl3)
    .apply(checkNonEngAndEmoji, axis=1)
)

  -> filterUrl() - Processed: 2,204 | Time: 0.029 sec
  -> filterUrl() - Processed: 2,204 | Time: 0.006 sec
  -> filterUrl() - Processed: 2,204 | Time: 0.007 sec


In [38]:
# Preview the cleaned frame.
# NOTE(review): the recorded output looks like it was produced after the
# unlabelled-row filter further down (execution count 38, index 3 absent);
# a fresh top-to-bottom run would show the frame before that filter.
gordon_df.head(n=5)

Unnamed: 0,text,Joy,Trust,Fear,Surprise,Sadness,Disgust,Anger,Anticipation,Neutral
0,. Director Peter Piot discusses the origins an...,0,0,0,0,0,0,0,0,1
1,What Kids Want To Know About Coronavirus: An O...,0,0,0,0,0,0,0,0,1
2,Due to the recent cases with Coronavirus th...,0,1,0,0,0,0,0,0,0
4,Northern Ireland's Department of Health said ...,0,0,0,0,0,0,0,0,1
5,Okay so I got blocked by a bunch of people for...,1,0,0,1,1,0,0,0,0


In [28]:
# Helper flag column, initially False for every row; rows to be removed are
# marked True by the row-wise check defined below.
gordon_df['drop'] = False

def checkNonEngAndEmoji1(row):
    """Mark a row for removal when it carries no label at all.

    Every field after the first one (the first is expected to be the text) is
    compared with 0; if all of them equal 0, ``row['drop']`` is set to True.
    The ``drop`` flag itself is part of that slice, and its initial False
    value counts as 0.

    Returns the row so the function can be used with ``DataFrame.apply``.
    """
    trailing_fields = row.iloc[1:]
    if (trailing_fields == 0).all():
        row['drop'] = True
    return row


In [29]:
# Flag, row by row, the tweets whose emotion columns are all zero.
gordon_df = gordon_df.apply(checkNonEngAndEmoji1, axis='columns')

In [31]:
# Keep only the rows that were not flagged for removal.
gordon_df = gordon_df.loc[gordon_df['drop'].eq(False)]

In [33]:
# The helper flag has served its purpose; remove the column again.
gordon_df = gordon_df.drop(columns='drop')

In [35]:
# Persist the cleaned, labelled tweets (with a header row, without the index).
# Overwriting the file and writing the header are to_csv defaults.
gordon_df.to_csv('gordon_tweets.csv', index=False, encoding='utf-8')

In [39]:
# Number of tweets left after cleaning and filtering.
gordon_df.shape[0]

2065

In [48]:
from sklearn.model_selection import train_test_split

# First column is the model input (the tweet text in the frame built above);
# every remaining column is one 0/1 emotion label.
data, label = gordon_df.iloc[:, [0]], gordon_df.iloc[:, 1:]

# 80/20 hold-out split; the fixed random_state keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=1234,
)

# Put inputs and labels back side by side and write one CSV per split.
train_set, valid_set = (
    pd.concat(parts, axis=1)
    for parts in ((X_train, y_train), (X_test, y_test))
)
train_set.to_csv('train.csv', index=False, header=True, mode='w')
valid_set.to_csv('valid.csv', index=False, header=True, mode='w')

In [51]:
# Sizes of the training and validation splits, one per line.
print(len(train_set), len(valid_set), sep='\n')

1652
413
