In [1]:
import pandas as pd
import numpy as np
import json

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn import svm, tree, neighbors
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn import ensemble

In [2]:
### training data
train_test = pd.read_csv("./dm2020-hw2-nthu/data_identification.csv")
label = pd.read_csv("./dm2020-hw2-nthu/emotion.csv")
label_dict = dict(label.values.tolist())

file = open("./dm2020-hw2-nthu/tweets_DM.json", 'rb')
data = []
for line in file.readlines():
    dic = json.loads(line)
    data.append(dic)
    
train_id = set()
test_id = set()

for a in train_test[train_test["identification"]=="train"]["tweet_id"]:
    train_id.add(a)
for b in train_test[train_test["identification"]=="test"]["tweet_id"]:
    test_id.add(b)

train = list()
test = list()
for i in range(len(data)):
    if(data[i]["_source"]["tweet"]["tweet_id"] in train_id):
        train.append(data[i])
    elif(data[i]["_source"]["tweet"]["tweet_id"] in test_id):
        test.append(data[i])
        
train_list=list()
for i in range(len(train)):
    tmp = list()
    tewwt_id = train[i]["_source"]["tweet"]["tweet_id"]
    text = train[i]["_source"]["tweet"]["text"].lower()
    label = label_dict[train[i]["_source"]["tweet"]["tweet_id"]]
    train_list.append((tewwt_id, text, label))
train_df = pd.DataFrame(train_list,columns=["tweet_id", "text", "emotion"])

test_list=list()
for j in range(len(test)):
    tmp = list()
    tewwt_id = test[j]["_source"]["tweet"]["tweet_id"]
    text = test[j]["_source"]["tweet"]["text"].lower()
    test_list.append((tewwt_id, text))
test_df = pd.DataFrame(test_list,columns=["tweet_id", "text"])

In [3]:
len(train_df)

1455563

In [4]:
train, val = train_test_split(train_df[:30000], test_size=0.2, random_state=1)
x_train = train['text'].values
x_val = val['text'].values
y_train = train['emotion']
y_val = val['emotion']

In [5]:
train_text = list()
[train_text.append(x) for x in train['text'].values]
val_text = list()
[val_text.append(x) for x in val['text'].values]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [6]:
# FFFeature
def f_preprocess(all_text):
    from collections import Counter
    all_text2 = ' '.join(all_text)
    # create a list of words
    words = all_text2.split()
    # Count all the words using Counter Method
    count_words = Counter(words)

    total_words = len(words)
    sorted_words = count_words.most_common(total_words)
    vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

    text_int = []
    for t in all_text:
        r = [vocab_to_int[w] for w in t.split()]
        text_int.append(r)
    print (text_int[0:3])

    from keras.preprocessing.sequence import pad_sequences
    PADDING_LENGTH = 105
    text_int = pad_sequences(text_int, maxlen=PADDING_LENGTH)
    return text_int

In [7]:
## deal with label (string -> one-hot)

from sklearn.preprocessing import LabelEncoder
import keras
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
print('check label: ', label_encoder.classes_)

def label_encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

y_train = label_encode(label_encoder, y_train)
y_val = label_encode(label_encoder, y_val)

check label:  ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']


In [8]:
# ttttttest data
train_features = f_preprocess(train_text)
val_features = f_preprocess(val_text)

[[6, 46, 16276, 6, 16277, 2, 16278, 6, 263, 1526, 2, 16279, 3010, 17, 4533, 9601, 1, 1, 1, 5486, 1, 1, 3011], [61, 149, 939, 673, 2106, 14, 373, 662, 3, 20, 1, 16280, 9, 5, 1, 6934, 6, 82, 16281], [9602, 578, 22, 102, 3354, 379, 1, 16282, 6935, 16283, 4534, 4535]]
[[52, 7, 1588, 5, 1, 6, 1, 92, 174, 2912, 5016, 1, 389, 1296], [5017, 154, 2, 58, 423, 1297, 2051, 5018, 1], [5019, 339, 2, 43, 30, 5020, 788, 2, 144, 5021, 5022, 1, 5023, 5024, 90]]


In [9]:
train_features.shape

(24000, 105)

In [11]:
# I/O check
input_shape = train_features.shape[1]
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

input_shape:  105
output_shape:  8


In [None]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import ReLU, Softmax, Dropout

import tensorflow as tf
from tensorflow import keras

csv_logger = tf.keras.callbacks.CSVLogger('logs/training_log.csv')

vocab_dim = 200
n_epoch = 50
input_length = 500
units = 32

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(output_dim=vocab_dim,
                        input_dim=input_shape,
                        input_length=input_shape))
model.add(tf.compat.v1.keras.layers.CuDNNLSTM(64, return_sequences=True))
model.add(tf.keras.layers.GlobalMaxPool1D())
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(8, activation='sigmoid'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

In [None]:
from keras.callbacks import CSVLogger

csv_logger = CSVLogger('logs/training_log.csv')

# training setting
epochs = 20
batch_size = 64

# training!
history = model.fit(train_features, np.array(y_train), 
                    epochs=epochs, 
                    batch_size=batch_size, 
                    callbacks=[csv_logger],
                    validation_data = (val_features, np.array(y_val)))
print('training finish')

----

In [None]:
x_test = test_df['text'].values
test_vectors = vectorizer.transform(x_test).A

In [None]:
test_predict = model.predict(test_vectors)

In [None]:
test_predict = label_decode(label_encoder, test_predict)

In [None]:
tweet_id = test_df["tweet_id"].tolist()

In [None]:
import csv
with open('results/tfidf_DNN.csv', 'w', newline='') as csvfile:
    writer  = csv.writer(csvfile)
    writer.writerow(("id","emotion"))
    for i in range(len(test_df)):
        writer.writerow((tweet_id[i], test_predict[i]))