In [1]:
import pandas as pd
import numpy as np
import json

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn import svm, tree, neighbors
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn import ensemble

In [2]:
### training data
train_test = pd.read_csv("./dm2020-hw2-nthu/data_identification.csv")
label = pd.read_csv("./dm2020-hw2-nthu/emotion.csv")
label_dict = dict(label.values.tolist())

file = open("./dm2020-hw2-nthu/tweets_DM.json", 'rb')
data = []
for line in file.readlines():
    dic = json.loads(line)
    data.append(dic)
    
train_id = set()
test_id = set()

for a in train_test[train_test["identification"]=="train"]["tweet_id"]:
    train_id.add(a)
for b in train_test[train_test["identification"]=="test"]["tweet_id"]:
    test_id.add(b)

train = list()
test = list()
for i in range(len(data)):
    if(data[i]["_source"]["tweet"]["tweet_id"] in train_id):
        train.append(data[i])
    elif(data[i]["_source"]["tweet"]["tweet_id"] in test_id):
        test.append(data[i])
        
train_list=list()
for i in range(len(train)):
    tmp = list()
    tewwt_id = train[i]["_source"]["tweet"]["tweet_id"]
    text = train[i]["_source"]["tweet"]["text"].lower()
    label = label_dict[train[i]["_source"]["tweet"]["tweet_id"]]
    tags = train[i]["_source"]["tweet"]["hashtags"]
    tt = " "
    for t in tags:
        tt = tt + t + " "
    train_list.append((tewwt_id, text + tt, label))
train_df = pd.DataFrame(train_list,columns=["tweet_id", "text", "emotion"])

test_list=list()
for j in range(len(test)):
    tmp = list()
    tewwt_id = test[j]["_source"]["tweet"]["tweet_id"]
    text = test[j]["_source"]["tweet"]["text"].lower()
    tags = test[j]["_source"]["tweet"]["hashtags"]
    tt = " "
    for t in tags:
        tt = tt + t + " "
    test_list.append((tewwt_id, text + tt))
test_df = pd.DataFrame(test_list,columns=["tweet_id", "text"])

In [3]:
train, val = train_test_split(train_df, test_size=0.2, random_state=1)
x_train = train['text'].values
x_val = val['text'].values
y_train = train['emotion']
y_val = val['emotion']

In [11]:
## deal with label (string -> one-hot)

from sklearn.preprocessing import LabelEncoder
import keras

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
print('check label: ', label_encoder.classes_)

def label_encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

y_train = label_encode(label_encoder, y_train)
y_val = label_encode(label_encoder, y_val)

check label:  ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']


In [6]:
GLOVE_FILE = './glove.twitter.27B/glove-seeds.txt'
dim = 200

def get_glove_vectors(vocab):
    """
    Extracts glove vectors from seed file only for words present in vocab.
    """
    print ('Looking for GLOVE seeds')
    glove_vectors = {}
    found = 0
    with open(GLOVE_FILE, 'r') as glove_file:
        for i, line in enumerate(glove_file):
            utils.write_status(i + 1, 0)
            tokens = line.strip().split()
            word = tokens[0]
            if vocab.get(word):
                vector = [float(e) for e in tokens[1:]]
                glove_vectors[word] = np.array(vector)
                found += 1
    print ('\n')
    return glove_vectors

In [None]:
def get_feature_vector(tweet):
    """
    Generates a feature vector for each tweet where each word is
    represented by integer index based on rank in vocabulary.
    """
    words = tweet.split()
    feature_vector = []
    for i in range(len(words) - 1):
        word = words[i]
        if vocab.get(word) is not None:
            feature_vector.append(vocab.get(word))
    if len(words) >= 1:
        if vocab.get(words[-1]) is not None:
            feature_vector.append(vocab.get(words[-1]))
    return feature_vector

In [None]:
vocab = utils.top_n_words(FREQ_DIST_FILE, vocab_size, shift=1)
    glove_vectors = get_glove_vectors(vocab)

In [6]:
train_vectors

<1164450x879 sparse matrix of type '<class 'numpy.float64'>'
	with 1940146 stored elements in Compressed Sparse Row format>

In [14]:
import tensorflow as tf
from tensorflow import keras
# from tensorflow.keras import layers
from tensorflow.python.keras import layers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Bidirectional,LSTM, Dense, Dropout, Activation, GlobalMaxPool1D
from tensorflow.keras.callbacks import CSVLogger
from tensorflow.compat.v1.keras.layers import CuDNNLSTM

with tf.compat.v1.device("/device:GPU:0"):
    # tf.keras.layers.Embedding
    csv_logger = CSVLogger('logs/training_log_lab.csv')

    vocab_dim = 200
    batch_size = 64
    n_epoch = 30
    input_length = 880
    units = 64

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(output_dim=vocab_dim,
                        input_dim=300,
                        input_length=input_length))
    model.add(CuDNNLSTM(64, return_sequences=True))
    model.add(GlobalMaxPool1D())
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(8, activation='sigmoid'))
    # model.add(Dense(1))
    # model.add(Activation('sigmoid'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 880, 200)          60000     
_________________________________________________________________
cu_dnnlstm (CuDNNLSTM)       (None, 880, 64)           68096     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)               

In [15]:
model.fit(x_train, np.array(y_train), batch_size=batch_size, epochs=n_epoch,callbacks=[csv_logger],
      validation_data=(x_val, np.array(y_val)),use_multiprocessing=True, workers=8)

Epoch 1/30


InvalidArgumentError: No OpKernel was registered to support Op 'CudnnRNNV2' used by {{node sequential_2/cu_dnnlstm/CudnnRNNV2}} with these attrs: [seed=0, dropout=0, T=DT_FLOAT, input_mode="linear_input", direction="unidirectional", rnn_mode="lstm", seed2=0, is_training=true]
Registered devices: [CPU, XLA_CPU]
Registered kernels:
  device='GPU'; T in [DT_HALF]
  device='GPU'; T in [DT_FLOAT]
  device='GPU'; T in [DT_DOUBLE]

	 [[sequential_2/cu_dnnlstm/CudnnRNNV2]] [Op:__inference_train_function_1974]

<p style="font_size:20px">------------------------------------</p>

In [None]:
x_test = test_df['text'].values
test_vectors = vectorizer.transform(x_test)

In [None]:
test_predict = clf.predict(test_vectors)

In [None]:
label_reverse = {'0':'anger','1':'anticipation','2':'disgust','3':'fear','4':'joy','5':'sadness','6':'surprise','7':'trust'}
test_predict = [label_reverse[str(t)] for t in test_predict]

In [None]:
tweet_id = test_df["tweet_id"].tolist()

In [None]:
import csv
with open('results/tfidf_naive_bagging.csv', 'w', newline='') as csvfile:
    writer  = csv.writer(csvfile)
    writer.writerow(("id","emotion"))
    for i in range(len(test_df)):
        writer.writerow((tweet_id[i], test_predict[i]))