In [1]:
import pandas as pd
import numpy as np
import random
import csv
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.core import Dense , Dropout , Activation , Flatten
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers import Embedding, LSTM
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error , mean_absolute_error

In [2]:
# Size (number of dimensions) of each word-embedding vector. It is used as the
# width of the embedding matrix and as the number of LSTM units in the training
# cell, so it has to match the vector size of the pre-trained embeddings that
# are copied into that matrix.
embeddings_dim = 300

In [3]:
# Maximum number of words to consider in the representations: passed to the
# Tokenizer as `num_words` and used as the number of rows of the embedding matrix.
max_features = 20000 # Vocabulary is 24535 words; presumably only 10273 of them occur more than twice (">2") -- TODO confirm

In [4]:
# Maximum length of a sentence, in tokens: used as `maxlen` for pad_sequences and
# as the `input_length` of the Embedding layer.
# NOTE(review): the reported corpus maximum (58) is above this limit, so the
# longest tweets are cut down to 50 tokens by pad_sequences.
max_sent_len = 50 # Corpus statistics: max 58, median 17, average 16

In [5]:
# NOTE(review): `crossval` is not referenced by any other cell shown in this
# notebook -- presumably a number of cross-validation folds; confirm or remove.
crossval = 20
## Random seed (handed to np.random.seed in the training cell)
seed = 27
# Number of dimensions in the regression problem, i.e. units of the final Dense layer
reg_dimensions = 1

In [6]:
print ("")
print ("Reading pre-trained word embeddings...")

# Load the external word embeddings. `binary=False` selects the word2vec *text*
# format (the ".gz" suffix suggests the file is gzip-compressed). The returned
# KeyedVectors object is indexed by word (embeddings[word]) when the embedding
# matrix is filled in the training cell.
embeddings = KeyedVectors.load_word2vec_format( "twitter_sgns_subset.txt.gz" , binary=False )
print ("Reading text data for regression and building representations...")


Reading pre-trained word embeddings...
Reading text data for regression and building representations...


In [7]:
# Tab-separated file with one tweet per row (the displayed columns are SentID,
# Tweet, Emotion and Rating). Its Tweet column is what the tokenizer vocabulary
# is fitted on in the next cell.
df = pd.read_csv("vocab.csv", sep='\t')
df

Unnamed: 0,SentID,Tweet,Emotion,Rating
0,10941,At the point today where if someone says somet...,anger,0.000
1,10942,@CorningFootball IT'S GAME DAY!!!! T MIN...,anger,0.000
2,10943,This game has pissed me off more than any othe...,anger,0.000
3,10944,@spamvicious I've just found out it's Candice ...,anger,0.000
4,10945,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,anger,0.000
...,...,...,...,...
7097,40855,Common app just randomly logged me out as I wa...,sadness,0.833
7098,40856,"I'd rather laugh with the rarest genius, in be...",sadness,0.688
7099,40857,If you #invest in my new #film I will stop ask...,sadness,0.458
7100,40858,"Just watched Django Unchained, Other people ma...",sadness,0.333


In [8]:
## The entire vocabulary is estimated first, from a file that combines the Test & Train data.

# (tweet, rating) pairs, one per row of the combined file
full_voc_data = list(df[["Tweet", "Rating"]].itertuples(index=False, name=None))
full_data_size = len(full_voc_data)
all_texts = [pair[0] for pair in full_voc_data]

# `filters` does not contain '!', '"', '#', '$', '?' or '@', so those characters
# are not stripped from the tweets (hashtags and @-mentions stay intact).
tokenizer = Tokenizer(
    num_words=max_features,
    filters='%&()*+,-./:;<=>[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
)
tokenizer.fit_on_texts(all_texts)  # <-- word index built from ALL texts

In [9]:
# Tab-separated test set for the "anger" emotion; its Tweet and Rating columns
# are turned into padded sequences and gold labels in the next cell.
test = pd.read_csv("test/anger_CNN-LSTM_input.txt", sep='\t')
test

Unnamed: 0,Tweet,Rating
0,At the point today where if someone says somet...,0.319
1,@CorningFootball IT'S GAME DAY!!!! T MIN...,0.144
2,This game has pissed me off more than any othe...,0.898
3,@spamvicious I've just found out it's Candice ...,0.271
4,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,0.646
...,...,...
755,@ggreenwald What if the supposed animosity is ...,0.646
756,Will BYU's offense score 24+ vs WVU?,0.125
757,Id love 2 c Gyimah in action but his coach is ...,0.542
758,Forgiving means operating with God's spirit &a...,0.250


In [10]:
## Build the test inputs and gold labels ##
# (tweet, rating) pairs in file order
TSdata = list(test[["Tweet", "Rating"]].itertuples(index=False, name=None))
test_size = len(TSdata)
test_texts = [pair[0] for pair in TSdata]
test_labels = [pair[1] for pair in TSdata]
# Integer-encode each tweet with the fitted tokenizer, then pad/truncate it
# to exactly max_sent_len tokens.
test_sequences = sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_texts),
    maxlen=max_sent_len,
)
##############
##############

In [11]:
## Train one CNN-LSTM regressor per emotion named in EMOS (a whitespace-separated list).
## Only "anger" is listed here; note that the combination (afjs) is (yet) not used by our system.
EMOS="anger"

# Convolution / pooling hyper-parameters, passed to Convolution1D and
# MaxPooling1D below (32 filters of width 3, pooling over windows of 2).
filter_length = 3
nb_filter = 32
pool_length = 2

# Directory that receives one "<emotion>.h5" file per trained model.
# NOTE(review): absolute, Colab-style path; presumably it has to exist before
# model.save() is reached -- confirm before launching a long training run.
model_dir = "/content/emotion-intensity-prediction-/keras_regression/models/"

# Seed both RNGs *before* anything random is drawn, so that the random vectors
# of out-of-vocabulary words are the same on every run.
random.seed(seed)
np.random.seed(seed)

# The embedding matrix depends only on the tokenizer and the pre-trained
# vectors, not on the emotion, so it is built once. Row 0 stays all-zero
# (Tokenizer word indices start at 1; 0 is the padding value).
embedding_weights = np.zeros( ( max_features , embeddings_dim ) )
for word,index in tokenizer.word_index.items():
    if index < max_features:
        try:
            embedding_weights[index,:] = embeddings[word]
        except KeyError:
            # Word has no pre-trained vector: fall back to a random one.
            # Only KeyError is caught, so a real problem (e.g. vectors whose
            # size differs from embeddings_dim) is reported instead of being
            # silently replaced by noise.
            embedding_weights[index,:] = np.random.rand( embeddings_dim )

for currentemo in EMOS.split():
    # Re-seed per emotion so that each model's shuffle order and initial
    # state do not depend on which emotions were trained before it.
    # NOTE(review): the Keras backend's own RNG is not seeded here.
    random.seed(seed)
    np.random.seed(seed)

    temp_df = pd.read_csv("train/"+currentemo+"_tr_dv.csv", sep='\t')
    TRdata = list(temp_df[["Tweet","Rating"]].itertuples(index=False, name=None))
    random.shuffle( TRdata )
    train_size = len(TRdata)
    train_texts = [ txt for ( txt, label ) in TRdata ]
    train_labels = np.array([ label for ( txt , label ) in TRdata ])
    # pad_sequences already returns a NumPy array of shape (train_size, max_sent_len)
    train_sequences = sequence.pad_sequences( tokenizer.texts_to_sequences( train_texts ) , maxlen=max_sent_len )

    # Embedding (pre-trained weights) -> dropout -> 1D convolution -> max pooling
    # -> LSTM -> single sigmoid unit, i.e. a prediction in (0, 1).
    model = Sequential()
    model.add(Embedding(max_features, embeddings_dim, input_length=max_sent_len, weights=[embedding_weights]))
    model.add(Dropout(0.25))
    model.add(Convolution1D(filters=nb_filter, kernel_size=filter_length, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_length))
    model.add(LSTM(embeddings_dim))
    model.add(Dense(reg_dimensions))
    model.add(Activation('sigmoid'))
    model.compile(loss='mean_absolute_error', optimizer='adam')

    model.fit( train_sequences , train_labels , epochs=30, batch_size=16)
    model.save(model_dir+currentemo+".h5")  # writes the HDF5 file '<emotion>.h5'
    

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [12]:
# Predict an intensity for every padded test tweet with the model left over
# from the training loop.
# NOTE(review): test_sequences comes from pad_sequences, so the np.array()
# conversion is presumably a no-op copy -- confirm.
test_sequences = np.array(test_sequences)
results = model.predict( test_sequences )
# Disabled: dump the raw predictions to a text file, one value per line.
# np.savetxt("/content/emotion-intensity-prediction-/keras_regression/test/pred/"+currentemo+".txt", results, newline='\n')


In [13]:
# Mean squared error between the gold test ratings and the model predictions.
# `results` is a column of predictions (one row per test tweet), `test_labels`
# the matching 1-D array of gold ratings.
test_labels = np.array(test_labels)
mse = mean_squared_error(test_labels, results)

In [14]:
# Display the model's mean squared error on the test set.
mse

0.01911054209786421

In [15]:
# Print the predicted intensities returned by model.predict (one row per test tweet).
print(results)

[[0.40739197]
 [0.27555066]
 [0.5218489 ]
 [0.5577313 ]
 [0.5289149 ]
 [0.64020413]
 [0.38352755]
 [0.60591066]
 [0.36625522]
 [0.3169623 ]
 [0.56976134]
 [0.5775158 ]
 [0.4044249 ]
 [0.6542872 ]
 [0.5405839 ]
 [0.44410327]
 [0.42767614]
 [0.51057434]
 [0.5212598 ]
 [0.4728461 ]
 [0.61563355]
 [0.4869023 ]
 [0.52743626]
 [0.55211246]
 [0.37357488]
 [0.54482114]
 [0.50874007]
 [0.57501644]
 [0.5831641 ]
 [0.48305795]
 [0.38601747]
 [0.5208992 ]
 [0.64838195]
 [0.57481885]
 [0.5709633 ]
 [0.62930095]
 [0.44189745]
 [0.50338554]
 [0.53495103]
 [0.81197244]
 [0.73296404]
 [0.47111973]
 [0.45221528]
 [0.6539094 ]
 [0.5470284 ]
 [0.6619128 ]
 [0.5296949 ]
 [0.40485543]
 [0.52308166]
 [0.50044334]
 [0.4553193 ]
 [0.63059956]
 [0.5154291 ]
 [0.42319557]
 [0.7297125 ]
 [0.29121372]
 [0.2758641 ]
 [0.5014823 ]
 [0.5424186 ]
 [0.43000653]
 [0.47626245]
 [0.43422073]
 [0.42052764]
 [0.525714  ]
 [0.5200603 ]
 [0.41297847]
 [0.48041734]
 [0.46300375]
 [0.63121104]
 [0.3744856 ]
 [0.40941668]
 [0.33

In [16]:
# Print the gold ratings of the test tweets, for comparison with the predictions.
print(test_labels)

[0.319 0.144 0.898 0.271 0.646 0.583 0.375 0.625 0.396 0.25  0.438 0.708
 0.333 0.877 0.708 0.417 0.229 0.375 0.667 0.354 0.812 0.562 0.312 0.625
 0.458 0.521 0.583 0.458 0.271 0.188 0.271 0.708 0.417 0.292 0.771 0.521
 0.271 0.729 0.5   0.938 0.917 0.479 0.375 0.648 0.667 0.755 0.792 0.521
 0.479 0.5   0.417 0.5   0.396 0.333 0.798 0.375 0.173 0.771 0.625 0.906
 0.333 0.438 0.25  0.646 0.5   0.5   0.688 0.479 0.604 0.375 0.458 0.323
 0.542 0.229 0.375 0.354 0.465 0.521 0.562 0.292 0.521 0.396 0.646 0.396
 0.438 0.875 0.688 0.438 0.396 0.333 0.438 0.583 0.562 0.271 0.833 0.91
 0.771 0.851 0.296 0.292 0.479 0.458 0.521 0.492 0.312 0.292 0.562 0.583
 0.458 0.229 0.375 0.604 0.583 0.219 0.396 0.354 0.356 0.458 0.354 0.292
 0.121 0.417 0.84  0.354 0.271 0.192 0.292 0.438 0.208 0.292 0.688 0.604
 0.583 0.354 0.375 0.354 0.333 0.5   0.271 0.25  0.583 0.542 0.5   0.417
 0.688 0.417 0.542 0.688 0.312 0.521 0.396 0.375 0.333 0.729 0.708 0.688
 0.78  0.688 0.583 0.354 0.438 0.354 0.583 0.454 0.4

In [17]:
# Constant baseline: predict the mean gold rating for every test tweet.
# The mean is taken over the test labels themselves.
baseline = np.mean(test_labels) * np.ones(len(test_labels))

In [18]:
# Mean squared error of the constant (mean-rating) baseline on the test labels.
baseline_mse = mean_squared_error(test_labels, baseline)

In [19]:
# Display the baseline error, to be compared with the model's `mse`.
baseline_mse

0.02950585552458449