In [23]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.data import Sentence
import os
import numpy as np
import time

In [24]:
# Directory that holds the dataset CSV files.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
DATADIR = "/Users/bent/Documents/programming/cl-twitter-personality/"

def loadData(filename, numberLines=-1, maxWords=-1):
    """Load sentiment labels and tweet text from a CSV file under DATADIR.

    The first line of the file is treated as a header and skipped. Every other
    line is expected to have four comma-separated columns:
    index, sentiment (an integer), source, tweet text. Only the first three
    commas are treated as separators, so commas inside the tweet text are kept.

    Args:
        filename: CSV file name, joined onto DATADIR.
        numberLines: number of lines after the header to read; -1 reads the
            whole file.
        maxWords: keep at most this many whitespace-separated words of each
            tweet; -1 keeps the text as it appears in the file.

    Returns:
        dict with two index-aligned lists: "tweets" (Sentence objects) and
        "sentiment" (ints).

    Raises:
        IndexError: a non-blank line has fewer than four columns.
        ValueError: the sentiment column is not an integer.
    """
    data = {"tweets": [], "sentiment": []}

    start = time.time()
    minTokens = -1

    # `with` closes the file even when a row fails to parse.
    with open(os.path.join(DATADIR, filename), encoding="utf-8") as file:
        for lineNumber, line in enumerate(file):
            # first line is headers
            if lineNumber == 0:
                continue

            # only look at the first few lines
            if lineNumber == numberLines + 1:
                break

            # a blank line (e.g. at the end of the file) carries no data
            if not line.strip():
                continue

            # columns[0] is index
            # columns[1] is sentiment {0, 1}
            # columns[2] Source
            # columns[3] Tweet text (may itself contain commas, hence maxsplit=3)
            columns = line.split(",", 3)
            sentiment = int(columns[1])
            text = columns[3]

            # the minimum is tracked on the untruncated tweet
            words = text.split()
            if minTokens == -1 or len(words) < minTokens:
                minTokens = len(words)
            if maxWords != -1:
                text = ' '.join(words[0:maxWords])

            data["sentiment"].append(sentiment)
            data["tweets"].append(Sentence(text))

            if lineNumber % 100000 == 0:
                timeTaken = time.time() - start
                if numberLines >= 0:
                    timePerLine = timeTaken / (lineNumber + 1)
                    timeLeft = str(max(numberLines - lineNumber, 0) * timePerLine)
                else:
                    # total line count is not known when reading the whole file
                    timeLeft = "unknown"
                print("Line " + str(lineNumber) + " loaded. Time passed: " + str(timeTaken) + ". Time left: " + timeLeft + ".")

    print("Min number of tokens in data: " + str(minTokens))
    return data

In [25]:
def createEmbeddings(text, embeddings, numWords):
    """Embed every sentence and return exactly numWords vectors per sentence.

    Each sentence is passed to ``embeddings.embed`` and the ``embedding`` of
    each of its tokens is collected. Sentences with more than numWords tokens
    are cut to the first numWords; shorter ones are padded with zero vectors
    of the same width as the token embeddings, so the result is rectangular.

    Args:
        text: sized sequence of sentences; each sentence is iterable over
            tokens that expose ``.embedding`` after ``embeddings.embed``.
        embeddings: object with an ``embed(sentence)`` method. If it has an
            ``embedding_length`` attribute (presumably the case for flair
            embedding classes - confirm), that is the initial padding width.
        numWords: number of vectors per sentence; a negative value disables
            both truncation and padding.

    Returns:
        list (one entry per sentence) of lists of embedding vectors. Token
        vectors are returned as produced by ``embeddings``; padding vectors
        are NumPy zero arrays.
    """
    start = time.time()
    total = len(text)
    # Fallback width for padding when no token embedding has been seen yet;
    # 100 is the value the padding was previously hard-coded to.
    padWidth = getattr(embeddings, "embedding_length", 100)

    embeddingList = []
    for index, sentence in enumerate(text):
        embeddings.embed(sentence)
        sentenceEmbeddings = [token.embedding for token in sentence]

        # Pad with the width the embedder actually produced.
        if sentenceEmbeddings:
            padWidth = len(sentenceEmbeddings[0])

        if numWords >= 0:
            # The tokenizer may yield more tokens than expected.
            del sentenceEmbeddings[numWords:]
        sentenceEmbeddings.extend(
            np.zeros(padWidth) for _ in range(numWords - len(sentenceEmbeddings))
        )
        embeddingList.append(sentenceEmbeddings)

        if index % 100000 == 0:
            timeTaken = time.time() - start
            timePerEmbedding = timeTaken / (index + 1)
            timeLeft = (total - index - 1) * timePerEmbedding
            print("Embeddings: " + str(index + 1) + " / " + str(total) + " complete. Time passed: " + str(timeTaken) + ". Time left: " + str(timeLeft) + ".")

    return embeddingList

In [26]:
# from https://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison
def shuffle_in_unison(a, b):
    """Shuffle a and b in place, replaying the same random draws for both.

    The global NumPy RNG state is captured once and restored before each
    shuffle, so two sequences of equal length end up in the same order.
    Nothing is returned.
    """
    saved_state = np.random.get_state()
    for sequence in (a, b):
        # Rewind the generator so this shuffle consumes the same draws.
        np.random.set_state(saved_state)
        np.random.shuffle(sequence)

In [27]:
# Disabled alternative: stack GloVe with the forward and backward Flair embeddings.
#embeddings = StackedEmbeddings([
#                                WordEmbeddings('glove'), 
#                                FlairEmbeddings('news-forward'), 
#                                FlairEmbeddings('news-backward'),
#                              ])
# Embedding model handed to createEmbeddings further down.
embeddings = WordEmbeddings('glove')

# Passed to loadData as maxWords and to createEmbeddings as numWords.
numberWords = 35
# Passed to loadData as numberLines, i.e. how many lines after the header are read.
numberLines = 100

In [28]:
print('Loading data')
# Read numberLines tweets, each cut to at most numberWords whitespace-separated words.
data = loadData('SentimentAnalysisDataset.csv', numberLines, numberWords)
print('Data loaded')

# shuffle the data (currently disabled, so rows keep the order they have in the file)
#shuffle_in_unison(data["sentiment"], data["tweets"])

# create the embeddings: one list of numberWords-padded token vectors per tweet
print('Creating embeddings')
data["embeddings"] = createEmbeddings(data["tweets"], embeddings, numberWords)
print('Embeddings created')

# print one piece of data to make sure everything is good
print(data["sentiment"][0])
print(data["tweets"][0])
#print(data["embeddings"][0])

Loading data
Min number of tokens in data: 2
Data loaded
Creating embeddings
Embeddings: 1 / 100 complete. Time passed: 0.0006239414215087891. Time left: 0.06177020072937012.
Embeddings created
0
Sentence: "is so sad for my APL friend............." - 7 Tokens


In [29]:
# Turn the per-token embedding lists and the sentiment labels into NumPy arrays.

print("Tranforming embeddings into numpy array")
# The loop names deliberately avoid `embeddings`: that name holds the embedding
# model, and rebinding it here would break a later call to createEmbeddings.
for sentenceIndex, sentenceVectors in enumerate(data["embeddings"]):
    for tokenIndex, vector in enumerate(sentenceVectors):
        # Zero-padding entries (and every entry when this cell is re-run) are
        # already NumPy arrays and need no conversion.
        if isinstance(vector, np.ndarray):
            continue
        # change PyTorch tensor to NumPy array; detach/cpu make this work for
        # tensors that track gradients or live on a GPU as well
        data["embeddings"][sentenceIndex][tokenIndex] = vector.detach().cpu().numpy()
print("Done transforming embeddings")

embeddingDim = len(data["embeddings"][0][0])

print("Transforming data into numpy arrays")
data["embeddings"] = np.array(data["embeddings"])

# Add a trailing channel axis for Conv2D. reshape raises if the element count
# does not match, whereas np.resize would silently repeat or drop values.
data["embeddings"] = data["embeddings"].reshape((len(data["embeddings"]), numberWords, embeddingDim, 1))

data["sentiment"] = np.array(data["sentiment"])
print("Done transforming data")

Tranforming embeddings into numpy array
Done transforming embeddings
Transforming data into numpy arrays
Done transforming data


In [30]:
# Model inputs (embedded tweets) and targets (sentiment labels).
X, y = data["embeddings"], data["sentiment"]

print(X.shape, y.shape)

# Axis 0 indexes tweets, axis 1 words within a tweet, axis 2 word-vector components.
print(f"Word vector size: {X.shape[2]}")
print(f"Number of tweets: {X.shape[0]}")
print(f"Number of words per  tweet: {X.shape[1]}")

(100, 35, 100, 1) (100,)
Word vector size: 100
Number of tweets: 100
Number of words per  tweet: 35


In [31]:
# make the model
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D, Concatenate, Average, Input

model = Sequential()

model.add(Conv2D(64, (3,100), input_shape=X.shape[1:]))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,1)))

model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))

model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
# Train for 10 epochs in batches of 32, holding out 10% of X/y for validation.
# NOTE(review): the shuffle step earlier in the notebook is commented out; per the
# Keras docs validation_split takes the held-out rows from the end of X/y, so the
# validation set is presumably the last rows of the file - confirm this is intended.
model.fit(X, y, batch_size=32, validation_split=0.1, epochs=10)

Train on 90 samples, validate on 10 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x147c64e80>

In [22]:
# Count how many labels fall in each sentiment class and report their shares.
numI = sum(1 for label in y if label == 0)
numE = sum(1 for label in y if label == 1)

print(f"0 Proportion: {numI / len(y)}")
print(f"1 Proportion: {numE / len(y)}")

0 Proportion: 0.669
1 Proportion: 0.331


In [27]:
# Show the embedding matrix of the first tweet; the all-zero rows at the end are padding.
print(data['embeddings'][0])

[[[-0.15251 ]
  [ 0.14106 ]
  [ 0.62195 ]
  ...
  [ 0.36824 ]
  [ 0.28545 ]
  [-0.58772 ]]

 [[-0.077053]
  [ 1.5622  ]
  [ 0.69068 ]
  ...
  [-0.32393 ]
  [ 0.37064 ]
  [-1.3264  ]]

 [[-0.2857  ]
  [ 0.3816  ]
  [ 0.55507 ]
  ...
  [-0.06151 ]
  [-0.090062]
  [ 0.5536  ]]

 ...

 [[-0.33979 ]
  [ 0.20941 ]
  [ 0.46348 ]
  ...
  [-0.23394 ]
  [ 0.47298 ]
  [-0.028803]]

 [[ 0.      ]
  [ 0.      ]
  [ 0.      ]
  ...
  [ 0.      ]
  [ 0.      ]
  [ 0.      ]]

 [[ 0.      ]
  [ 0.      ]
  [ 0.      ]
  ...
  [ 0.      ]
  [ 0.      ]
  [ 0.      ]]]
