In [1]:
import pickle 
import gensim
import os
import sys
import json
import numpy as np
from acrlist import acr
import tensorflow as tf
from keras.models import Sequential
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from keras.layers import *
from keras.optimizers import *
from keras.models import load_model

Using TensorFlow backend.


In [2]:
def _load_tweets(split_dir, expansion, acronym='gg'):
    """Read one expansion's tweet file and return its tokenised tweets.

    The file is ``<split_dir>/<expansion>.txt``. Every non-blank line is
    decoded with ``json.loads``, lower-cased, has each occurrence of
    ``expansion`` replaced by ``acronym`` and is split on whitespace.
    # NOTE(review): presumably each line holds one JSON-encoded tweet string
    # -- confirm against the script that produced the data files.

    Args:
        split_dir: directory holding the split ("train_data" or "test_data").
        expansion: the acronym expansion; names the file and is the text
            that gets replaced inside each tweet.
        acronym: replacement token written in place of the expansion.

    Returns:
        A list of token lists, one per tweet, in file order.
    """
    tokenised = []
    path = os.path.join(split_dir, str(expansion) + ".txt")
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(path, encoding="utf-8") as tweet_file:
        for line in tweet_file:
            # A trailing or stray blank line would make json.loads raise.
            if not line.strip():
                continue
            text = json.loads(line).lower()
            tokenised.append(text.replace(expansion, acronym).split())
    return tokenised


# Parallel lists: *_tweets[i] is a token list, *_acronyms[i] its expansion label.
train_data_tweets = []
train_data_acronyms = []
test_data_tweets = []
test_data_acronyms = []

for expansion in acr['gg']:
    train_tweets = _load_tweets("train_data", expansion)
    train_data_tweets.extend(train_tweets)
    train_data_acronyms.extend([expansion] * len(train_tweets))

    test_tweets = _load_tweets("test_data", expansion)
    test_data_tweets.extend(test_tweets)
    test_data_acronyms.extend([expansion] * len(test_tweets))

In [3]:
def create_tagged_document(split_tweets, data_acronyms):
  """Lazily pair each tokenised tweet with its label as a TaggedDocument.

  Yields one ``gensim.models.doc2vec.TaggedDocument`` per entry of
  ``split_tweets``; the document's ``words`` are the tweet tokens and its
  ``tags`` is a one-element list holding ``data_acronyms`` at the same index.
  """
  make_document = gensim.models.doc2vec.TaggedDocument
  for position, tokens in enumerate(split_tweets):
    label = data_acronyms[position]
    yield make_document(words=tokens, tags=[label])
    
# Materialise the generators so the corpora can be indexed and iterated
# more than once (vocabulary building and training both walk train_data).
train_data = list(create_tagged_document(train_data_tweets, train_data_acronyms))
test_data = list(create_tagged_document(test_data_tweets, test_data_acronyms))

# Show the second and first document of each split as a quick sanity check.
for corpus, index in ((train_data, 1), (train_data, 0), (test_data, 1), (test_data, 0)):
  print(corpus[index])

TaggedDocument(['hahahaha', '@andrewrobertso5', ',', 'btw', "you've", 'played', 'a', 'gg', 'just', 'now', 'https://t.co/fiooaattit'], ['good game'])
TaggedDocument(['gg', '@dame_lillard.', '#nbaplayoffs'], ['good game'])
TaggedDocument(['@vinijrmadrid', 'yh', 'i', 'agree', 'frenkie', 'is', 'overrated.', 'however,', 'have', 'you', 'ever', 'seen', 'a', 'gg', 'from', 'varane', 'without', 'ramos', 'by', 'his', 'side?'], ['good game'])
TaggedDocument(['been', 'playing', 'a', 'ton', 'of', 'mw3', 'recently', 'and', 'dang....', 'it', 'is', 'such', 'a', 'gg'], ['good game'])


In [4]:
# Doc2Vec hyper-parameters; vector_size=50 is the width that classifier()
# declares as its input_shape.
doc2vec_settings = dict(vector_size=50, min_count=2, epochs=50)

model = gensim.models.doc2vec.Doc2Vec(**doc2vec_settings)
# The vocabulary is built from the training split only.
model.build_vocab(train_data)
model.train(
    train_data,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [5]:
# Sanity check: embed an unseen sentence and look up the closest tag vector.
# NOTE(review): `docvecs` and the `steps=` keyword look like gensim 3.x names;
# newer gensim releases renamed them -- confirm the pinned gensim version.
sample_tokens = "el classico was gg".split()
v = model.infer_vector(sample_tokens)
nearest_tags = model.docvecs.most_similar([v])
res_tup = nearest_tags[0]  # (tag, similarity) of the best match
print("Doc2Vec Vector & Prediction:")
print(np.array(v))
print(res_tup)

# Compare against the first training document: its label, its tokens and two
# inferred vectors for the same tokens (the second call passes steps=20).
first_doc = train_data[0]
print(first_doc.tags[0])
print(first_doc.words)
print(model.infer_vector(first_doc.words))
print(model.infer_vector(first_doc.words, steps=20))

Doc2Vec Vector & Prediction:
[-0.0892152   0.26211968 -0.02735389 -0.15572265  0.01929537 -0.19861983
  0.26863602  0.17998381 -0.05489823  0.20054746  0.00367287 -0.01180489
  0.23448227 -0.11229099 -0.17844062  0.17398682  0.06106514 -0.11505175
  0.07442122  0.01000812  0.07821944  0.22736843  0.01226939 -0.06567281
  0.1394131  -0.21332549  0.39639238  0.09560472 -0.14940207 -0.35117173
 -0.03404063  0.36091828  0.2219844   0.06580997  0.28710154  0.18908893
  0.20198484  0.03063935 -0.02680985  0.18092312 -0.17294838 -0.15409076
  0.10034756  0.11244082  0.13534042 -0.08144362  0.40700454  0.2515625
  0.26704383 -0.22430523]
('good game', 0.5791212916374207)
good game
['gg', '@dame_lillard.', '#nbaplayoffs']
[-0.08902215  0.19461791  0.04666947 -0.09826236 -0.04084231 -0.16117726
 -0.01098028  0.14934994 -0.0390399   0.13313478  0.0411016  -0.00123444
  0.16371612 -0.00248014 -0.22178026 -0.03816758  0.12654068 -0.23927973
  0.10196987  0.13858032  0.06449875  0.03366532  0.193339

In [6]:
def classifier(X_train, Y_train, X_test, Y_test):
  """Train and evaluate a small dense network on Doc2Vec feature vectors.

  Takes Doc2Vec vectors as input (instead of word embeddings) and learns to
  predict the acronym expansion label of each vector.

  Args:
    X_train: sequence of training feature vectors (list/tuple of arrays or a
      2-D array); anything beyond the first axis is flattened per sample.
    Y_train: sequence of hashable, sortable training labels (one per vector).
    X_test: test feature vectors, same layout as ``X_train``.
    Y_test: test labels; every label must also occur in ``Y_train``.

  Returns:
    The fitted Keras ``Sequential`` model. Output unit ``i`` corresponds to
    ``sorted(set(Y_train))[i]``.

  Raises:
    ValueError: if the training set is empty or ``Y_test`` contains a label
      that never appears in ``Y_train``.
  """
  if len(Y_train) == 0:
    raise ValueError("classifier() needs at least one training example")

  # Raw labels cannot be fed to a softmax/cross-entropy head; map each one to
  # an integer class index and train with the sparse cross-entropy loss.
  class_labels = sorted(set(Y_train))
  label_to_index = {label: index for index, label in enumerate(class_labels)}
  unseen = set(Y_test) - set(class_labels)
  if unseen:
    raise ValueError("Y_test contains labels absent from Y_train: %r" % sorted(unseen))

  # Keras expects arrays, not tuples/lists of per-sample vectors. Flattening
  # here replaces the former leading Flatten() layer.
  train_features = np.asarray(X_train, dtype="float32")
  train_features = train_features.reshape(len(train_features), -1)
  test_features = np.asarray(X_test, dtype="float32")
  test_features = test_features.reshape(len(test_features), -1)
  train_targets = np.asarray([label_to_index[label] for label in Y_train])
  test_targets = np.asarray([label_to_index[label] for label in Y_test])

  tf_model = Sequential()
  # input_shape belongs on the first layer; derive it from the data instead
  # of hard-coding the Doc2Vec vector size.
  tf_model.add(Dense(128, activation="relu", input_shape=(train_features.shape[1],)))
  tf_model.add(Dense(64, activation="relu"))
  tf_model.add(Dense(len(class_labels), activation="softmax"))
  tf_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=['accuracy'])
  # `epochs` is the Keras 2 name of the former `nb_epoch` argument.
  tf_model.fit(train_features, train_targets, batch_size=32, epochs=3, verbose=1)
  score, acc = tf_model.evaluate(test_features, test_targets, verbose=1, batch_size=32)
  print("Score: %.2f" % (score))
  print("Validation Accuracy: %.2f" % (acc))
  return tf_model

In [7]:
# Empty placeholders for the feature matrices and label lists; each name is
# bound to its own, separate list object.
X_train, X_test, Y_train, Y_test = [], [], [], []

def vector_for_learning(model, input_docs):
    """Infer a feature vector for every tagged document.

    Args:
        model: object exposing ``infer_vector(words)``; here the trained
            Doc2Vec model.
        input_docs: iterable of documents with ``words`` and ``tags``
            attributes. Only the first tag of each document is used.

    Returns:
        ``(targets, feature_vectors)``: two tuples of equal length holding,
        per document, its first tag and the inferred vector as a NumPy
        array. Both tuples are empty when ``input_docs`` is empty (the
        previous ``zip(*...)`` unpacking raised ``ValueError`` in that case).
    """
    targets = []
    feature_vectors = []
    for doc in input_docs:
        targets.append(doc.tags[0])
        feature_vectors.append(np.array(model.infer_vector(doc.words)))
    return tuple(targets), tuple(feature_vectors)
# Turn every tagged tweet into (label, inferred Doc2Vec vector); the helper
# returns the labels first and the feature vectors second.
Y_train, X_train = vector_for_learning(model,train_data)
Y_test, X_test = vector_for_learning(model,test_data)
print(X_train[0])

[-0.09955219  0.21355869  0.00499058 -0.03739279 -0.13394399 -0.15706407
  0.06024845  0.09879696  0.04302968  0.08702505  0.01033139 -0.0045344
  0.17512321 -0.01450355 -0.15737897  0.06691764  0.16409363 -0.12645192
  0.06212486  0.14132154  0.00557158 -0.02015231  0.17861135 -0.22822525
 -0.00230958 -0.20046216  0.45161894  0.06304871  0.13707675 -0.09345617
  0.10469544 -0.05117442  0.12072166  0.01753756  0.17489526  0.22259988
  0.17537904 -0.06440052 -0.16848782  0.2161144   0.08175163 -0.09842881
 -0.10066889 -0.06417364  0.09179608 -0.01166982  0.262457   -0.03282921
 -0.04030594  0.11259526]


In [8]:
#class_model = classifier(X_train, Y_train, X_test, Y_test)

In [9]:
# Baseline: logistic regression on the inferred Doc2Vec vectors. C is the
# inverse regularisation strength in scikit-learn, so C=1e5 regularises
# only very weakly.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, Y_train)
y_pred = logreg.predict(X_test)
# The labels are acronym expansions of tweets; the earlier "movie plots"
# wording did not describe this dataset, and the accuracy line had no
# separator before the value.
print('Testing accuracy for acronym expansions: %s' % accuracy_score(Y_test, y_pred))
print('Testing F1 score for acronym expansions: {}'.format(f1_score(Y_test, y_pred, average='weighted')))



Testing accuracy for movie plots0.7531451365449524
Testing F1 score for movie plots: 0.7548659004307904
