In [14]:
import pickle 
import gensim
import os
import sys
import json
import numpy as np
from acrlist import acr
import tensorflow as tf
from keras.models import Sequential
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from keras.layers import *
from keras.optimizers import *
from keras.models import load_model

In [21]:
def _load_acronym_tweets(data_dir, expansion, acronym='gg'):
    """Read one expansion's tweet file and return its tokenised tweets.

    Every non-blank line of ``<data_dir>/<expansion>.txt`` is decoded with
    ``json.loads`` and must yield a string.  The tweet is lower-cased, each
    occurrence of ``expansion`` is replaced by ``acronym``, and the result is
    split on whitespace.

    Args:
        data_dir: directory holding the per-expansion ``.txt`` files.
        expansion: the spelled-out phrase; also the file's base name.
        acronym: token substituted for the expansion inside each tweet.

    Returns:
        A list with one list of tokens per tweet, in file order.
    """
    tokenised = []
    # JSON text is UTF-8 by specification, so do not rely on the platform's
    # default encoding when opening the file.
    with open(data_dir + "/" + str(expansion) + ".txt", encoding="utf-8") as tweet_file:
        # Stream the file line by line instead of loading it all at once.
        for line in tweet_file:
            if not line.strip():
                continue  # a blank or trailing line would crash json.loads
            tweet = json.loads(line).lower()
            tokenised.append(tweet.replace(expansion, acronym).split())
    return tokenised


train_data_tweets = []
train_data_acronyms = []
test_data_tweets = []
test_data_acronyms = []

# The tweets and acronyms lists stay index-aligned: entry i of the acronym
# list is the expansion that was masked out of tweet i.
for expansion in acr['gg']:
    train_tweets = _load_acronym_tweets("train_data", expansion)
    train_data_tweets.extend(train_tweets)
    train_data_acronyms.extend([expansion] * len(train_tweets))

    test_tweets = _load_acronym_tweets("test_data", expansion)
    test_data_tweets.extend(test_tweets)
    test_data_acronyms.extend([expansion] * len(test_tweets))

In [25]:
def create_tagged_document(split_tweets, data_acronyms):
  """Lazily wrap tokenised tweets as gensim ``TaggedDocument`` objects.

  The tweet at position ``i`` of ``split_tweets`` becomes the document's
  ``words``; its single tag is ``data_acronyms[i]``.
  """
  yield from (
      gensim.models.doc2vec.TaggedDocument(words=tokens, tags=[data_acronyms[position]])
      for position, tokens in enumerate(split_tweets)
  )
    
# Materialise the generators so the corpora can be indexed and iterated again.
train_data = [*create_tagged_document(train_data_tweets, train_data_acronyms)]
test_data = [*create_tagged_document(test_data_tweets, test_data_acronyms)]

# Sanity check: show the second and then the first document of each corpus.
for corpus in (train_data, test_data):
  for position in (1, 0):
    print(corpus[position])

TaggedDocument(['hahahaha', '@andrewrobertso5', ',', 'btw', "you've", 'played', 'a', 'gg', 'just', 'now', 'https://t.co/fiooaattit'], ['good game'])
TaggedDocument(['gg', '@dame_lillard.', '#nbaplayoffs'], ['good game'])
TaggedDocument(['@vinijrmadrid', 'yh', 'i', 'agree', 'frenkie', 'is', 'overrated.', 'however,', 'have', 'you', 'ever', 'seen', 'a', 'gg', 'from', 'varane', 'without', 'ramos', 'by', 'his', 'side?'], ['good game'])
TaggedDocument(['been', 'playing', 'a', 'ton', 'of', 'mw3', 'recently', 'and', 'dang....', 'it', 'is', 'such', 'a', 'gg'], ['good game'])


In [26]:
# Fix the RNG seed so vector initialisation and sampling start from the same
# state on every run.
# NOTE(review): per the gensim documentation a fully deterministic run also
# needs workers=1 and a fixed PYTHONHASHSEED; the default worker count is kept
# here so that training speed is unchanged.
DOC2VEC_SEED = 42

# 50-dimensional document vectors, ignoring words seen fewer than 2 times.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=50, seed=DOC2VEC_SEED)
model.build_vocab(train_data)
model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

In [27]:
# Embed an unseen sentence and look up the most similar document tag.
query_tokens = "el classico was gg".split()
v = model.infer_vector(query_tokens)
nearest_tags = model.docvecs.most_similar([v])
res_tup = nearest_tags[0]

print("Doc2Vec Vector & Prediction:")
print(np.array(v))
print(res_tup)

# Compare against the first training document: its tag, its tokens, and two
# freshly inferred vectors (default settings, then steps=20).
first_doc = train_data[0]
print(first_doc.tags[0])
print(first_doc.words)
print(model.infer_vector(first_doc.words))
print(model.infer_vector(first_doc.words, steps=20))

Doc2Vec Vector & Prediction:
[ 0.11610815 -0.09637776  0.13905996  0.02995563 -0.07188453  0.2620153
 -0.30154127 -0.0505563  -0.00487955 -0.22807415  0.03706959 -0.01216694
  0.01565087  0.29274997  0.00140057  0.1917762   0.01242988 -0.04357467
  0.24994367  0.02817337 -0.28806245  0.05678324  0.03414504 -0.01749791
 -0.31386104  0.08919828 -0.01148174  0.13156617  0.29845026  0.3045436
 -0.12027419 -0.4585482   0.04902999  0.3655776  -0.06130926 -0.07447112
 -0.3835978   0.36913866 -0.24127743  0.13647544  0.04905547  0.14781152
  0.35666052 -0.04103333  0.07582591 -0.05212718 -0.05658853  0.01944257
  0.15880005 -0.20482947]
('good game', 0.5393985509872437)
good game
['gg', '@dame_lillard.', '#nbaplayoffs']
[-0.12985063 -0.04627874  0.03161131 -0.08467706 -0.2287561   0.06795514
 -0.07820859 -0.01356712 -0.05916797  0.01745098  0.05414018 -0.08329126
  0.00305788  0.06248035 -0.2394832   0.19107407  0.16665997  0.00084008
  0.12116086  0.17829508 -0.11109615  0.06028922 -0.0306006

In [28]:
def classifier(X_train, Y_train, X_test, Y_test):
  """Fit and evaluate a small dense softmax network on document vectors.

  Takes Doc2Vec document vectors (instead of word embeddings) as the network
  input and learns to predict the label attached to each vector.

  Args:
    X_train: non-empty sequence of equal-length numeric feature vectors.
    Y_train: one hashable label per training vector (e.g. expansion strings).
    X_test: held-out feature vectors in the same format as ``X_train``.
    Y_test: labels for ``X_test``; each must also occur in ``Y_train``.

  Returns:
    The fitted Keras ``Sequential`` model.  Output unit ``i`` corresponds to
    the ``i``-th distinct label of ``Y_train`` in first-seen order.

  Raises:
    ValueError: if ``X_train`` is empty or ``Y_test`` contains a label that
      never appears in ``Y_train``.
  """
  if len(X_train) == 0:
    raise ValueError("X_train must contain at least one feature vector")

  # Keras needs dense numeric matrices, while callers pass tuples/lists of
  # vectors.  Reshaping to (n_samples, -1) does the job of the former Flatten
  # layer and lets the input width come from the data instead of a fixed 50.
  train_features = np.asarray(X_train, dtype="float32")
  train_features = train_features.reshape(len(train_features), -1)
  n_features = train_features.shape[1]
  test_features = np.asarray(X_test, dtype="float32")
  test_features = test_features.reshape(len(test_features), n_features)

  # The labels are arbitrary hashables (strings here), which a cross-entropy
  # loss cannot consume directly: map each distinct label to an integer id.
  class_index = {}
  for label in Y_train:
    class_index.setdefault(label, len(class_index))
  unknown = [label for label in Y_test if label not in class_index]
  if unknown:
    raise ValueError("Y_test contains labels absent from Y_train: %r" % sorted(set(unknown), key=repr))
  train_targets = np.asarray([class_index[label] for label in Y_train], dtype="int64")
  test_targets = np.asarray([class_index[label] for label in Y_test], dtype="int64")

  tf_model = Sequential()
  tf_model.add(Dense(128, activation="relu", input_shape=(n_features,)))
  tf_model.add(Dense(64, activation="relu"))
  tf_model.add(Dense(len(class_index), activation="softmax"))
  # Integer class ids pair with the sparse variant of categorical cross-entropy.
  tf_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=['accuracy'])
  # `epochs` is the Keras 2 name of the old Keras 1 `nb_epoch` argument.
  tf_model.fit(train_features, train_targets, batch_size=32, epochs=3, verbose=1)
  score, acc = tf_model.evaluate(test_features, test_targets, verbose=1, batch_size=32)
  print("Score: %.2f" % (score))
  print("Validation Accuracy: %.2f" % (acc))
  return tf_model

In [30]:
# Feature and label containers; each name gets its own fresh empty list.
X_train, X_test, Y_train, Y_test = ([] for _ in range(4))

def vector_for_learning(model, input_docs):
    """Turn tagged documents into parallel label / feature-vector tuples.

    Args:
        model: object exposing ``infer_vector(words)``; called once per
            document, in input order.
        input_docs: iterable of documents with ``tags`` and ``words``
            attributes.  Only the first tag of each document is used.

    Returns:
        ``(targets, feature_vectors)``: two equal-length tuples holding each
        document's first tag and its inferred vector as a NumPy array.  Both
        are empty tuples when ``input_docs`` is empty (the previous
        ``zip(*...)`` form raised ``ValueError`` in that case).
    """
    targets = []
    feature_vectors = []
    for doc in input_docs:
        targets.append(doc.tags[0])
        feature_vectors.append(np.array(model.infer_vector(doc.words)))
    return tuple(targets), tuple(feature_vectors)
# Infer one Doc2Vec vector per tagged tweet; the labels are the documents'
# expansion tags.  (A disabled, string-quoted copy of the file-reading loop
# that used to sit here was dead code and has been dropped.)
Y_train, X_train = vector_for_learning(model, train_data)
Y_test, X_test = vector_for_learning(model, test_data)
print(X_train[0])

[-0.21349783  0.00385499 -0.0155456  -0.09196007 -0.27987805  0.04231379
 -0.07122794  0.00120678 -0.03665324  0.00331702  0.09748079 -0.02718358
  0.00217732  0.11261013 -0.22843425  0.10728273  0.12206127  0.03555213
  0.05433202  0.16443211 -0.07678624  0.14111248  0.02860417  0.04649205
 -0.33513197  0.08422614  0.08300844  0.04795912 -0.05511283  0.11051835
 -0.10859407 -0.19732864  0.02614488  0.17108893  0.10243626 -0.14008847
 -0.2484317   0.09366611 -0.06665147  0.15167387  0.13106623  0.01957054
  0.16814987 -0.07924499  0.03290019  0.03467407 -0.34103623 -0.01297903
  0.035485   -0.24149689]


In [31]:
#class_model = classifier(X_train, Y_train, X_test, Y_test)



Testing accuracy for movie plots0.7517643448910709
Testing F1 score for movie plots: 0.7537539921709248


In [32]:
# Logistic-regression baseline on the inferred document vectors.
# C is the inverse regularisation strength, so 1e5 applies almost no penalty.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, Y_train)
y_pred = logreg.predict(X_test)
# The labels are acronym expansions of tweets (the old messages said
# "movie plots" and the accuracy line lacked a separator before the value).
print('Testing accuracy for acronym expansion: %s' % accuracy_score(Y_test, y_pred))
print('Testing F1 score for acronym expansion: {}'.format(f1_score(Y_test, y_pred, average='weighted')))



Testing accuracy for movie plots0.7517643448910709
Testing F1 score for movie plots: 0.7537539921709248
