# Acronym Expansion in English Tweets 
> Acronyms are present all across social media to express information that is repetitive and well known. But acronyms can be ambiguous because there can be many expansions of the same acronym. This project aims to disambiguate between multiple expansions of an acronym given some context.


In [2]:
import pickle 
import gensim
import os
import sys
import json
import numpy as np
from acrlist import acr
import tensorflow as tf
from keras.models import Sequential
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from keras.layers import *
from keras.optimizers import *
from keras.models import load_model

Using TensorFlow backend.


## Acronym List

In [5]:
# List every known acronym, followed on the next line by its candidate expansions.
for acronym, expansions in acr.items():
    print(acronym, expansions, sep="\n")

brb
['be right back', 'bathroom break']
cc
['carbon copy', 'i understand']
dl
['download', 'down low', 'doing laundry']
eta
['estimated time of arrival', 'edited to add']
gf
['girlfriend', 'gluten-free']
gg
['good game', 'good grief']
gl
['good luck', 'get lost']
hoas
['hold on a second', 'heck of a shot']
hw
['homework', 'hardware']
ic
['i see', 'in character']
im
['instant messenger', 'instant message']
k
['ok', 'kiss']
lol
['laughing out loud', 'league of legends', 'lots of love', 'little old lady']
na
['not available', 'not applicable']
nc
['no comment', 'nice call', 'not cool']
nm
['not much', 'nevermind']
np
['no problem', 'neopets']
ot
['off topic', 'other topic', 'overtime']
pm
['pm', 'private message']
pos
['parent over shoulder', 'piece of shit', 'power of suggestion']
re
['regarding', 'resident evil']
rotfl
['rolling on the floor laughing', "rolling over freakin' laughing"]
smh
['shaking my head', 'smash my head', 'scratching my head']
sos
['someone over shoulder', 'save our

## Pre-Processing
`preprocess()` decodes a JSON-encoded tweet, lower-cases it, and replaces the expansion with its acronym. The loading loop then labels each tweet with the expansion it originally contained.

`create_tagged_document()` yields `TaggedDocument` objects (each tokenised tweet tagged with its expansion), the input format expected by gensim's doc2vec module.

In [2]:
def preprocess(tweet, acronym, expansion):
    """Decode one raw tweet and swap the spelled-out expansion for its acronym.

    Args:
        tweet: A JSON-encoded string (e.g. one line of a data file). Surrounding
            whitespace such as a trailing newline is accepted by the decoder.
        acronym: Replacement text, e.g. ``'ur'``.
        expansion: Phrase to replace, e.g. ``'you are'``. It is matched against
            the lower-cased tweet, so it should itself be lower case.

    Returns:
        The decoded, lower-cased tweet in which every whole-word occurrence of
        ``expansion`` has been replaced by ``acronym``. If ``expansion`` is
        empty the lower-cased tweet is returned unchanged.

    Raises:
        json.JSONDecodeError: If ``tweet`` is not valid JSON.
        AttributeError: If the decoded JSON value is not a string.
    """
    import re  # local import keeps this cell self-contained

    text = json.loads(tweet).lower()
    if not expansion:
        # An empty pattern would match between every pair of characters.
        return text

    # Only replace the expansion when it stands alone as a word/phrase, so that
    # e.g. 'ok' is not rewritten inside 'book' nor 'your' inside 'yourself'.
    pattern = r"(?<!\w)" + re.escape(expansion) + r"(?!\w)"
    # A function replacement inserts the acronym literally (no backslash escapes).
    return re.sub(pattern, lambda _match: acronym, text)

In [5]:
# Acronym whose expansions are being disambiguated in this notebook.
TARGET_ACRONYM = 'ur'


def _load_split(directory, acronym):
    """Load the tweets of one data split for every expansion of ``acronym``.

    For each expansion listed in ``acr[acronym]`` the file
    ``<directory>/<expansion>.txt`` is read line by line; every non-blank line
    is passed through ``preprocess()`` and split on whitespace.

    Args:
        directory: Folder holding one ``<expansion>.txt`` file per expansion.
        acronym: Key into ``acr``; also the text that replaces each expansion.

    Returns:
        ``(tweets, labels)`` - two parallel lists: the tokenised tweets and,
        for each tweet, the expansion whose file it was read from.
    """
    tweets, labels = [], []
    for expansion in acr[acronym]:
        path = os.path.join(directory, str(expansion) + ".txt")
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(path, encoding="utf-8") as file:
            for line in file:
                if not line.strip():
                    continue  # a blank line is not valid JSON
                tweets.append(preprocess(line, acronym, expansion).split())
                labels.append(expansion)
    return tweets, labels


train_data_tweets, train_data_acronyms = _load_split("train_data", TARGET_ACRONYM)
test_data_tweets, test_data_acronyms = _load_split("test_data", TARGET_ACRONYM)

In [6]:
def create_tagged_document(split_tweets, data_acronyms):
    """Lazily wrap each tokenised tweet in a gensim ``TaggedDocument``.

    The tweet at position ``i`` of ``split_tweets`` becomes the document's
    ``words``; its ``tags`` is the one-element list ``[data_acronyms[i]]``.
    """
    make_document = gensim.models.doc2vec.TaggedDocument
    yield from (
        make_document(words=tokens, tags=[data_acronyms[position]])
        for position, tokens in enumerate(split_tweets)
    )


# Materialise both generators so the corpora can be iterated more than once.
train_data = [*create_tagged_document(train_data_tweets, train_data_acronyms)]
test_data = [*create_tagged_document(test_data_tweets, test_data_acronyms)]

In [11]:
# Peek at one tagged training document and one tagged test document,
# separated by a blank line.
print(train_data[1], test_data[0], sep="\n\n")

TaggedDocument(['imagine', 'spending', 'ur', 'entire', 'life', 'working', 'to', 'move', 'a', 'stone', 'for', 'the', 'pyramids', 'just', 'for', 'people', 'in', 'the', 'future', 'to', 'give', 'the', 'credit', 'to', 'aliens'], ['your'])

TaggedDocument(['@socialmedia2day:', "instagram's", "'checkout'", 'on-platform', 'shopping', 'tools', 'are', 'slowly', 'being', 'rolled', 'out', 'to', 'more', 'profiles', '-', 'how', 'will', 'it', 'change', 'yo…'], ['your'])


## Doc2Vec Model

In [10]:
# vector_size=50 fixes the length of every document vector; the classifier
# further down declares a matching input_shape=(50,).
dmodel = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,
    min_count=2,
    epochs=50,
)

# Build the vocabulary from the tagged training tweets, then train on the same
# corpus using the example count and epoch count stored on the model itself.
dmodel.build_vocab(train_data)
dmodel.train(
    train_data,
    total_examples=dmodel.corpus_count,
    epochs=dmodel.epochs,
)

### Infer vector for a sample tweet:
Now that our Doc2Vec model is trained, we can infer vectors for sample tweets. Here, we infer the vector for "@realdonaldtrump ur the best!".

In [13]:
# Tokenise the sample tweet on whitespace and embed it with the trained model.
sample_tokens = "@realdonaldtrump ur the best!".split()
v = dmodel.infer_vector(sample_tokens)

# Keep only the first (tag, score) pair returned by most_similar.
res_tup = dmodel.docvecs.most_similar([v])[0]

In [17]:
# Show the vector inferred for the sample tweet.
print(v)

[-0.3730057   0.03873225  0.18107004  0.49245355  0.01146623  0.03212615
  0.02637365 -0.26329508  0.09609044  0.00057228  0.00530779  0.04192339
  0.09749373  0.108716   -0.06374781 -0.21909876  0.19668134  0.36210972
  0.04201772  0.26893148 -0.39092022  0.29757932 -0.21173681 -0.05808842
  0.5106051  -0.4477003   0.3142922   0.10621338  0.16630854  0.07746315
  0.15002309  0.37434423  0.28614601  0.08227687 -0.2194723   0.11819626
  0.10542516  0.36624688 -0.13511929 -0.00911398  0.0438503   0.00173114
 -0.10187595 -0.18324928 -0.14399385  0.36478555 -0.24429378  0.05013009
  0.11764859  0.04702626]


### Prediction using Cosine Similarity Score
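Cosine similarity measures how closely two vectors in the vector space point in the same direction. The expansion tag most similar to the inferred tweet vector is taken as the prediction.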

In [26]:
# Show the first match found for the sample tweet: (expansion tag, similarity score).
print(res_tup)

('you are', 0.42928239703178406)


In [27]:
# Placeholders for features and labels; they are rebound later in this cell
# by the calls to vector_for_learning().
X_train, X_test, Y_train, Y_test = [], [], [], []

# Class labels: a document's target is the position of its tag in this list.
CATEGORIES = ["you are", "your"]

def vector_for_learning(model, input_docs, categories=None):
    """Turn tagged documents into aligned (targets, feature_vectors) tuples.

    Args:
        model: Object exposing ``infer_vector(words)``.
        input_docs: Iterable of documents with ``words`` and ``tags``
            attributes; only the first tag of each document is used.
        categories: Ordered class labels. A document's target is the index of
            its first tag in this sequence. Defaults to the module-level
            ``CATEGORIES`` list.

    Returns:
        ``(targets, feature_vectors)``: a tuple of integer class indices and a
        tuple of NumPy arrays, one entry per document and in document order.
        Both tuples are empty when ``input_docs`` is empty.

    Raises:
        ValueError: If a document's first tag is not present in ``categories``.
    """
    if categories is None:
        categories = CATEGORIES

    targets, feature_vectors = [], []
    for doc in input_docs:
        targets.append(categories.index(doc.tags[0]))
        feature_vectors.append(np.array(model.infer_vector(doc.words)))

    # Tuples keep the return type of the earlier zip(*...)-based version, while
    # an empty input now yields ((), ()) instead of failing to unpack.
    return tuple(targets), tuple(feature_vectors)

# Build classifier inputs for both splits; the helper returns targets first.
Y_train, X_train = vector_for_learning(dmodel, train_data)
Y_test, X_test = vector_for_learning(dmodel, test_data)

# Sanity check: first training feature vector, then its class index.
print(X_train[0], Y_train[0], sep="\n")

[-1.1517795   0.59988225  0.17350073  1.4895848   0.6639548  -0.49600956
  0.4742393  -1.0350697   0.595307    0.1914485  -0.32374594  0.1414308
 -0.49800748 -0.5194397   0.5195279  -1.3431836  -0.08917648 -0.09984987
  0.02807552  0.9183444   0.7245053   0.18541253 -0.9700345  -0.1266044
  0.66724527  0.24285768  1.0017112  -0.21868709  0.6122592  -0.5060775
  0.08751622 -0.13614912  0.6777088   0.8275376  -1.668751    0.37461823
 -0.32881874  0.63455224 -0.83424807 -0.5471584  -1.0064769  -0.411073
 -0.81983805 -0.69994146 -0.22027121 -0.62154084 -0.04591395 -0.04753913
 -1.0543858  -0.1840892 ]
1


## Multilayer Perceptron (MLP) for multi-class softmax classification

In [29]:
# Normalise every document vector with tf.keras.utils.normalize (library
# defaults), then make the training labels an array for Keras.
xx = tf.keras.utils.normalize(X_train)
xtest = tf.keras.utils.normalize(X_test)
Y_train = np.array(Y_train)

# Derive the layer sizes from the data instead of repeating the literals 50
# and 2, so the network keeps matching the Doc2Vec vector size and the
# CATEGORIES list if either is changed.
n_features = np.asarray(xx).shape[1]
n_classes = len(CATEGORIES)

# Fully connected network: 64 -> 128 -> 128 -> 64 ReLU units with dropout
# after the first three hidden layers, followed by a softmax over the classes.
model = Sequential([
    Dense(64, input_shape=(n_features,), activation=tf.nn.relu),
    Dropout(0.3),
    Dense(128, activation=tf.nn.relu),
    Dropout(0.3),
    Dense(128, activation=tf.nn.relu),
    Dropout(0.3),
    Dense(64, activation=tf.nn.relu),
    Dense(n_classes, activation=tf.nn.softmax),
])
model.compile(
    optimizer='adam',
    # Sparse loss: the targets are integer class indices, not one-hot rows.
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(np.array(xx), Y_train, epochs=50)

# NOTE: despite the `val_` names, these are measured on the test split.
val_loss, val_acc = model.evaluate(np.array(xtest), np.array(Y_test))
print(val_loss, val_acc)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
0.6908596870458882 0.695437312561996
