In [8]:
'''
requirements:
beautifulsoup4
lxml
tensorflow
numpy
pandas
emoji
selenium

'''
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Attention,Bidirectional,Dense,Embedding,LSTM,GRU
from tensorflow.keras.initializers import Constant

In [12]:
# Sizes shared by the tokenizer padding, the embedding matrix and the model.
max_tweet_len=50
embedding_size=100

# Tweet table stored as a gzip-compressed CSV.
sas=pd.read_csv('data/stephenasmith_sa.csv.gz',compression='gzip')

# Binary target: 1 where std_favorite_count is above -0.03, otherwise 0.
# NOTE(review): the -0.03 cut-off is not explained anywhere in this notebook;
# presumably it was chosen to balance the two classes -- confirm.
sas['y']=(sas['std_favorite_count']>-0.03).astype(int)

In [14]:
def get_embedding_map(path):
    """Load a whitespace-separated word-vector text file into a dict.

    Every non-blank line is split on whitespace: the first field is the
    token, the remaining fields are parsed as floats.

    Args:
        path: Path of the embedding text file.

    Returns:
        dict mapping each token (str) to a 1-D ``numpy.ndarray`` of floats.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    M={}
    # Decode explicitly as UTF-8: without it the platform default encoding is
    # used, which can fail or mis-decode non-ASCII tokens on some systems.
    with open(path,encoding='utf-8') as f:
        for line in f:
            line_list=line.split()
            if not line_list:
                # Blank / whitespace-only line: nothing to index, skip it
                # instead of failing on line_list[0].
                continue
            word=line_list[0]
            M[word]=np.array([float(val) for val in line_list[1:]])
    return M

In [4]:
# Load the GloVe Twitter vectors whose file name matches embedding_size.
M=get_embedding_map(
    path='glove/glove.twitter.27B.{}d.txt'.format(embedding_size)
)

In [5]:
# Upper bound on the word indices the tokenizer will emit.
vocab_size=10000

# The filter string does not contain '#', '@', '<' or '>', so those
# characters are not stripped from the tweets before tokenization.
tokenizer=Tokenizer(
    num_words=vocab_size,
    filters='!"$%&()*+,-./:;=?[\\]^_`{|}~\t\n',
    oov_token='<unk>',
)
tokenizer.fit_on_texts(sas['full_text'])

# Integer-encode each tweet, then pad / truncate at the front so every row
# has exactly max_tweet_len entries ('pre' is the pad_sequences default).
sequences=tokenizer.texts_to_sequences(sas['full_text'])
data=pad_sequences(
    sequences,
    maxlen=max_tweet_len,
    padding='pre',
    truncating='pre',
)

In [15]:
# Embedding matrix: one row per token index; rows of tokens that have no
# entry in M stay all-zero.
E=np.zeros((vocab_size,embedding_size))
for token,idx in tokenizer.word_index.items():
    # Stop at the first index that falls outside the matrix.
    # NOTE(review): this assumes word_index iterates in ascending index
    # order -- confirm against the Tokenizer implementation in use.
    if idx>=vocab_size:
        break
    vector=M.get(token)
    if vector is not None:
        E[idx]=vector

In [None]:
# m=tf.keras.models.Sequential([
#     Embedding(vocab_size,
#               embedding_size,
#               embeddings_initializer=Constant(E),
#               input_length=max_tweet_len,
#               trainable=False),
#     tf.keras.layers.GRU(128,return_sequences=True,activation='relu'),
#     tf.keras.layers.GRU(128,activation='relu'),
#     tf.keras.layers.Dense(1,activation='sigmoid')
# ])

# m.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),metrics=[tf.keras.metrics.Precision()])

In [30]:
# Classifier: embedding (initialised from E, fine-tuned during training)
# -> bidirectional LSTM -> single sigmoid output unit.
m=tf.keras.models.Sequential()
m.add(Embedding(vocab_size,
                embedding_size,
                embeddings_initializer=Constant(E),
                input_length=max_tweet_len,
                trainable=True))
# NOTE(review): activation='relu' replaces the LSTM's default activation;
# the scores shown further down are very close to 1.0 -- worth checking
# whether this choice contributes to saturated outputs.
m.add(Bidirectional(LSTM(256,activation='relu',dropout=0.2)))
m.add(Dense(1,activation='sigmoid'))

m.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    metrics=['accuracy'],
)

In [31]:
# Train for 10 epochs in mini-batches of 32, holding out 20% of the rows
# for validation.
m.fit(
    x=data,
    y=sas['y'].values,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Train on 28992 samples, validate on 7249 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f98a0563d50>

In [32]:
# Sample text used below to try out predict_score.
test='Btw    Ass  Damn and hell are allowed on my page  Thats mild  generic profanity  Nothing else '

In [33]:
def predict_score(text):
    """Score one text, or several texts, with the trained model ``m``.

    The input is tokenized with the notebook's fitted ``tokenizer``, padded
    to ``max_tweet_len`` and passed to ``m.predict``.

    Args:
        text: A single string, or an iterable of strings. Accepting both
            keeps this definition compatible with the list-based
            ``predict_score`` defined later in the notebook.

    Returns:
        Whatever ``m.predict`` returns for the padded batch (one row per
        input text).
    """
    # A bare string must be wrapped: texts_to_sequences expects a collection
    # of texts, not the characters of one text.
    texts=[text] if isinstance(text,str) else list(text)
    X_new=pad_sequences(tokenizer.texts_to_sequences(texts),maxlen=max_tweet_len)
    return m.predict(X_new)

In [35]:
# Score the sample text held in `test`.
predict_score(test)

array([[0.9999908]], dtype=float32)

In [49]:
# Score a second, hand-written example text.
predict_score('thanks brother man I am grateful my dude mild profanity')

array([[0.9134425]], dtype=float32)

In [None]:
# X_new is only ever assigned as a local inside predict_score, so it does not
# exist at notebook level; rebuild the padded batch for the sample text here
# so this cell runs on a fresh kernel.
X_new=pad_sequences(tokenizer.texts_to_sequences([test]),maxlen=max_tweet_len)
m.predict(X_new)

In [50]:
# Load a second gzip-compressed tweet table; used below for the per-year
# counts and the share of rows with std_favorite_count > 0.
tt=pd.read_csv('data/stephenasmith.csv.gz',compression='gzip')

In [58]:
# Number of rows per value of created_year, most frequent first.
tt['created_year'].value_counts()

2012    2984
2013    2642
2018    2599
2016    2444
2015    1900
2011    1575
2014    1469
2017    1363
2019    1157
2010     868
2009     509
Name: created_year, dtype: int64

In [62]:
import matplotlib.pyplot as plt

In [66]:
# Fraction of rows whose std_favorite_count is strictly positive.
(tt['std_favorite_count']>0).mean()

0.19231163505894414

In [138]:
# Persist the trained model to 'scoring_model.h5'.
m.save('scoring_model.h5')

In [139]:
# Rebind `m` to the model read back from disk; every later predict call uses
# this reloaded copy.
m=tf.keras.models.load_model('scoring_model.h5')

In [140]:
# Read the generated text one line per record. The file is free text, not
# CSV: newer pandas versions reject sep='\n' in read_csv, and the CSV parser
# would also apply quote handling and type inference to the tweets. Blank
# lines are skipped, and the result is kept as a one-column DataFrame
# (column label 0) so code that indexes gpt2_output.values keeps working.
with open('gpt-2_output/gpt2_gentext_20191123_053641_temp1.0.txt',encoding='utf-8') as gpt2_file:
    gpt2_output=pd.DataFrame([line.rstrip('\n') for line in gpt2_file if line.strip()])
def strip_tags(tweet):
    """Return ``tweet`` with the GPT-2 start/end-of-text markers removed."""
    # Markers are removed one after the other, start marker first.
    for marker in ('<|startoftext|>','<|endoftext|>'):
        tweet=tweet.replace(marker,'')
    return tweet
# Keep every row except the '====================' separator rows, with the
# start/end markers stripped from the text.
generated_tweets=[
    strip_tags(row[0])
    for row in gpt2_output.values
    if row[0]!='===================='
]

In [141]:
def predict_score(text_list):
    """Score a batch of texts with the model ``m``.

    The texts are tokenized with the notebook's fitted ``tokenizer``, padded
    to ``max_tweet_len`` and passed to ``m.predict``.

    Args:
        text_list: An iterable of strings. A single bare string is also
            accepted and treated as a batch of one, so earlier cells that
            call ``predict_score('some text')`` still work after this
            redefinition.

    Returns:
        Whatever ``m.predict`` returns for the padded batch (one row per
        input text).
    """
    # Without this guard a lone string would be iterated character by
    # character, producing one meaningless "text" per character.
    texts=[text_list] if isinstance(text_list,str) else list(text_list)
    X_new=pad_sequences(tokenizer.texts_to_sequences(texts),maxlen=max_tweet_len)
    return m.predict(X_new)

In [159]:
# One row per generated tweet with its model score (first column of the
# prediction output).
scoring=pd.DataFrame({
    'tweet':generated_tweets,
    'score':[row[0] for row in predict_score(generated_tweets)],
})

# The 100 rows with the highest score, best first.
top_100=scoring.sort_values('score',ascending=False).head(100)

In [148]:
from nltk.translate.bleu_score import sentence_bleu

In [160]:
# Display the 100 highest-scoring generated tweets.
top_100

Unnamed: 0,tweet,score
378,"When Jesus Missing in Samoa, neither HBCU, col...",1.000000
979,I'm about to appear on REGRET radio ppl. Call ...,1.000000
942,Jesus. Cowboys have the best offense in footba...,1.000000
1017,"Oh God, Oh Jesus Do I Sound So sad. This is li...",1.000000
813,I wanted <at>TinyQuinn to win the Heisman befo...,1.000000
...,...,...
120,Haaaaaa!!!!!!! <at>KingJames is thats your day...,0.985658
169,How 'bout that Kammy Driver? My rookie season ...,0.985556
1008,Mike & Mike at 10am EST,0.985459
113,I put in Double-1's. 2-at-ATS from T. Wilson a...,0.984983


In [151]:
# Whitespace-tokenize every tweet in sas['full_text']; the result is the
# reference list passed to sentence_bleu below.
reference=[
    text.split()
    for text in sas['full_text']
]

In [154]:
# Candidate sentence to compare against `reference` with sentence_bleu.
candidate='Chris Bosh and the Toronto Raptors seem to have a lot to say about me today Check out my response at <url>'

In [157]:
# BLEU of the whitespace-tokenized candidate against all reference tweets.
# NOTE(review): a score of exactly 1.0 suggests the candidate already occurs
# in sas['full_text'] -- confirm before using this as a novelty measure.
sentence_bleu(
    references=reference,
    hypothesis=candidate.split(),
)

1.0

In [None]:
for tweet in top_100['tweet']