In [31]:
#imports
import pandas as pd

In [55]:
from collections import defaultdict
import collections

In [142]:
#tensorflow imports
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.layers as layers

In [91]:
from sklearn.model_selection import KFold

In [23]:
def load_full_data(filename="Tweets.csv"):
    """Read the tweets CSV and keep only the columns used by this notebook.

    Args:
        filename: Path of the CSV file to read (defaults to "Tweets.csv").

    Returns:
        A DataFrame with exactly the columns 'text', 'airline_sentiment'
        and 'airline_sentiment_confidence', in that order.

    Raises:
        KeyError: if any of the three columns is missing from the file.
    """
    wanted_columns = ['text', 'airline_sentiment', 'airline_sentiment_confidence']
    return pd.read_csv(filename)[wanted_columns]

In [24]:
def transform_sentiment(arr):
    """Encode sentiment labels as integer class ids.

    Mapping: 'neutral' -> 0, 'positive' -> 1, 'negative' -> 2.

    Any other value (including non-string values such as None or NaN) is
    reported on stdout and encoded as 0, i.e. it becomes indistinguishable
    from 'neutral' in the result.

    Args:
        arr: Iterable of sentiment labels.

    Returns:
        A list of ints, one per element of ``arr``, in the same order.
    """
    codes = {'neutral': 0, 'positive': 1, 'negative': 2}

    def stoi(s):
        # Only strings can match a known label; the isinstance guard also keeps
        # unhashable values away from the dict lookup.
        code = codes.get(s) if isinstance(s, str) else None
        if code is None:
            # f-string instead of `"..." + s`: concatenation raised TypeError
            # for non-string labels, so the fallback below was never reached.
            print(f"Unknown sentiment {s}")
            return 0
        return code

    return [stoi(s) for s in arr]

In [25]:
# Load the three working columns from the default file ("Tweets.csv").
df = load_full_data()

In [44]:
def clean_text(input):
    """Drop every whitespace-delimited token that starts with '@'.

    The remaining tokens are re-joined with single spaces, so runs of
    whitespace (including newlines) collapse to one space.

    Args:
        input: The raw text to clean.

    Returns:
        The text without '@'-prefixed tokens.
    """
    return ' '.join(token for token in input.split() if not token.startswith('@'))

In [45]:
# Strip '@'-prefixed tokens from every tweet before any vocabulary statistics are computed.
df['text'] = df['text'].apply(clean_text)

In [83]:
# Size of the rarest sentiment class (counting rows with non-null text); every
# class is trimmed to this many rows so the resulting frame is balanced.
per_label_counts = df.groupby(['airline_sentiment']).count().text
num_per_label = min(per_label_counts)

# For each sentiment keep the rows with the highest airline_sentiment_confidence.
pos, neu, neg = (
    df[df['airline_sentiment'] == label]
    .sort_values(by=['airline_sentiment_confidence'], ascending=False)
    .head(num_per_label)
    for label in ('positive', 'neutral', 'negative')
)

# Stack the three equally sized subsets: positive, then neutral, then negative.
df = pd.concat([pos, neu, neg])

In [86]:
# Frequency of each whitespace-delimited token across the balanced tweets.
words = defaultdict(int)
for token in (piece for tweet in df.text for piece in tweet.split()):
    words[token] += 1

repeated_count = sum(1 for count in words.values() if count > 1)
print(f"Num words: {len(words)}")
print(f"Num words that appear more than once: {repeated_count}")

Num words: 17646
Num words that appear more than once: 5874


In [89]:
# Cap the tokenizer vocabulary at the number of whitespace tokens seen more than once.
# NOTE(review): that count comes from raw str.split(), while the Tokenizer output
# shown in the next cell is lower-cased -- the two vocabularies may not line up
# exactly; confirm this cap is the intended one.
vocab_size = sum(1 for count in words.values() if count > 1)
tk = Tokenizer(num_words=vocab_size)
tk.fit_on_texts(df.text)

In [90]:
# Ten most frequent tokens as (word, count) pairs, highest count first.
sorted(tk.word_counts.items(), key=lambda pair: pair[1], reverse=True)[:10]

[('to', 3858),
 ('the', 2691),
 ('i', 2380),
 ('you', 2029),
 ('a', 1934),
 ('for', 1907),
 ('on', 1696),
 ('flight', 1681),
 ('and', 1548),
 ('my', 1390)]

In [145]:
# Features: each tweet becomes a sequence of token ids, padded to a common length.
text = df.text.tolist()
X = pad_sequences(tk.texts_to_sequences(text))

# Labels: integer class ids (see transform_sentiment) expanded to one-hot rows.
label_ids = transform_sentiment(df.airline_sentiment.tolist())
Y = tf.keras.utils.to_categorical(label_ids)

In [157]:
# Convert the feature and label arrays to TensorFlow tensors.
X, Y = (tf.convert_to_tensor(array) for array in (X, Y))

In [178]:
# Every row of X has the same length after padding; use it as the input length.
sequence_length = len(X[0])

# Embedding -> LSTM -> 3-way dense layer -> softmax over the three sentiment classes.
model = Sequential([
    layers.Embedding(input_dim=tk.num_words, output_dim=64, input_length=sequence_length),
    layers.LSTM(32),
    layers.Dense(3),
    layers.Softmax(),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=["AUC"])

In [179]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 33, 64)            375936    
_________________________________________________________________
lstm_13 (LSTM)               (None, 32)                12416     
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 99        
_________________________________________________________________
softmax_4 (Softmax)          (None, 3)                 0         
Total params: 388,451
Trainable params: 388,451
Non-trainable params: 0
_________________________________________________________________


In [180]:
# Train on the full balanced set: 10 epochs, mini-batches of 128 (no held-out data here).
model.fit(X, Y, epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa4b2cf1220>

In [181]:
# Softmax outputs for the first three rows of X (rows the model was fitted on).
# Columns follow transform_sentiment's encoding: 0=neutral, 1=positive, 2=negative.
model.predict(X[:3])

array([[2.8830732e-04, 9.9965286e-01, 5.8848324e-05],
       [3.7291026e-04, 9.9956983e-01, 5.7208097e-05],
       [2.7756885e-04, 9.9962723e-01, 9.5248368e-05]], dtype=float32)

In [182]:
# One-hot labels of the first three rows, for comparison with the predictions.
Y[:3]

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)>

In [183]:
# Cleaned tweet text of the first three rows.
text[:3]

['Thank you so much for stepping up your game and making my day after night of elevator music. Much appreciated.',
 'thank you thank you! I finally set up the jetblue app! Yay!',
 'great flight on a brand new jet. Great seating. Beautiful plane. Big fan of this airline.']