In [1]:
import pandas as pd

### Load data

In [2]:
# Read the pickled train/test tweet frames from the local data directory.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
train_df, test_df = (
    pd.read_pickle('data/tweets_train.pkl'),
    pd.read_pickle('data/tweets_test.pkl'),
)

In [3]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,hashtag,tweet_id,text,train_or_test,class
0,[Snapchat],0x376b20,"People who post ""add me on #Snapchat"" must be ...",train,anticipation
1,"[freepress, TrumpLegacy, CNN]",0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",train,sadness
3,[],0x1cd5b0,Now ISSA is stalking Tasha ðŸ˜‚ðŸ˜‚ðŸ˜‚ <LH>,train,fear
5,"[authentic, LaughOutLoud]",0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,train,joy
6,[],0x2c91a8,Still waiting on those supplies Liscus. <LH>,train,anticipation


In [4]:
# texts = train_df.text.tolist()[:20]
# classes = train_df['class'].tolist()[:20]
# for i in range(len(texts)):
#     print(f"{texts[i]}\n{classes[i]}\n\n\n")

In [5]:
# Emotion labels whose training-row counts are tallied below.
CLASSES = ['sadness', 'anger', 'anticipation', 'fear', 'joy', 'disgust', 'surprise', 'trust']

In [6]:
# Count how many training rows carry each emotion label.
class_amount_dict = {
    label: len(train_df.loc[train_df['class'] == label])
    for label in CLASSES
}

In [7]:
# Display the per-class row counts.
class_amount_dict

{'sadness': 193437,
 'anger': 39867,
 'anticipation': 248935,
 'fear': 63999,
 'joy': 516017,
 'disgust': 139101,
 'surprise': 48729,
 'trust': 205478}

In [9]:
# Balance the training data by downsampling every class to the size of the
# rarest one. The target is derived from the data instead of naming a specific
# class ('anger'), so the cell stays valid if the class distribution changes.
samples_per_class = int(train_df['class'].value_counts().min())
sample_df = train_df.groupby("class").sample(n=samples_per_class, random_state=42)

In [10]:
# Persist the balanced sample to disk.
sample_df.to_pickle('data/tweets_train_balanced.pkl')

In [43]:
from sklearn.model_selection import train_test_split

In [78]:
# Hold out 5% of the balanced sample; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    sample_df['text'],
    sample_df['class'],
    test_size=0.05,
    random_state=42,
)

In [79]:
# Number of examples in the held-out split.
len(y_test)

15947

### Try Word2Vec

In [75]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# Note: the pretrained GoogleNews word2vec file is very large, so loading it takes a while.
model_path = "data/GoogleNews-vectors-negative300.bin.gz"
w2v_google_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [12]:
import numpy as np

In [80]:
import keras
from sklearn.preprocessing import LabelEncoder

# Fit the label <-> integer mapping on the training labels.
label_encoder = LabelEncoder().fit(y_train)

def label_encode(le, labels):
    """One-hot encode raw class labels with an already-fitted label encoder.

    Args:
        le: Fitted encoder exposing ``transform`` and ``classes_``
            (the notebook passes a scikit-learn ``LabelEncoder``).
        labels: Iterable of class labels known to ``le``.

    Returns:
        One-hot matrix with one row per label and ``len(le.classes_)`` columns.
    """
    enc = le.transform(labels)
    # Call the public `keras.utils.to_categorical` rather than the private
    # `np_utils` module (absent from newer Keras releases), and pin the width to
    # the encoder's class count so a split that happens to miss the
    # highest-indexed class still produces every column.
    return keras.utils.to_categorical(enc, num_classes=len(le.classes_))

def label_decode(le, one_hot_label):
    """Turn rows of one-hot (or per-class score) vectors back into class labels.

    Args:
        le: Fitted encoder exposing ``inverse_transform``.
        one_hot_label: 2-D array-like; the position of each row's maximum is
            taken as the encoded class index.

    Returns:
        Whatever ``le.inverse_transform`` returns for those indices.
    """
    return le.inverse_transform(np.argmax(one_hot_label, axis=1))

# Rebind both target variables from raw labels to one-hot matrices.
# Note: after this cell `y_train` / `y_test` no longer hold the original labels.
y_train, y_test = (
    label_encode(label_encoder, raw_labels)
    for raw_labels in (y_train, y_test)
)

In [65]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the vocabulary from every tweet text in both the train and test frames.
tok = Tokenizer()
tok.fit_on_texts(pd.concat([train_df['text'], test_df['text']], ignore_index=True))
# One extra slot on top of the number of indexed words
# (presumably for index 0, which Keras' Tokenizer does not assign to a word — see its docs).
vocab_size = 1 + len(tok.word_index)

In [91]:
# Number of rows in the test frame.
len(test_df)

411972

In [82]:
# Build the embedding matrix: row i holds the pretrained vector of the word whose
# tokenizer index is i; words the pretrained model does not know stay all-zero.
# The width is read from the loaded model instead of being hard-coded to 300.
embedding_dim = w2v_google_model.vectors.shape[1]
convertable_count = 0  # vocabulary words that received a pretrained vector
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, row in tok.word_index.items():
    # Try the word as tokenized, then capitalized, then upper-cased, and keep
    # the first spelling the pretrained model recognises.
    for candidate in (word, word.capitalize(), word.upper()):
        try:
            vec = w2v_google_model.get_vector(candidate)
        except KeyError:
            continue  # this spelling is unknown to the model; try the next one
        embedding_matrix[row] = vec
        convertable_count += 1
        break

In [83]:
# Convert each split's texts into sequences of tokenizer word indices.
train_encoded_sents, test_encoded_sents = (
    tok.texts_to_sequences(x_train),
    tok.texts_to_sequences(x_test),
)

In [84]:
# Longest encoded sequence across both splits.
train_lens = list(map(len, train_encoded_sents))
test_lens = list(map(len, test_encoded_sents))
max_len = max(max(train_lens), max(test_lens))
max_len

44

In [85]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad every sequence at the end ('post') up to the common length max_len.
pad_options = {'maxlen': max_len, 'padding': 'post'}
X_train = pad_sequences(train_encoded_sents, **pad_options)
X_test = pad_sequences(test_encoded_sents, **pad_options)

In [86]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional
from keras.initializers import Constant

In [87]:
# Embedding width taken from the loaded word2vec model.
output_dim = w2v_google_model.vectors.shape[1]

# Frozen (trainable=False) embedding layer initialised from embedding_matrix.
emb_layer = Embedding(
    input_dim=vocab_size,
    output_dim=output_dim,
    input_length=max_len,
    trainable=False,
    embeddings_initializer=Constant(embedding_matrix),
)

# Embedding -> LSTM -> softmax over one unit per entry in class_amount_dict.
keras_model = Sequential([
    emb_layer,
    LSTM(output_dim),
    Dense(len(class_amount_dict), activation='softmax'),
])

In [90]:
# `summary()` prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
keras_model.summary()
keras_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 44, 300)           321244500 
                                                                 
 lstm_2 (LSTM)               (None, 300)               721200    
                                                                 
 dense_2 (Dense)             (None, 8)                 2408      
                                                                 
Total params: 321,968,108
Trainable params: 723,608
Non-trainable params: 321,244,500
_________________________________________________________________
None


In [89]:
# Train for 9 epochs, passing the held-out split as validation data.
keras_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=9,
)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<keras.callbacks.History at 0x7fa86ac15f90>

### Preprocess for TF-IDF

In [46]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Shared Porter stemmer instance used by preprocess() below.
ps = PorterStemmer()

In [47]:
# Cache for the stopword set so it is fetched from NLTK only once.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stopword set, looking it up in NLTK on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess(text):
    """Normalise one text and return its stemmed, stopword-free tokens.

    Steps: lowercase and strip; replace the listed punctuation with spaces;
    put spaces around '!' and '?' so they survive as separate tokens; replace
    runs of non-ASCII characters with a space; split on whitespace; drop
    English stopwords; Porter-stem what is left.

    Args:
        text: The raw text (a ``str``).

    Returns:
        List of stemmed token strings, in their original order.
    """
    text = text.lower().strip()
    text = re.sub(r"""[,."'`~#%^*(\)&[\]{\}></]""", ' ', text)
    text = re.sub('(?<! )(?=[!?])|(?<=[!?()])(?! )', r' ', text)  # want to keep ? and !
    text = re.sub(r'[^\x00-\x7F]+', ' ', text).strip()
    # Fetch the stopword set once per call (and from NLTK only once overall);
    # the previous version re-evaluated stopwords.words('english') for every
    # token and scanned the resulting list each time.
    stop_set = _english_stopwords()
    return [ps.stem(token) for token in text.split() if token not in stop_set]

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
# Fit TF-IDF on the training texts, tokenising each text with preprocess().
tfidf = TfidfVectorizer(tokenizer=preprocess)
# Alternative kept for comparison: the vectoriser's default tokenisation.
#tfidf = TfidfVectorizer()
tfidf_embeddings = tfidf.fit_transform(x_train)

In [59]:
from sklearn.svm import LinearSVC

In [60]:
# LinearSVC needs 1-D class labels. An earlier cell rebinds `y_train` to a
# one-hot matrix for the Keras model, so on a top-to-bottom run the labels must
# be decoded first; if `y_train` still holds the original 1-D labels it is used
# unchanged.
svm_train_labels = y_train if np.ndim(y_train) == 1 else label_decode(label_encoder, y_train)
svm_model = LinearSVC()
svm_model.fit(tfidf_embeddings, svm_train_labels)

LinearSVC()

In [61]:
from sklearn.metrics import classification_report

In [62]:
# Vectorise the held-out texts with the already-fitted TF-IDF model (transform only, no refit).
test_embeds = tfidf.transform(x_test)

In [63]:
y_pred = svm_model.predict(test_embeds)
# `y_test` is rebound to a one-hot matrix by the Keras label-encoding cell;
# decode it back to 1-D class labels in that case so it is comparable with the
# SVM's predictions.
svm_test_labels = y_test if np.ndim(y_test) == 1 else label_decode(label_encoder, y_test)
print(classification_report(y_true=svm_test_labels, y_pred=y_pred))

              precision    recall  f1-score   support

       anger       0.47      0.53      0.50      1996
anticipation       0.50      0.55      0.52      1921
     disgust       0.43      0.42      0.42      2049
        fear       0.54      0.57      0.55      1955
         joy       0.40      0.39      0.40      2059
     sadness       0.44      0.39      0.42      2048
    surprise       0.48      0.44      0.46      1988
       trust       0.45      0.44      0.45      1931

    accuracy                           0.47     15947
   macro avg       0.46      0.47      0.46     15947
weighted avg       0.46      0.47      0.46     15947



In [56]:
import pickle

In [57]:
# Persist the fitted classifier and vectoriser. The context managers guarantee
# each file is flushed and closed; the bare open(...) calls left both handles open.
# NOTE(review): the "nopreprocess_alldata" file names do not match the cells
# above (balanced sample, tokenizer=preprocess) — confirm the intended paths.
with open('data/svm_nopreprocess_alldata.pkl', 'wb') as svm_file:
    pickle.dump(svm_model, svm_file)
with open('data/tfidf_nopreprocess_alldata.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)