Mount Google Drive

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')
!ls "/content/gdrive/My Drive/SML_Project1"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive
 checkpoint
'Copy of tokenizer.ipynb'
 Data.ipynb
 mlp2.hdf5
 mlp3.hdf5
 mlp3.png
 mlp4.hdf5
 mlp4.png
 mlp.hdf5
'Pre-trained BERT contextualized word embeddings.ipynb'
 pridicted.csv
 pridicted.gsheet
 project1.ipynb
 test_tweets_unlabeled.txt
 tokenizer.ipynb
 train_tweets.txt
 wwm_uncased_L-24_H-1024_A-16
 x_train_encode.json


Lemmatization

In [0]:
import nltk
nltk.download('wordnet')
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Tokenizer and extract features

In [0]:
import re
from nltk.stem.porter import *

rt_str = r'RT'
capital_str = r'[A-Z]'
mention_str = r'@handle(:)?'
emoticons_str = r'[:=;][oO\-]?[D\)\]\(\]/\\OpP]'         
http_str = r'([-|:] )?http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
#hashtag_str = r"(?:\#+[\w_]+[\w\'_\-]*[\w]+)"
hashtag_str = r"#"
money_str = r"$[0-9]+"
percentage_str  = r'[0-9]+%'

feature_regex_strs = [
    http_str,
    rt_str,
    emoticons_str,
    mention_str,
    hashtag_str,
    money_str,
    percentage_str
]

punctuation_strs = [
    r'\.',
    r',',
    r'!',
    r'\?',
    r':',
    r';',
    r'\'',
    r'\"',
    r'<[^>]+>',
    r'\([^>]+\)',
    r'\[[^>]+\]',
    r'\{[^>]+\}',
    r'[\^|\*|_|\-|=|\+|\/|\\|\||`|~|&]'
]

feature_regex_strs.extend(punctuation_strs)

class Tokenizer():
    """
    This class is used to tokenize the tweets and calculate the average length.
    feature vector: number of url, number of 'RT', number of emotions, number of @, number of hashtag, number of cash mentioned, number of percentage, fraction of capitals
    """
    def __init__(self):
        self.processed_length = 0
        self.processed_item = 1
        self.features = []
        self.max_seq = 0
        self.vocab = {}
        
    def num_of_match(self, patten, tweet):
        return len(re.findall(patten, tweet))

    def tokenize(self, tweets):
        """
        :param tweets: One tweet
        :return: tokens: tokenized tweet
                 feature array including captical number and @ number
        """
        
        other_features = []
        for feature_regex_str in feature_regex_strs:
            other_features.append(self.num_of_match(feature_regex_str, tweets))
            tweets = re.sub(feature_regex_str, '', tweets)
        
        other_features.append(float(self.num_of_match(capital_str, tweets)) / len(tweets))
        
        self.features.append(other_features)

        regex_str = [
            r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # number
            r"(?:[a-z][a-z'\-_]+[a-z])",  # word with - and '
            r'(?:[\w_]+)',
            r'(?:\S)'
        ]

        tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)

        tokens = tokens_re.findall(tweets)
        tokens = [token.lower() if token.isalpha() else token for token in tokens]
        tokens = [lemmatize(token) if token.isalpha() else token for token in tokens]
        
        token_length = 0
        for token in tokens:
            token_length += len(token)
            if token not in self.vocab:
                self.vocab[token] = len(self.vocab)
        self.processed_length += token_length
        self.processed_item += 1
        self.max_seq = max(self.max_seq, len(tokens))
        return tokens
      
    def avg_length(self):
        """
        :return: Average length of tokens
        """
        return float(self.processed_length) / self.processed_item 
      
    def get_other_features(self):
        return self.features
      
    def get_max_seq(self):
        return self.max_seq
    
    def get_vocab(self):
        return self.vocab
      
mytokenizer = Tokenizer()

#tweet_example = 'RT @handle: Cool SEO \'post\' by @handle ! :) #RRPP #PR RT @handle: Top 10 #SEO Tips ? for #Public Relations - http://ow.ly/Bh7L'
#tweet_example2 = 'RT @handle: Note to webmasters: <the full roll> out of Caffeine won\'t happen until after the holidays. More info: http://bit.ly/4GELv6s'
#tokens_example = mytokenizer.tokenize(tweet_example)
#tokens_example2 = mytokenizer.tokenize(tweet_example2)
#print(tokens_example)
#print(tokens_example2)
#print(mytokenizer.get_other_features())
#print(len(mytokenizer.get_other_features()[0]))

Read data

In [0]:
from sklearn.model_selection import train_test_split

X = []
Y = []
with open ('/content/gdrive/My Drive/SML_Project1/train_tweets.txt') as fp:
    for line in fp:
        data = line.split("\t")
        X.append(data[1])
        Y.append(int(data[0]))
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=90051)

real_to_label = {}
label_to_real = {}
Y_train_label = []
Y_test_label = []
for user in Y_train:
    if user not in real_to_label.keys():
        new_label = len(real_to_label)
        real_to_label[user] = len(real_to_label)
        label_to_real[new_label] = user
    Y_train_label.append(real_to_label[user])
for user in Y_test:
    if user not in real_to_label.keys():
        new_label = len(real_to_label)
        real_to_label[user] = len(real_to_label)
        label_to_real[new_label] = user
    Y_test_label.append(real_to_label[user])

X_final_test = []
with open ('/content/gdrive/My Drive/SML_Project1/test_tweets_unlabeled.txt') as fp:
    for line in fp:
        X_final_test.append(line)
    
print(len(X_train))
print(len(Y_train))
print(len(Y_train_label))
print(len(X_test))
print(len(Y_test))
print(len(Y_test_label))
print(len(X_final_test))
print(len(real_to_label))
print(len(label_to_real))

296038
296038
296038
32894
32894
32894
35437
9297
9297


Tokenize X_train, X_test and X_final_test

In [0]:
import numpy as np

X_train_tokens = []
X_test_tokens = []
X_final_test_tokens = []

for tweet in X_train:
    X_train_tokens.append(mytokenizer.tokenize(tweet))
for tweet in X_test:
    X_test_tokens.append(mytokenizer.tokenize(tweet))
for tweet in X_final_test:
    X_final_test_tokens.append(mytokenizer.tokenize(tweet))
    
other_features = np.array(mytokenizer.get_other_features())
additional_info = other_features[:len(X_train_tokens), :]
additional_info_val = other_features[len(X_train_tokens) : len(X_train_tokens) + len(X_test_tokens), :]
additional_info_test = other_features[-len(X_final_test_tokens) : , :]
print(len(other_features))
print(len(additional_info))
print(len(additional_info[0]))
print(len(additional_info_val))
print(len(additional_info_val[0]))
print(len(additional_info_test))
print(len(additional_info_test[0]))

364369
296038
21
32894
21
35437
21


Construct vocab and word2vec model

In [0]:
from gensim.models import Word2Vec
dimension = 100
vocab = mytokenizer.get_vocab()
print('<unk>' in vocab)
print(len(vocab))
common_texts = []
common_texts.extend(X_train_tokens)
common_texts.extend(X_test_tokens)
common_texts.extend(X_final_test_tokens)
w2v_model = Word2Vec(common_texts, size=dimension, window=5, min_count=1)
# print(w2v_model['top'])

False
158974


Embedding metrix

In [0]:
embedding_matrix = np.zeros((len(vocab) + 1, dimension))
for word, i in vocab.items():
    try:
        embedding_vector = w2v_model[str(word)]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        continue
print(embedding_matrix.shape)
# print(embedding_matrix[0])
# print(embedding_matrix[-1])

  after removing the cwd from sys.path.


(158975, 100)


padding

In [0]:
from keras.preprocessing.sequence import pad_sequences
max_length = mytokenizer.get_max_seq()
print(max_length)
padding = '<unk>'
vocab[padding] = len(vocab)
print(len(vocab))
X_train_tokens = pad_sequences(X_train_tokens, dtype=object, maxlen=max_length, value=padding)
X_test_tokens = pad_sequences(X_test_tokens, dtype=object, maxlen=max_length, value=padding)
X_final_test_tokens = pad_sequences(X_final_test_tokens, dtype=object, maxlen=max_length, value=padding)

Using TensorFlow backend.


118
158975


token to sequence

In [0]:
def token2seq(tokens_list):
    seqs_list = []
    for tokens in tokens_list:
        seqs = []
        for token in tokens:
            seqs.append(vocab[token])
        seqs_list.append(seqs)
    return seqs_list
X_train_tokens = token2seq(X_train_tokens)
X_test_tokens = token2seq(X_test_tokens)
X_final_test_tokens = token2seq(X_final_test_tokens)

Picture of training

In [0]:
from keras.callbacks import Callback
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')

class LossHistory(Callback):
    def on_train_begin(self, logs={}):
        self.losses = {'batch':[], 'epoch':[]}
        self.accuracy = {'batch':[], 'epoch':[]}
        self.val_loss = {'batch':[], 'epoch':[]}
        self.val_acc = {'batch':[], 'epoch':[]}

    def on_batch_end(self, batch, logs={}):
        self.losses['batch'].append(logs.get('loss'))
        self.accuracy['batch'].append(logs.get('acc'))
        self.val_loss['batch'].append(logs.get('val_loss'))
        self.val_acc['batch'].append(logs.get('val_acc'))

    def on_epoch_end(self, batch, logs={}):
        self.losses['epoch'].append(logs.get('loss'))
        self.accuracy['epoch'].append(logs.get('acc'))
        self.val_loss['epoch'].append(logs.get('val_loss'))
        self.val_acc['epoch'].append(logs.get('val_acc'))

    def loss_plot(self, loss_type, savepath):
        iters = range(len(self.losses[loss_type]))
        plt.figure()
        # acc
        plt.plot(iters, self.accuracy[loss_type], 'r', label='train acc')
        # loss
        plt.plot(iters, self.losses[loss_type], 'g', label='train loss')
        # val_acc
        plt.plot(iters, self.val_acc[loss_type], 'b', label='val acc')
        # val_loss
        plt.plot(iters, self.val_loss[loss_type], 'k', label='val loss')
        plt.grid(True)
        plt.xlabel(loss_type)
        plt.ylabel('acc-loss')
        plt.legend(loc="upper right")    
        plt.savefig(savepath)

label data

In [0]:
import keras
num_classes = len(label_to_real)
Y_train_label = keras.utils.to_categorical(Y_train_label, num_classes=num_classes)  # to one-hot metrics
Y_test_label = keras.utils.to_categorical(Y_test_label, num_classes=num_classes)
history = LossHistory()

Text CNN

In [0]:
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dropout, concatenate, Dense, Activation
from keras.optimizers import SGD, Adam
from keras import regularizers
from keras.callbacks import ModelCheckpoint, EarlyStopping

print(len(vocab))

def TextCNN(auxilury_input_size = (2, ), num_classes = 10001):
    
    auxilury_input = Input(shape=auxilury_input_size)
    main_input = Input(shape=(max_length, ), dtype='float64')
    
    embed = Embedding(len(vocab), dimension, input_length=max_length, weights=[embedding_matrix], trainable=False)(main_input)

    cnn1 = Conv1D(256, 3, padding='same', strides=1, activation='relu')(embed)
    cnn1 = MaxPooling1D(pool_size=38)(cnn1)
    cnn2 = Conv1D(256, 4, padding='same', strides=1, activation='relu')(embed)
    cnn2 = MaxPooling1D(pool_size=37)(cnn2)
    cnn3 = Conv1D(256, 5, padding='same', strides=1, activation='relu')(embed)
    cnn3 = MaxPooling1D(pool_size=36)(cnn3)

    cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
    flat = Flatten()(cnn)
#     drop = Dropout(0.2)(flat)
    
    all_input = concatenate([flat, auxilury_input], axis=-1)
    main_output = Dense(num_classes, activation='softmax', kernel_regularizer=regularizers.l2(1e-3))(all_input)

    model = Model(inputs=[main_input, auxilury_input], outputs=main_output)
    sgd = SGD(lr=0.01, decay=1e-8, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])    
    
    return model
 
model_path = '/content/gdrive/My Drive/SML_Project1/mlp5.hdf5'
result_image_path = '/content/gdrive/My Drive/SML_Project1/mlp5.png'
auxilury_input_size = (len(additional_info[0]),)

model = TextCNN(auxilury_input_size = auxilury_input_size, num_classes = num_classes)
model_checkpoint = ModelCheckpoint(model_path, monitor='loss', verbose=1, save_best_only=True)
model.fit([X_train_tokens, additional_info], Y_train_label, batch_size=16, epochs=20, validation_data=([X_test_tokens, additional_info_val], Y_test_label), 
          callbacks=[model_checkpoint, EarlyStopping(monitor="val_acc", patience=3), history])
history.loss_plot('epoch', result_image_path)


158975
Train on 296038 samples, validate on 32894 samples
Epoch 1/20

Epoch 00001: loss improved from inf to 9.28489, saving model to /content/gdrive/My Drive/SML_Project1/mlp5.hdf5
Epoch 2/20

Epoch 00002: loss improved from 9.28489 to 8.71183, saving model to /content/gdrive/My Drive/SML_Project1/mlp5.hdf5
Epoch 3/20

Epoch 00003: loss did not improve from 8.71183
Epoch 4/20

Epoch 00004: loss did not improve from 8.71183
Epoch 5/20

Epoch 00005: loss did not improve from 8.71183
Epoch 6/20

Epoch 00006: loss did not improve from 8.71183
Epoch 7/20

Epoch 00007: loss did not improve from 8.71183
Epoch 8/20

Epoch 00008: loss did not improve from 8.71183
Epoch 9/20

Epoch 00009: loss did not improve from 8.71183
Epoch 10/20

In [0]:
import pandas as pd
output_file = "/content/gdrive/My Drive/SML_Project1/pridicted.csv"
result = model.predict([X_final_test_tokens, additional_info_test], verbose=1)
test_users = []
for res in result:
    t_list = res.tolist()
    test_users.append(label_to_real[t_list.index(max(t_list))])
df = pd.DataFrame({'Id': np.arange(1, len(test_users) + 1),
                  'Predicted' : np.array(test_users)})
print(df)
df.to_csv(output_file,index=False)