Mount Google Drive

In [7]:
from google.colab import drive
drive.mount('/content/gdrive')
!ls "/content/gdrive/My Drive/SML_Project1"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
 checkpoint
'Copy of tokenizer.ipynb'
 Data.ipynb
 mlp2.hdf5
 mlp3.hdf5
 mlp3.png
 mlp4.hdf5
 mlp4.png
 mlp5.hdf5
 mlp.hdf5
 mlp_reg.hdf5
 mlp_sgd_30+.png
 mlp_sgd.hdf5
 mlp_sgd.png
'Pre-trained BERT contextualized word embeddings.ipynb'
 pridicted.csv
 pridicted.gsheet
 project1.ipynb
 test_tweets_unlabeled.txt
 train_tweets.txt
 word2vec.ipynb
 wwm_uncased_L-24_H-1024_A-16
 x_train_encode.json


Lemmatization

In [14]:
import nltk
nltk.download('wordnet')
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Tokenizer and extract features

In [0]:
import re
from nltk.stem.porter import *

rt_str = r'RT'
capital_str = r'[A-Z]'
mention_str = r'@handle(:)?'
emoticons_str = r'[:=;][oO\-]?[D\)\]\(\]/\\OpP]'         
http_str = r'([-|:] )?http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
#hashtag_str = r"(?:\#+[\w_]+[\w\'_\-]*[\w]+)"
hashtag_str = r"#"
money_str = r"$[0-9]+"
percentage_str  = r'[0-9]+%'

feature_regex_strs = [
    http_str,
    rt_str,
    emoticons_str,
    mention_str,
    hashtag_str,
    money_str,
    percentage_str
]

punctuation_strs = [
    r'\.',
    r',',
    r'!',
    r'\?',
    r':',
    r';',
    r'\'',
    r'\"',
    r'<[^>]+>',
    r'\([^>]+\)',
    r'\[[^>]+\]',
    r'\{[^>]+\}',
    r'[\^|\*|_|\-|=|\+|\/|\\|\||`|~|&]'
]

feature_regex_strs.extend(punctuation_strs)

class Tokenizer():
    """
    This class is used to tokenize the tweets and calculate the average length.
    feature vector: number of url, number of 'RT', number of emotions, number of @, number of hashtag, number of cash mentioned, number of percentage, fraction of capitals
    """
    def __init__(self):
        self.processed_length = 0
        self.processed_item = 1
        self.features = []
        self.max_seq = 0
        self.vocab = {}
        
    def num_of_match(self, patten, tweet):
        return len(re.findall(patten, tweet))

    def tokenize(self, tweets):
        """
        :param tweets: One tweet
        :return: tokens: tokenized tweet
                 feature array including captical number and @ number
        """
        
        other_features = []
        for feature_regex_str in feature_regex_strs:
            other_features.append(self.num_of_match(feature_regex_str, tweets))
            tweets = re.sub(feature_regex_str, '', tweets)
        
        other_features.append(float(self.num_of_match(capital_str, tweets)) / len(tweets))
        
        self.features.append(other_features)

        regex_str = [
            r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # number
            r"(?:[a-z][a-z'\-_]+[a-z])",  # word with - and '
            r'(?:[\w_]+)',
            r'(?:\S)'
        ]

        tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)

        tokens = tokens_re.findall(tweets)
        tokens = [token.lower() if token.isalpha() else token for token in tokens]
        tokens = [lemmatize(token) if token.isalpha() else token for token in tokens]
        
        token_length = 0
        for token in tokens:
            token_length += len(token)
            if token not in self.vocab:
                self.vocab[token] = len(self.vocab)
        self.processed_length += token_length
        self.processed_item += 1
        self.max_seq = max(self.max_seq, len(tokens))
        return tokens
      
    def avg_length(self):
        """
        :return: Average length of tokens
        """
        return float(self.processed_length) / self.processed_item 
      
    def get_other_features(self):
        return self.features
      
    def get_max_seq(self):
        return self.max_seq
    
    def get_vocab(self):
        return self.vocab
      
mytokenizer = Tokenizer()

#tweet_example = 'RT @handle: Cool SEO \'post\' by @handle ! :) #RRPP #PR RT @handle: Top 10 #SEO Tips ? for #Public Relations - http://ow.ly/Bh7L'
#tweet_example2 = 'RT @handle: Note to webmasters: <the full roll> out of Caffeine won\'t happen until after the holidays. More info: http://bit.ly/4GELv6s'
#tokens_example = mytokenizer.tokenize(tweet_example)
#tokens_example2 = mytokenizer.tokenize(tweet_example2)
#print(tokens_example)
#print(tokens_example2)
#print(mytokenizer.get_other_features())
#print(len(mytokenizer.get_other_features()[0]))

Read data

In [41]:
from sklearn.model_selection import train_test_split

X = []
Y = []
with open ('/content/gdrive/My Drive/SML_Project1/train_tweets.txt') as fp:
    for line in fp:
        data = line.split("\t")
        X.append(data[1])
        Y.append(int(data[0]))

X_test = []
with open ('/content/gdrive/My Drive/SML_Project1/test_tweets_unlabeled.txt') as fp:
    for line in fp:
        X_test.append(line)
    
print(len(X))
print(len(Y))
print(len(X_final_test))

328932
328932
35437


Tokenize X_train, X_test

In [0]:
import numpy as np

X_train_tokens = []
X_test_tokens = []

for tweet in X:
    X_train_tokens.append(mytokenizer.tokenize(tweet))
for tweet in X_test:
    X_test_tokens.append(mytokenizer.tokenize(tweet))


In [44]:
vocab = mytokenizer.get_vocab()
print(len(vocab))

def token2seq(tokens_list):
    seqs_list = []
    for tokens in tokens_list:
        seqs = []
        for token in tokens:
            seqs.append(vocab[token])
        seqs_list.append(seqs)
    return seqs_list

X_train_tokens = token2seq(X_train_tokens)
X_test_tokens = token2seq(X_test_tokens)

print(len(X_train_tokens))
print(len(X_test_tokens))
print(X_train_tokens[34])

158974
328932
35437
[39, 22, 27, 231, 232, 233, 92, 69, 234, 235, 13, 236, 86, 39, 237, 106]


In [45]:
from keras.preprocessing.sequence import pad_sequences
max_length = mytokenizer.get_max_seq()
print(max_length)
padding = 0
X_train_tokens = pad_sequences(X_train_tokens, dtype=int, maxlen=max_length, value=padding)
X_test_tokens = pad_sequences(X_test_tokens, dtype=int, maxlen=max_length, value=padding)

118


In [0]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train_tokens, Y)
Y_final_test = clf.predict(X_test_tokens)

In [48]:
print(len(Y_final_test))

35437


In [49]:
import pandas as pd
output_file = "/content/gdrive/My Drive/SML_Project1/pridicted_NB.csv"
df = pd.DataFrame({'Id': np.arange(1, len(Y_final_test) + 1),
                  'Predicted' : np.array(Y_final_test)})
print(df)
df.to_csv(output_file,index=False)

          Id  Predicted
0          1       3763
1          2        935
2          3       8620
3          4       7254
4          5       8357
5          6       6074
6          7       1059
7          8       6664
8          9       6873
9         10       9566
10        11       3854
11        12       8706
12        13       4486
13        14       8481
14        15       3854
15        16       8490
16        17       3316
17        18       1997
18        19       6531
19        20       4837
20        21       3480
21        22       2019
22        23       4650
23        24       5132
24        25       5809
25        26       6490
26        27       5362
27        28       7561
28        29        369
29        30       6225
...      ...        ...
35407  35408       2489
35408  35409       3091
35409  35410       8796
35410  35411       8353
35411  35412       5458
35412  35413       1237
35413  35414       5528
35414  35415       5034
35415  35416       6362
35416  35417    