In [248]:
import numpy as np
import pandas as pd

train_df = pd.read_csv('../dataset/train.csv')
test_df = pd.read_csv('../dataset/test.csv')

In [249]:
train_df.head()

Unnamed: 0,id,original_text,lang,retweet_count,original_author,sentiment_class
0,1.245025e+18,Happy #MothersDay to all you amazing mothers o...,en,0,BeenXXPired,0
1,1.245759e+18,Happy Mothers Day Mum - I'm sorry I can't be t...,en,1,FestiveFeeling,0
2,1.246087e+18,Happy mothers day To all This doing a mothers ...,en,0,KrisAllenSak,-1
3,1.244803e+18,Happy mothers day to this beautiful woman...ro...,en,0,Queenuchee,0
4,1.244876e+18,Remembering the 3 most amazing ladies who made...,en,0,brittan17446794,-1


In [250]:
from __future__ import division
from collections import Counter
import re, nltk

WORDS = nltk.corpus.brown.words()
COUNTS = Counter(WORDS)

def pdist(counter):
    """Turn the tallies in `counter` into a probability function.

    The grand total of all counts is computed once, up front. The returned
    callable looks an item up in `counter` and divides its count by that total.
    """
    total = sum(counter.values())

    def probability(x):
        return counter[x] / total

    return probability

P = pdist(COUNTS)

def Pwords(words):
    """Joint probability of `words`, treating every word as independent.

    Each word is scored with the module-level `P` and the scores are
    multiplied together with `product`.
    """
    word_probabilities = (P(word) for word in words)
    return product(word_probabilities)

def product(nums):
    """Multiply every value in `nums` together; an empty iterable gives 1."""
    accumulated = 1
    for factor in nums:
        accumulated *= factor
    return accumulated

def splits(text, start=0, L=20):
    """List every (first, rest) cut of `text` with start <= len(first) <= L.

    The cut position never exceeds len(text), so the last pair for a short
    text is (text, '').
    """
    longest_first = min(len(text), L)
    pairs = []
    for cut in range(start, longest_first + 1):
        pairs.append((text[:cut], text[cut:]))
    return pairs

def segment(text):
    """Return a list of words that is the most probable segmentation of text.

    Every way of cutting a first word off the front (via `splits(text, 1)`)
    is combined with the best segmentation of the remainder, and the
    candidate with the highest `Pwords` score wins.

    The best segmentation of each remainder is memoized for the duration of
    the call. Without that, the same suffixes are re-segmented over and over
    and the running time grows exponentially with len(text).

    Args:
        text: The string to segment (no spaces expected).

    Returns:
        list: The chosen words, in order; [] when `text` is empty/falsy.
    """
    if not text:
        return []

    # suffix -> best word list for that suffix; the empty suffix needs no words.
    best_for = {'': []}

    def best(remaining):
        if remaining not in best_for:
            candidates = ([first] + best(rest)
                          for (first, rest) in splits(remaining, 1))
            best_for[remaining] = max(candidates, key=Pwords)
        return best_for[remaining]

    # Hand back a fresh list so the caller never aliases a memo entry.
    return list(best(text))
    
# print(segment('mothersday'))

In [251]:
from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

tokenizer = TweetTokenizer()

# English stop words from NLTK, held as a set for fast membership tests.
stops = set(stopwords.words('english'))
# Negation words are taken back out of `stops` so they survive stop-word
# filtering (key elements in emotional analysis).
not_stop = ["aren't","couldn't","didn't","doesn't","don't",
            "hadn't","hasn't","haven't","isn't","mightn't",
            "mustn't","needn't","no","nor","not","shan't",
            "shouldn't","wasn't","weren't","wouldn't"]

# difference_update silently skips entries that are not in `stops`, whereas
# set.remove would raise KeyError on the first word missing from the
# installed stop-word list.
stops.difference_update(not_stop)

def tokenize(tweet):
    """Lower-case, clean and tokenize a single tweet.

    Steps, in order:
      1. lower-case the text;
      2. drop space-separated words that are in `stops`;
      3. delete URL-like spans matched by the three patterns below;
      4. delete every character that is not an ASCII letter, digit or
         whitespace;
      5. split what is left with the module-level `tokenizer`.

    Args:
        tweet: Raw tweet text. Any value that is not a `str` (for example a
            missing value read from the CSV) produces an empty token list
            instead of raising.

    Returns:
        list: The tokens of the cleaned tweet.
    """
    if not isinstance(tweet, str):
        return []

    tweet = tweet.lower()
    # Stop words are matched on single-space-separated chunks, before any
    # punctuation is stripped.
    tweet = " ".join([w for w in tweet.split(" ") if w not in stops])

    # The patterns are raw strings so the regex escapes reach `re` verbatim
    # without tripping Python's invalid-escape-sequence warning; the pattern
    # text itself is unchanged.
    tweet = re.sub(r'http[s]?://[\s](?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', tweet)
    tweet = re.sub(r'pic.twitter.com/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', tweet)
    # This pattern also allows whitespace inside the match, so it keeps
    # consuming across spaces after "https://www. ".
    tweet = re.sub(r'https://www.[\s](?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|[\s]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', tweet)
    tweet = re.sub(r'[^A-Za-z0-9\s]+', '', tweet)

    # NOTE: the substitution above already removed every '@' and '#', so no
    # token can start with either character. Mentions and hashtags therefore
    # stay in the output as plain words; the former startswith('@') /
    # startswith('#') filters could never match and have been dropped.
    tokens = tokenizer.tokenize(tweet)
    return tokens

    
def postprocess(data):
    """Return a copy of `data` with a `tokens` column added.

    Args:
        data: DataFrame that has an `original_text` column.

    Returns:
        A new DataFrame with the same rows in the same order, plus a
        `tokens` column holding `tokenize(text)` for each row's
        `original_text`. The frame passed in is not modified, so re-running
        the calling cell always starts from the same input.
    """
    # Every row is kept: `tokenize` returns a list for each tweet, so the old
    # `tokens != 'NC'` filter could never exclude anything. Keeping all rows
    # also keeps the result aligned with the source frame's ids.
    return data.assign(tokens=data['original_text'].apply(tokenize))

data = postprocess(train_df)

In [252]:
print(tokenize("Happy mothers day to all the mothers at this very strange time, especially 51.walsh, catherinemwalsh20 and bailesae you guys set the standard!! @Old Stratford https://www. instagram.com/p/B-Cb1r4D1_Kc wn9a_vQzxtCkRXEd9Eu4X6bZ2o0/?igshid=1wzimo2v391rp …"))

['happy', 'mothers', 'day', 'mothers', 'strange', 'time', 'especially', '51walsh', 'catherinemwalsh', '20', 'bailesae', 'guys', 'set', 'standard', 'old', 'stratford']


In [253]:
data.head()

Unnamed: 0,id,original_text,lang,retweet_count,original_author,sentiment_class,tokens
0,1.245025e+18,Happy #MothersDay to all you amazing mothers o...,en,0,BeenXXPired,0,"[happy, mothersday, amazing, mothers, there, k..."
1,1.245759e+18,Happy Mothers Day Mum - I'm sorry I can't be t...,en,1,FestiveFeeling,0,"[happy, mothers, day, mum, im, sorry, cant, br..."
2,1.246087e+18,Happy mothers day To all This doing a mothers ...,en,0,KrisAllenSak,-1,"[happy, mothers, day, mothers, days, work, tod..."
3,1.244803e+18,Happy mothers day to this beautiful woman...ro...,en,0,Queenuchee,0,"[happy, mothers, day, beautiful, womanroyalty,..."
4,1.244876e+18,Remembering the 3 most amazing ladies who made...,en,0,brittan17446794,-1,"[remembering, 3, amazing, ladies, made, am, la..."


In [254]:
# from sklearn.model_selection import train_test_split
# x_train_to_labelize, x_val_to_labelize, y_train, y_val = train_test_split(np.array(data['tokens']),
#                                                     np.array(data['sentiment_class']), test_size=0.2)

x_train_to_labelize = np.array(data['tokens'])
y_train = np.array(data['sentiment_class'])

In [255]:
from tqdm import tqdm
from gensim.models import doc2vec, Word2Vec
tqdm.pandas(desc="progress-bar")
LabeledSentence = doc2vec.LabeledSentence

def labelizeTweets(tweets, label_type):
    """Wrap each token list in a LabeledSentence tagged '<label_type>_<index>'.

    Args:
        tweets: Iterable of token lists, one per tweet.
        label_type: Prefix for the tag, e.g. 'TRAIN' or 'TEST'.

    Returns:
        list: One LabeledSentence per tweet, in input order, whose single tag
        is the prefix joined to the tweet's position with an underscore.
    """
    # tqdm wraps the sequence itself rather than the enumerate() generator so
    # that it can read the length and show a real progress total.
    return [LabeledSentence(tokens, ['%s_%s' % (label_type, index)])
            for index, tokens in enumerate(tqdm(tweets))]

x_train = labelizeTweets(x_train_to_labelize, 'TRAIN')
# x_val = labelizeTweets(x_val_to_labelize, 'VAL')

  from pandas import Panel
  # Remove the CWD from sys.path while we load stuff.
3235it [00:00, 70759.95it/s]


In [256]:
x_train[0]

LabeledSentence(words=['happy', 'mothersday', 'amazing', 'mothers', 'there', 'know', 'hard', 'not', 'able', 'see', 'mothers', 'today', 'us', 'protect', 'vulnerable', 'members', 'society', 'beatcoronavirus'], tags=['TRAIN_0'])

In [307]:
# Width of every word vector; later cells reuse n_dim as the feature width
# of the averaged tweet vectors and as the network's input dimension.
n_dim = 500
tweet_w2v = Word2Vec(size=n_dim, min_count=10)
# Vocabulary and training both use the token lists of the training tweets.
tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
tweet_w2v.train([x.words for x in tqdm(x_train)], total_examples=len(x_train), epochs=9)

100%|██████████| 3235/3235 [00:00<00:00, 629369.33it/s]
100%|██████████| 3235/3235 [00:00<00:00, 1540657.82it/s]


(268979, 573471)

In [308]:
tweet_w2v.most_similar('happy')

  """Entry point for launching an IPython kernel.


[('wonderful', 0.9971985220909119),
 ('there', 0.9964075088500977),
 ('amazing', 0.9961796402931213),
 ('day', 0.9933226108551025),
 ('mothers', 0.9914435744285583),
 ('mums', 0.9900673627853394),
 ('beautiful', 0.9884551763534546),
 ('mother', 0.9851505160331726),
 ('women', 0.9838199019432068),
 ('wishing', 0.9832685589790344)]

In [309]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

# defining the chart
output_notebook()

# words to plot: at most the first 5000 vocabulary entries. The list is built
# once and reused for both the vectors and the hover labels so the two are
# guaranteed to line up.
plot_words = list(tweet_w2v.wv.vocab.keys())[:5000]

plot_tfidf = bp.figure(plot_width=700, plot_height=600,
    title="A map of %d word vectors" % len(plot_words),
    tools="pan,wheel_zoom,box_zoom,reset,hover",
    x_axis_type=None, y_axis_type=None, min_border=1)

# getting the list of word vectors, one n_dim-sized vector per plotted word.
# `.wv[...]` is used because indexing the model object directly is deprecated.
word_vectors = [tweet_w2v.wv[w] for w in plot_words]

# dimensionality reduction. converting the vectors to 2d vectors
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# putting everything in a dataframe
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = plot_words

# plotting. the corresponding word appears when you hover on the data point.
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

  if sys.path[0] == '':


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 785 samples in 0.025s...
[t-SNE] Computed neighbors for 785 samples in 0.329s...
[t-SNE] Computed conditional probabilities for sample 785 / 785
[t-SNE] Mean sigma: 0.031057
[t-SNE] KL divergence after 250 iterations with early exaggeration: 51.892323
[t-SNE] KL divergence after 1000 iterations: 0.373827


In [310]:
from sklearn.feature_extraction.text import TfidfVectorizer
# The tweets are already token lists, so the analyzer is the identity
# function and the vectorizer does no splitting of its own.
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
# Despite its name, `tfidf` maps each feature (token) to its IDF weight only
# (vectorizer.idf_); `matrix` is not used to build it.
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size :', len(tfidf))

vocab size : 763


In [311]:
def buildWordVector(tokens, size):
    """Average the weighted word vectors of `tokens` into one row vector.

    Each token's vector from the module-level `tweet_w2v` model is multiplied
    by that token's weight in the module-level `tfidf` dict. The weighted
    vectors are summed and divided by the number of tokens that contributed.

    Args:
        tokens: Iterable of token strings for one tweet.
        size: Length of a word vector; must match the model's vector size.

    Returns:
        numpy.ndarray: Array of shape (1, size). It is all zeros when no
        token is known to both `tweet_w2v` and `tfidf`.
    """
    vec = np.zeros((1, size))
    count = 0.
    for word in tokens:
        try:
            # `.wv[word]` replaces the deprecated `tweet_w2v[word]` lookup,
            # which emitted a DeprecationWarning on every call in this loop.
            weighted = tweet_w2v.wv[word].reshape((1, size)) * tfidf[word]
        except KeyError:
            # Token missing from the embedding vocabulary or from `tfidf`
            # (e.g. unseen words in the test set): it contributes nothing.
            continue
        vec += weighted
        count += 1.
    if count != 0:
        vec /= count
    return vec

In [312]:
from sklearn.preprocessing import scale
# One (1, n_dim) row per training tweet from buildWordVector, stacked into a
# single 2-D feature matrix.
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
# The raw matrix is overwritten by its scaled version, so the unscaled
# features are no longer available after this line.
train_vecs_w2v = scale(train_vecs_w2v)

# val_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_val))])
# val_vecs_w2v = scale(val_vecs_w2v)

  
3235it [00:01, 2948.47it/s]


In [313]:
y_train

array([ 0,  0, -1, ...,  0,  0, -1])

In [314]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import InverseTimeDecay
from tensorflow.keras.callbacks import EarlyStopping 
import tensorflow_docs as tfdocs
from keras.utils import to_categorical

# These three constants are only referenced by the commented-out
# Embedding/LSTM variant further down; the active model does not use them.
max_fatures = 2000
embed_dim = 128
lstm_out = 196

# Active model: one hidden Dense layer on the n_dim-wide tweet vectors,
# followed by a 3-way softmax output.
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=n_dim))
model.add(Dense(3, activation='softmax'))

# model.add(Embedding(max_fatures, embed_dim))
# model.add(SpatialDropout1D(0.4))
# model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dense(3,activation='softmax'))
model.compile( optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])


model.summary()

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_27 (Dense)             (None, 32)                16032     
_________________________________________________________________
dense_28 (Dense)             (None, 3)                 99        
Total params: 16,131
Trainable params: 16,131
Non-trainable params: 0
_________________________________________________________________


In [315]:
import tensorflow_docs.modeling
# Early stopping watches the *training* loss (no validation data is passed to
# fit) and no `patience` argument is given.
# NOTE(review): the recorded run stopped after epoch 2 of 9 -- confirm that
# stopping this early is intended.
early_stop = EarlyStopping(monitor='loss', mode='min', verbose=1)
# NOTE(review): y_train holds the labels -1/0/1 but is one-hot encoded into 3
# columns; presumably -1 ends up in the last column (index 2), which is why
# the submission cell maps class 2 back to -1 -- TODO confirm.
model.fit(train_vecs_w2v, to_categorical(y_train, 3), epochs=9, batch_size=32, verbose=2,
          callbacks=[early_stop, tfdocs.modeling.EpochDots()])

# validation_data=(val_vecs_w2v, to_categorical(y_val, 3))

Train on 3235 samples
Epoch 1/9

Epoch: 0, accuracy:0.4739,  loss:1.1126,  
.3235/3235 - 0s - loss: 1.1126 - accuracy: 0.4739
Epoch 2/9
.3235/3235 - 0s - loss: 1.1170 - accuracy: 0.4992
Epoch 00002: early stopping


<tensorflow.python.keras.callbacks.History at 0x140363110>

In [316]:
# score = model.evaluate(val_vecs_w2v, to_categorical(y_val, 3), batch_size=128, verbose=2)

In [317]:
test_data = postprocess(test_df)

In [318]:
x_test_to_labelize = np.array(test_data['tokens'])

In [319]:
x_test = labelizeTweets(x_test_to_labelize, 'TEST')

  # Remove the CWD from sys.path while we load stuff.
1387it [00:00, 123487.57it/s]


In [320]:
x_test[0]

LabeledSentence(words=['3', 'yeah', 'cooked', 'potatoes', '3', 'years', 'old', 'mean', 'threw', 'bag', 'spuds', 'toilet', 'happy', 'mothers', 'day', 'made', 'breakfast', '66', 'time', 'thought', 'cool', 'drawing', 'naked', 'lady', 'dachshund', 'overhead', 'projector', 'psychology', 'class', '8'], tags=['TEST_0'])

In [321]:
# NOTE(review): this second Word2Vec model is trained on the test tokens, but
# nothing visible in this notebook reads `tweet_w2v_test` afterwards --
# buildWordVector looks words up in `tweet_w2v`. Confirm whether this cell is
# still needed.
tweet_w2v_test = Word2Vec(size=n_dim, min_count=10)
tweet_w2v_test.build_vocab([x.words for x in tqdm(x_test)])
tweet_w2v_test.train([x.words for x in tqdm(x_test)], total_examples=len(x_test), epochs=10)

100%|██████████| 1387/1387 [00:00<00:00, 532299.35it/s]
100%|██████████| 1387/1387 [00:00<00:00, 1042749.53it/s]


(103756, 275490)

In [322]:
# Same weighted averaging as for the training tweets: one (1, n_dim) row per
# test tweet, stacked into a 2-D feature matrix.
test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
# NOTE(review): scale() is applied to the test matrix on its own, separately
# from the scale() call on the training matrix, so the two sets are presumably
# standardized with different statistics -- confirm this is intended.
test_vecs_w2v = scale(test_vecs_w2v)

  
1387it [00:00, 3084.22it/s]


In [323]:
test_predictions = model.predict(test_vecs_w2v)

In [324]:
test_predictions[1]

array([0.5213822 , 0.26053876, 0.21807897], dtype=float32)

In [325]:
label = [np.argmax(i) for i in test_predictions]
class_label = [x for x in label]
print(class_label[:10])
submission = pd.DataFrame({ 'id': test_df.id, 'sentiment_class': class_label })
def replace_two(n):
    """Map the class index 2 to the label -1; return any other value as is."""
    return -1 if n == 2 else n
submission['sentiment_class'] = submission['sentiment_class'].apply(replace_two)
submission.to_csv('../test1326.csv', index=False)

[0, 0, 0, 0, 1, 1, 0, 0, 0, 0]
