In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
!pwd

/content/drive/My Drive/python/zs news


In [23]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [24]:
os.chdir('/content/drive/MyDrive/python/zs news/')
os.getcwd()

'/content/drive/MyDrive/python/zs news'

# Read Data

In [25]:
train = pd.read_csv('/content/drive/MyDrive/python/zs news/train_file.csv')
test = pd.read_csv('/content/drive/MyDrive/python/zs news/test_file.csv')
print(train.shape, test.shape)

(55932, 11) (37288, 9)


In [26]:
train.head()

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
0,Tr3CMgRv1N,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1,0.0,-0.0533
1,Wc81vGp8qZ,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386
2,zNGH03CrZH,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Bloomberg,economy,2012-01-28 00:00:00,-1,-1,-1,-0.42521,0.139754
3,3sM1H0W8ts,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,RTT News,economy,2015-03-01 00:06:00,-1,-1,-1,0.0,0.026064
4,wUbnxgvqaZ,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,The Nation - Thailand&#39;s English news,economy,2015-03-01 00:11:00,-1,-1,-1,0.0,0.141084


In [27]:
test.head()

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn
0,tFrqIR6Chj,Sliding Economy: FG fights back with N3trn TSA...,With the 2016 budget now passed by the Nationa...,BusinessDay,economy,2016-03-29 01:41:12,0,0,1
1,DVAaGErjlF,Microsoft shows how HoloLens can bring distant...,A recent Microsoft Research video shows how th...,Daily Mail,microsoft,2016-03-29 01:41:27,121,2,13
2,OT9UIZm5M2,"Microsoft’s Twitter Robot Praises Hitler, Trum...",* Microsoft teamed with Bing to create TayTwee...,EURweb,microsoft,2016-03-29 01:47:00,12,1,0
3,lflGp3q2Fj,Flood of Central Bank Moves Can't Get World Ec...,Central bankers have managed to steer the worl...,Bloomberg via Yahoo! Finance,economy,2016-03-29 02:00:00,0,0,3
4,zDYG0SoovZ,USD/JPY: bears lining up on mixed U.S. economy...,"However, this streak of seven-day gains might ...",FXStreet,economy,2016-03-29 02:01:07,3,0,0


# Prepare training data

In [28]:
train.duplicated().sum()

0

In [29]:
train.columns

Index(['IDLink', 'Title', 'Headline', 'Source', 'Topic', 'PublishDate',
       'Facebook', 'GooglePlus', 'LinkedIn', 'SentimentTitle',
       'SentimentHeadline'],
      dtype='object')

In [30]:
df1 = train[['Title','SentimentTitle']].copy()
df1.columns = ['text','sentiment']
df1.shape

(55932, 2)

In [31]:
df2 = train[['Headline','SentimentHeadline']].copy()
df2.columns = ['text','sentiment']
df2.shape

(55932, 2)

In [32]:
df = pd.concat([df1,df2], axis=0, ignore_index=True)
df.shape

(111864, 2)

In [33]:
df.head()

Unnamed: 0,text,sentiment
0,Obama Lays Wreath at Arlington National Cemetery,0.0
1,A Look at the Health of the Chinese Economy,0.208333
2,Nouriel Roubini: Global Economy Not Back to 2008,-0.42521
3,Finland GDP Expands In Q4,0.0
4,"Tourism, govt spending buoys Thai economy in J...",0.0


# Cleaning df

In [34]:
print(df.shape)
df = df.drop_duplicates().reset_index(drop=True).copy()
df = df.dropna(how='any')
print(df.shape)

(111864, 2)
(101075, 2)


In [35]:
df.head()

Unnamed: 0,text,sentiment
0,Obama Lays Wreath at Arlington National Cemetery,0.0
1,A Look at the Health of the Chinese Economy,0.208333
2,Nouriel Roubini: Global Economy Not Back to 2008,-0.42521
3,Finland GDP Expands In Q4,0.0
4,"Tourism, govt spending buoys Thai economy in J...",0.0


# Creating X and y

In [36]:
X=df['text'].values.reshape(-1,1)
y=df['sentiment'].values.reshape(-1,1)
print(X.shape,y.shape)

(101075, 1) (101075, 1)


In [37]:
X[:10]

array([['Obama Lays Wreath at Arlington National Cemetery'],
       ['A Look at the Health of the Chinese Economy'],
       ['Nouriel Roubini: Global Economy Not Back to 2008'],
       ['Finland GDP Expands In Q4'],
       ['Tourism, govt spending buoys Thai economy in January'],
       ['Intellitec Solutions to Host 13th Annual Spring Microsoft Dynamics User Group'],
       ['Monday, 29 Feb 2016'],
       ['Obama, stars pay a musical tribute to Ray Charles'],
       ['Fire claims more than 100-year-old barn in Hancock County'],
       ["Microsoft's new Windows 10 ad targets Apple"]], dtype=object)

In [38]:
y[:10]

array([[ 0.        ],
       [ 0.20833333],
       [-0.42521003],
       [ 0.        ],
       [ 0.        ],
       [-0.07537784],
       [ 0.        ],
       [ 0.08333333],
       [-0.17392527],
       [-0.05953621]])

# Text Cleaning

In [39]:
import re
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm

In [40]:
"not" in STOP_WORDS

True

In [41]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [42]:
punct = "!#$%&()*+,-./:;<=>?@[\]^_`{|}~"
print(punct)

!#$%&()*+,-./:;<=>?@[\]^_`{|}~


In [43]:
nlp = spacy.load('en_core_web_sm')

In [44]:
doc = nlp("Id cant do this and it's difficult. My website n't www.google.com")
for token in doc:
  print(token.lemma_)

-PRON-
would
can
not
do
this
and
-PRON-
be
difficult
.
-PRON-
website
n't
www.google.com


In [45]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'ea

In [46]:
def text_cleaning(X):
  corpus = []
  for i in tqdm(range(0,len(X))):
    sen = re.sub("http?\S+"," ",X[i][0])
    sen = re.sub('[^a-zA-Z]', ' ', sen)
    doc = nlp(sen)
    tokens = []
    for token in doc:
      if token.lemma_ != "-PRON-":
        temp = token.lemma_.lower().strip()
      else:
        temp = token.lower_.strip()
      tokens.append(temp)
      tokens = [token for token in tokens if token not in punct]
    corpus.append(' '.join(tokens))
  return corpus

In [47]:
corpus = text_cleaning(X)
print('cleaning done..')

100%|██████████| 101075/101075 [14:02<00:00, 119.97it/s]

cleaning done..





In [48]:
len(corpus)

101075

In [49]:
test.columns

Index(['IDLink', 'Title', 'Headline', 'Source', 'Topic', 'PublishDate',
       'Facebook', 'GooglePlus', 'LinkedIn'],
      dtype='object')

In [50]:
X_test_title= test['Title'].values.reshape(-1,1).copy()
print(X_test_title.shape)

X_test_head= test['Headline'].values.reshape(-1,1).copy()
print(X_test_head.shape)

(37288, 1)
(37288, 1)


In [51]:
corpus_test_title = text_cleaning(X_test_title)
print('cleaning done..')

corpus_test_head = text_cleaning(X_test_head)
print('cleaning done..')

100%|██████████| 37288/37288 [04:14<00:00, 146.61it/s]
  0%|          | 11/37288 [00:00<05:50, 106.41it/s]

cleaning done..


100%|██████████| 37288/37288 [05:43<00:00, 108.63it/s]

cleaning done..





# Find max len

In [52]:
max([len(i[0].split()) for i in X])

80

In [53]:
max([len(i.split()) for i in corpus])

87

In [54]:
max([len(i.split()) for i in corpus_test_title])

23

In [55]:
max([len(i.split()) for i in corpus_test_head])

82

# Onehot

In [56]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
voc_size = 25000

In [57]:
print(len(corpus),len(corpus_test_title),len(corpus_test_head))

101075 37288 37288


In [58]:
corpus_tot = corpus+corpus_test_title+corpus_test_head
len(corpus_tot)

175651

In [59]:
onehot_rep = [one_hot(words, n=voc_size) for words in corpus_tot]

In [60]:
onehot_rep[:5]

[[23603, 15760, 5878, 14445, 699, 1345, 338],
 [17688, 15929, 14445, 16024, 9723, 23164, 16024, 1745, 5533],
 [15016, 7669, 5841, 5533, 1873, 22676, 17535],
 [9599, 22074, 2238, 15758, 14767],
 [7066, 21180, 728, 8589, 23546, 5533, 15758, 16646]]

In [61]:
len(onehot_rep)

175651

In [62]:
sent_length = 90
embedded_docs = pad_sequences(onehot_rep, padding='pre', maxlen=sent_length)
print(embedded_docs[0])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0 23603
 15760  5878 14445   699  1345   338]


In [63]:
embedded_docs_train = embedded_docs[:len(corpus)]
embedded_docs_test_title = embedded_docs[len(corpus):len(corpus)+len(corpus_test_title)]
embedded_docs_test_head = embedded_docs[len(corpus)+len(corpus_test_title):]

print(len(corpus),len(corpus_test_title),len(corpus_test_head))
print(len(embedded_docs_train),len(embedded_docs_test_title),len(embedded_docs_test_head))

101075 37288 37288
101075 37288 37288


# Split data

In [64]:
embedded_docs_train.shape, y.shape, embedded_docs_test_title.shape, embedded_docs_test_head.shape

((101075, 90), (101075, 1), (37288, 90), (37288, 90))

In [65]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(embedded_docs_train, y, test_size=0.33, random_state = 42, shuffle=True)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

((67720, 90), (67720, 1), (33355, 90), (33355, 1))

# Model

In [66]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Bidirectional, Dropout, Embedding, Flatten
from tensorflow.keras.callbacks import EarlyStopping

In [67]:
embedding_dim=128

model_bilstm=Sequential()
model_bilstm.add(Embedding(voc_size,embedding_dim,input_length=sent_length))
model_bilstm.add(Bidirectional(LSTM(128, return_sequences=True)))
model_bilstm.add(Dropout(0.25))
model_bilstm.add(Flatten())
model_bilstm.add(Dense(1))
model_bilstm.compile(loss='mean_squared_error',optimizer='adam')
print(model_bilstm.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 90, 128)           3200000   
_________________________________________________________________
bidirectional (Bidirectional (None, 90, 256)           263168    
_________________________________________________________________
dropout (Dropout)            (None, 90, 256)           0         
_________________________________________________________________
flatten (Flatten)            (None, 23040)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 23041     
Total params: 3,486,209
Trainable params: 3,486,209
Non-trainable params: 0
_________________________________________________________________
None


In [68]:
# from keras.utils.vis_utils import plot_model

# plot_model(model=model_bilstm, to_file='model.png')

In [87]:
early_s = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)

In [88]:
history = model_bilstm.fit(X_train, y_train, validation_data=(X_val,y_val), 
                           batch_size=1024, epochs=20, callbacks=[early_s], verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping


# Prediction

In [89]:
y_pred_title = model_bilstm.predict(embedded_docs_test_title)
y_pred_head = model_bilstm.predict(embedded_docs_test_head)

In [90]:
y_pred_title

array([[ 0.05054298],
       [-0.05951595],
       [-0.04143532],
       ...,
       [ 0.23324816],
       [ 0.03336612],
       [ 0.04790474]], dtype=float32)

In [91]:
y_pred_head

array([[-0.05522691],
       [ 0.0406756 ],
       [-0.00863103],
       ...,
       [-0.24284135],
       [ 0.06850711],
       [-0.00026917]], dtype=float32)

# csv file

In [92]:
y_pred_title.reshape(1,-1)[0]

array([ 0.05054298, -0.05951595, -0.04143532, ...,  0.23324816,
        0.03336612,  0.04790474], dtype=float32)

In [93]:
sub = pd.DataFrame({'IDLink': test['IDLink'],
                    'SentimentTitle': y_pred_title.reshape(1,-1)[0],
                    'SentimentHeadline': y_pred_head.reshape(1,-1)[0]})
print(sub.shape, test.shape)

(37288, 3) (37288, 9)


In [94]:
sub.head(10)

Unnamed: 0,IDLink,SentimentTitle,SentimentHeadline
0,tFrqIR6Chj,0.050543,-0.055227
1,DVAaGErjlF,-0.059516,0.040676
2,OT9UIZm5M2,-0.041435,-0.008631
3,lflGp3q2Fj,-0.120141,-0.170874
4,zDYG0SoovZ,-0.06361,-0.045311
5,xwr9uOYgEj,0.003976,0.014327
6,X8BqN4I4xZ,-0.035258,0.184739
7,ESXBINkNt4,0.085285,0.132586
8,uUjZusP8FU,-0.135507,0.027662
9,VIxYwukhvK,0.105625,-0.011478


In [95]:
sub.to_csv('/content/drive/MyDrive/python/zs news/sub_file2.csv', index=False)