# Predicting year of the news with word embeddings
In this kernel, I've tried to predict the news year, given the text of the article. 

In [29]:
import tensorflow as tf
import numpy as np 
import pandas as pd
import tensorflow as tf
import os
import plotly.express as px
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
print('Tensorflow Version:', tf.__version__)

Tensorflow Version: 2.1.0


In [30]:
data = pd.read_csv('/kaggle/input/news-about-major-cryptocurrencies-20132018-40k/crypto_news_parsed_2013-2018_40k.csv')
print(data.info())
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39765 entries, 0 to 39764
Data columns (total 7 columns):
url       39765 non-null object
title     39764 non-null object
text      39449 non-null object
html      39452 non-null object
year      39765 non-null int64
author    38499 non-null object
source    39765 non-null object
dtypes: int64(1), object(6)
memory usage: 2.1+ MB
None


Unnamed: 0,url,title,text,html,year,author,source
0,https://www.ccn.com/paris-hiltons-hotel-mogul-...,Paris Hilton’s Hotel Mogul Father to Sell $38 ...,A group of journalists who left The Denver Pos...,<p>A group of journalists who left The Denver ...,2018,Lester Coleman,accepts_bitcoin
1,https://www.ccn.com/buying-a-home-with-bitcoin...,Buying a Home With Bitcoin Makes Sense: Shark ...,Millionaire investor and Shark Tank star Barba...,<p>Millionaire investor and Shark Tank star <a...,2018,Frisco d'Anconia,accepts_bitcoin
2,https://www.ccn.com/sme-owners-in-the-uk-see-c...,UK SME Owners Anticipate Cryptocurrency Going ...,SME owners in the UK see cryptocurrency becomi...,"<p><span style=""font-weight: 400;"">SME owners ...",2018,Paul de Havilland,accepts_bitcoin
3,https://www.ccn.com/bitcoin-boost-australian-s...,‘Bitcoin Boost’: Australian State Government I...,"The government of Queensland, Australia’s seco...","<p>The government of Queensland, Australia’s s...",2018,Samburaj Das,accepts_bitcoin
4,https://www.ccn.com/bitcoin-price-too-volatile...,"Bitcoin Price Too Volatile for Merchants, Clai...",The bitcoin price is too volatile to make it a...,<p>The bitcoin price is too volatile to make i...,2018,Matt Jackson,accepts_bitcoin


In [31]:
for col in data.columns:
    print(f'The unique values in {col}:', data[col].nunique())

The unique values in url: 39765
The unique values in title: 39211
The unique values in text: 38815
The unique values in html: 38831
The unique values in year: 6
The unique values in author: 821
The unique values in source: 108


As we can see, year has 6 unique values. Let's see the distribution of the same in our dataset. 

In [32]:
year_dist = data['year'].value_counts()
px.bar(x=year_dist.index, y = year_dist, title = 'Distribution of years in the dataset', 
       labels = {'x' : 'year', 'y' : 'rows in dataset'})

## Creating the model
We'll take the text from the data to create a classification model.  

In [33]:
X = data['text'].astype('str')
y = data['year']
X.shape, y.shape

((39765,), (39765,))

In [34]:
stopwords = set(stopwords.words('english'))
X = X.apply(lambda x: ' '.join([x for x in x.split() if x not in stopwords]))

# Replace the years with new encoded numbers
year_dict = {2013 : 0, 2014: 1, 2015: 2, 2016 : 3, 2017 : 4, 2018 : 5}
y = y.replace(year_dict)
y.value_counts()

5    11448
4    10770
3     6729
1     5390
2     4721
0      707
Name: year, dtype: int64

In [35]:
# Creating the train and the test set
training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(X, y, test_size=0.2, stratify = y) # Stratify with y to have enough of each class in the training set
training_sentences = training_sentences.tolist()
testing_sentences = testing_sentences.tolist()

In [36]:
vocab_size = 300000
embedding_dim = 32
max_length = 500
trunc_type='post'
oov_tok = "<OOV>"

In [37]:
training_sentences[0], testing_sentences[0]

('Follow us Telegram subscribe newsletter here.',
 'A new bitcoin application seeking impact online advertising space Bitly alternative rewards users bitcoin.Called Cred, application seeks encourage readers share content specialized links generate monetary rewards. The difference Cred existing URL management platforms that, addition link shortener, application inserts advertisements user experience."There\'s page platforms content, that\'s opportunity advertiser display ad based going," Cred creator online advertising veteran Cameron Hejazi told CoinDesk. Social influencers turn receive $1 every thousand impressions drive content, payments Cred aims bootstrap seeks larger advertising partners.Hejazi acknowledges service could use traditional payment methods, said encouraged fact bitcoin community already using grassroots content monetization products, like social tipping service ChangeTip.He said:"I chose bitcoin community ... lot recognition individual value measured rewarded."Adverti

In [38]:
# Create a tokenizer and prepare the train and test set
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length)

In [39]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 32),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(6, activation='softmax')
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 32)          9600000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               164864    
_________________________________________________________________
dense_6 (Dense)              (None, 512)               131584    
_________________________________________________________________
dense_7 (Dense)              (None, 6)                 3078      
Total params: 9,899,526
Trainable params: 9,899,526
Non-trainable params: 0
_________________________________________________________________


In [None]:
num_epochs = 10
history = model.fit(padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels))

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

In [None]:
plot_graphs(history, 'accuracy')

In [None]:
plot_graphs(history, 'loss')