<a href="https://colab.research.google.com/github/ashayghiya/Hackathons/blob/master/News_Category_Machine_Hack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training

In [0]:
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
# Others
import re
import nltk
import string
import numpy as np
import pandas as pd
nltk.download('stopwords')
from sklearn.manifold import TSNE
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model, Model
from keras.layers import Flatten, Dropout, Activation, Input, Dense, concatenate
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

np.random.seed(2019)

# Disable Warnings
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
df = pd.read_excel(r'Data_Train.xlsx')
df.head()

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ‘...",1
4,"In global markets, gold prices edged up today ...",3


In [0]:
df['SECTION'].value_counts()

1    2772
2    1924
0    1686
3    1246
Name: SECTION, dtype: int64

In [0]:
y = pd.get_dummies(df['SECTION'])
y.head()

Unnamed: 0,0,1,2,3
0,0,0,0,1
1,1,0,0,0
2,0,0,0,1
3,0,1,0,0
4,0,0,0,1


In [0]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    return text

In [0]:
df['text'] = df['STORY'].map(lambda x: clean_text(x))

In [0]:
df['text'][0]

'pain huge revers fee incom unheard among privat sector lender essenti mean yes bank took grant fee structur loan deal paid account upfront book borrow turn default fee tie loan deal fell crack gill vow shift safer account practic amort fee incom rather book upfront + + + gill s move mend past way mean nasti surpris futur good news consid investor love clean imag loath uncertainti + + + but gain without pain promis strong stabl balanc sheet come sacrific well investor give hope phenomen growth promis made kapoor'

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df['text'].values, y.values, test_size = 0.2, random_state = 100)

In [0]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocabulary_size = len(tokenizer.word_index) + 1
print(vocabulary_size)

21404


In [0]:
trainsequences = tokenizer.texts_to_sequences(X_train)
print(trainsequences)

In [0]:
MAXLEN = 1000

In [0]:
trainseqs = pad_sequences(trainsequences, maxlen=MAXLEN, padding='post')
print(trainseqs)

[[  396    27  1236 ...     0     0     0]
 [   36    37  2095 ...     0     0     0]
 [ 1340   134   143 ...     0     0     0]
 ...
 [  562 21398   682 ...     0     0     0]
 [ 2202   134 12969 ...     0     0     0]
 [ 1003   198  2169 ...     0     0     0]]


In [0]:
testsequences = tokenizer.texts_to_sequences(X_test)
testseqs = pad_sequences(testsequences, maxlen=MAXLEN, padding='post')

In [0]:
testseqs.shape

(1526, 1000)

In [0]:
y_train.shape

(6102, 4)

In [0]:
embedding_size = 32
op_units = df['SECTION'].nunique()

In [0]:
# define the model
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=MAXLEN))
model.add(Flatten())
model.add(Dense(op_units, activation='softmax'))

In [0]:
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [0]:
# summarize the model
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 1000, 32)          684928    
_________________________________________________________________
flatten_9 (Flatten)          (None, 32000)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 4)                 128004    
Total params: 812,932
Trainable params: 812,932
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
# fit the model
model.fit(trainseqs, 
          y_train, 
          epochs=10,
          validation_data=(testseqs,y_test),
          verbose=2)

Train on 6102 samples, validate on 1526 samples
Epoch 1/10
 - 2s - loss: 0.8271 - acc: 0.6881 - val_loss: 0.2253 - val_acc: 0.9469
Epoch 2/10
 - 1s - loss: 0.1322 - acc: 0.9695 - val_loss: 0.1198 - val_acc: 0.9699
Epoch 3/10
 - 1s - loss: 0.0560 - acc: 0.9887 - val_loss: 0.1029 - val_acc: 0.9725
Epoch 4/10
 - 1s - loss: 0.0304 - acc: 0.9946 - val_loss: 0.0953 - val_acc: 0.9751
Epoch 5/10
 - 1s - loss: 0.0187 - acc: 0.9964 - val_loss: 0.0946 - val_acc: 0.9731
Epoch 6/10
 - 1s - loss: 0.0129 - acc: 0.9967 - val_loss: 0.0939 - val_acc: 0.9731
Epoch 7/10
 - 1s - loss: 0.0099 - acc: 0.9979 - val_loss: 0.0941 - val_acc: 0.9751
Epoch 8/10
 - 1s - loss: 0.0086 - acc: 0.9972 - val_loss: 0.0962 - val_acc: 0.9738
Epoch 9/10
 - 1s - loss: 0.0075 - acc: 0.9972 - val_loss: 0.0951 - val_acc: 0.9758
Epoch 10/10
 - 1s - loss: 0.0068 - acc: 0.9974 - val_loss: 0.0958 - val_acc: 0.9731


<keras.callbacks.History at 0x7f2295a922b0>

In [0]:
# evaluate the model
loss, accuracy = model.evaluate(testseqs, y_test, verbose=2)
print('Loss: %f' % (loss))
print('Accuracy: %f' % (accuracy*100))

Loss: 0.095790
Accuracy: 97.313237


# Prediction

In [0]:
df1 = pd.read_excel(r'Data_Test.xlsx')
df1['text'] = df1['STORY'].map(lambda x: clean_text(x))
predsequences = tokenizer.texts_to_sequences(df1['text'].values)
predseqs = pad_sequences(predsequences, maxlen=MAXLEN, padding='post')

In [0]:
pred = model.predict(predseqs)
pred_list = pred.tolist()
pred_section = [l.index(max(l)) for l in pred_list]
df1['SECTION'] = pred_section
df1.head()

Unnamed: 0,STORY,text,SECTION
0,2019 will see gadgets like gaming smartphones ...,2019 see gadget like game smartphon wearabl me...,1
1,It has also unleashed a wave of changes in the...,also unleash wave chang mcu make sure futur lo...,2
2,It can be confusing to pick the right smartpho...,confus pick right smartphon yourself segreg to...,1
3,The mobile application is integrated with a da...,mobil applic integr dashboard confirm regist p...,1
4,We have rounded up some of the gadgets that sh...,round gadget show 2018 left indel mark on cons...,1


In [0]:
df1.to_excel(r'check_pred.xlsx')
df1['SECTION'].to_excel(r'prediction.xlsx',index=False)