
### Importing The Packages

In [1]:
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns


from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline, make_pipeline


# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  


from lightgbm import LGBMClassifier
from sklearn.metrics import pairwise_distances
import copy

# libraries for XAI (LIME) explanation
from IPython.display import clear_output
from eli5.lime import TextExplainer
import warnings
from eli5.lime.samplers import MaskingTextSampler
from re import sub



from wordcloud import WordCloud
plotly.offline.init_notebook_mode (connected = True)
import eli5

from sklearn import metrics
from sklearn.model_selection import train_test_split
from PIL import Image 
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from xgboost import XGBClassifier
import random
warnings.filterwarnings("ignore")

<a id='2'></a>
# Importing the data

In [2]:
# Read the tab-separated news file; header=None means the first line is
# treated as data, so the column labels are assigned explicitly below.
data=pd.read_csv('news.tsv',header=None,sep='\t')
# One label per column. The last label previously carried a trailing space
# ("Abstract Entities "), which made data["Abstract Entities"] a KeyError.
data.columns=['News ID',
"Category",
"SubCategory",
"Title",
"Abstract",
"URL",
"Title Entities",
"Abstract Entities"]


print("Rows and columns : ")
data.shape

Rows and columns : 


(51282, 8)

In [3]:
data[:3]

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [4]:
# taking the required columns only 
data=data.iloc[:,:5]
data.head()

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re..."


### Stacked Bar plot for showing subcategories and categories

In [5]:
c=data[['Category','SubCategory']].value_counts()
c

Category  SubCategory                    
news      newsus                             6564
sports    football_nfl                       5420
news      newspolitics                       2826
          newscrime                          2254
weather   weathertopstories                  2047
                                             ... 
finance   finance-home-loans                    1
          finance-homesandpropertysection       1
news      newsnational                          1
          narendramodi_opinion                  1
finance   finance-insidetheticker               1
Length: 283, dtype: int64

In [6]:
# visualization
# Unpack the (Category, SubCategory) index of the counts series `c` into a
# 2-D array: column 0 is the category, column 1 the sub-category.
index=np.array([np.array(pair) for pair in c.index])

# One row per (category, sub-category) pair together with its count.
df2=pd.DataFrame({'Category':index[:,0],
                  'Sub Category':index[:,1],
                  'Values':c.values})

# One bar per category, coloured by sub-category.
px.bar(data_frame=df2,x='Category',y='Values',color='Sub Category')



### Data Pre-processing

In [7]:
# Remove rows whose Title repeats an earlier row's Title (modifies `data`
# in place) and print the row count before and after.
print('before processing :',len(data))
data.drop_duplicates(subset=['Title'],inplace=True)
print('after processing :',len(data))

before processing : 51282
after processing : 50434


In [8]:
# Count the NaN values in each column
print(data.isna().sum())

# Drop every row that has a NaN in any column (modifies `data` in place)
data.dropna(inplace=True)

# Count again to confirm that no NaN values remain
data.isna().sum()

News ID           0
Category          0
SubCategory       0
Title             0
Abstract       2646
dtype: int64


News ID        0
Category       0
SubCategory    0
Title          0
Abstract       0
dtype: int64

<a id='6.3'></a>
## Getting Titles with at least 4 words

In [9]:
# Keep only the titles with at least 4 whitespace-separated words (the test is >= 4)

data=data[data['Title'].apply((lambda x: len(x.split())>=4))]

# Independent copy of the filtered frame, taken at this point
df2=data.copy()

### Removing the stopwords and tokenizing

ex: removing (is, am, the, this, etc.)

In [10]:
# This function is to remove stopwords from a particular column and to tokenize it
def rem_stopwords_tokenize(data,name):
    """Tokenize a text column and drop English stop words, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    name : str
        Label of the column to process. Every value must be a string.

    Returns
    -------
    None
        ``data[name]`` is overwritten with one list of tokens per row.

    Notes
    -----
    Tokens are compared with the stop-word list exactly as tokenized (no
    case folding), so a token only counts as a stop word when it matches
    an entry of the list character for character.
    """
    # Build the stop-word set once: it is identical for every row, and
    # rebuilding it per sentence dominated the running time.
    stop_words = set(stopwords.words('english'))

    def getting(sen):
        """Return the tokens of ``sen`` that are not stop words."""
        return [w for w in word_tokenize(sen) if w not in stop_words]

    data[name] = [getting(sen) for sen in data[name].values]

### Lemmatizing all the words

ex: caring to care

In [11]:
# Making a function to lemmatize all the words
lemmatizer = WordNetLemmatizer() 
def lemmatize_all(data,name):
    """Lemmatize every token of a tokenized column, in place.

    ``data[name]`` must hold one iterable of tokens per row. Each token is
    lemmatized twice with the module-level ``lemmatizer``: first with
    ``pos='a'``, then again with the lemmatizer's default part of speech.
    The column is overwritten with the resulting lists; nothing is returned.
    """
    data[name]=[
        [lemmatizer.lemmatize(lemmatizer.lemmatize(token,pos='a')) for token in tokens]
        for tokens in data[name]
    ]
  
  


In [None]:
# Removing Stop words from Title Column
rem_stopwords_tokenize(data,'Title')

# Lemmatizing the Title column
lemmatize_all(data,'Title')


# Making a copy of data to use in the future
data4=data.copy()

In [None]:
# converting back to string 

def convert_to_string(data,name):
    """Join each row's token list back into one space-separated string.

    ``data[name]`` must hold one iterable per row; every element is passed
    through ``str`` and the pieces are joined with a single space. The
    column is overwritten in place and nothing is returned.
    """
    data[name]=[' '.join(str(token) for token in tokens) for tokens in data[name].values]

    
convert_to_string(data,'Title')


### Using TF-IDF Method

In [None]:
# min_df=1 keeps every term that occurs in at least one document, which is
# the same vocabulary min_df=0 produced; recent scikit-learn releases reject
# an integer min_df below 1 when the vectorizer is fitted.
tfidf_headline_vectorizer = TfidfVectorizer(min_df = 1)

In [None]:
data['Title'][2]

In [None]:
def TFIDF_based_model(row_index, num_similar_items):
    """Recommend headlines of the same category that are closest in TF-IDF space.

    Parameters
    ----------
    row_index : index label
        Label (not position) of the query article in ``data`` / ``df2``.
    num_similar_items : int
        Number of nearest articles to collect, counting the query article
        itself; the returned frame has at most ``num_similar_items - 1`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``headline`` (title as stored in ``df2``), ``Category``,
        ``Abstract`` and ``Euclidean Distance Similarity`` (distance to the
        query, smaller means closer), ordered from nearest to farthest.

    Raises
    ------
    KeyError
        If ``row_index`` is not a label of ``data``.

    Notes
    -----
    Reads the module-level ``data``, ``df2`` and ``tfidf_headline_vectorizer``;
    the vectorizer is re-fitted on the query's category on every call.
    Assumes ``df2`` holds the same rows in the same order as ``data`` and
    that the index labels of ``data`` are unique.
    """
    cate=data.loc[row_index,'Category']
    name=data.loc[row_index,'Title']
    cate_data=data[data['Category']==cate]

    # Rows of the TF-IDF matrix are numbered 0..len(cate_data)-1, so the query
    # has to be located by position inside the category subset. Its index label
    # belongs to the full frame and would select the wrong row (or none).
    query_pos=np.flatnonzero(cate_data.index==row_index)[0]

    headline_features   = tfidf_headline_vectorizer.fit_transform(cate_data['Title'].values)
    couple_dist = pairwise_distances(headline_features,headline_features[query_pos:query_pos+1]).ravel()

    # Nearest first. The query is forced into slot 0 so that dropping the first
    # row below always removes the query itself, even when another article has
    # an identical title (distance 0).
    order = np.argsort(couple_dist)
    indices = np.concatenate(([query_pos], order[order!=query_pos]))[0:num_similar_items]

    df = pd.DataFrame({'headline':df2[df2['Category']==cate]['Title'].values[indices],
                       'Category':cate_data['Category'].values[indices],
                       'Abstract':cate_data['Abstract'].values[indices],
                'Euclidean Distance Similarity': couple_dist[indices]})
    print("="*30,"News Article Name","="*30)
    # Print the query's own title; indices[] are positions within the category
    # subset and must not be used as labels of `data`.
    print('News Headline : ',name)
    print("\n","="*26,"Recommended News Using TFIDf: ","="*30)
    return df.iloc[1:,:]
# name=input('News Title For Recommendation :')
# clear_output()
# ind=df2[df2['Title']==name].index[0]
# dd=TFIDF_based_model(ind, 100)
# dd.head(10)



# input can be : 
"I Was An NBA Wife . Here 's How It Affected My Mental Health ."

In [None]:
X=data['Title'].values
y=data['Category'].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21)

In [None]:
# print the report

def print_report(pipe):
    """Print a per-class classification report and the accuracy of ``pipe``.

    Parameters
    ----------
    pipe : fitted estimator
        Object with a ``predict`` method; it is evaluated on the
        module-level ``X_test`` / ``y_test``.

    Returns
    -------
    None
        The report and the accuracy (3 decimals) are printed.
    """
    y_pred = pipe.predict(X_test)
    # Let the report derive its row labels from y_test and y_pred together.
    # Passing target_names=np.unique(y_test) raised ValueError whenever the
    # model predicted a class absent from y_test, because the number of names
    # no longer matched the number of classes in the report.
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))



<a id='10.3'></a>
### Pipeline using LGBMClassifier

In [None]:
# TF-IDF features -> 100-component truncated SVD (LSA) -> LightGBM classifier.
# min_df=1 keeps every term that occurs in at least one document, which is
# the same vocabulary min_df=0 produced; recent scikit-learn releases reject
# an integer min_df below 1 when the vectorizer is fitted.
vec = TfidfVectorizer(min_df=1)
svd = TruncatedSVD(n_components=100, n_iter=1, random_state=42)
lsa = make_pipeline(vec, svd)
lgm=LGBMClassifier()
pipe2 = make_pipeline(lsa, lgm)

In [None]:
pipe2.fit(X_train, y_train)
pipe2.score(X_test, y_test)

In [None]:
# Making report for pipeline using LGBMClassifier
print_report(pipe2)

In [None]:
data['Title'][0]

In [None]:
# Class labels in the column order of predict_proba. They are taken from the
# fitted pipeline rather than from y_test, which may lack classes that were
# present in training and would then shift every label.
p=pipe2.classes_

def print_prediction(doc):
    """Print the predicted probability of every class for one document.

    Parameters
    ----------
    doc : str
        A single title, in the same form as the titles the pipeline was
        fitted on.

    Returns
    -------
    None
        One line per class is printed as ``<probability> <class>``.
    """
    y_pred = pipe2.predict_proba([doc])[0]
    for target, prob in zip(pipe2.classes_, y_pred):
        print("{:.3f} {}".format(prob, target))

doc = data['Title'][0]
print_prediction(doc)

### LGBMClassifier : 54% accuracy

<a id='11'></a>
### The text explainer 

In [None]:
# Fit a LIME-style local explainer around the pipeline's prediction for `doc`.
te = TextExplainer(random_state=42)
te.fit(doc, pipe2.predict_proba)
# target_names must follow the column order of predict_proba, i.e. the classes
# of the fitted pipeline; the unique values of y_test can lack a class and
# would then mislabel the explanation.
te.show_prediction(target_names=list(pipe2.classes_))

#### The explanation makes sense: we expect a reasonable classifier to take the highlighted words into account.

## DKN MODEL 

In [None]:
# for handling the directories and files used by DKN model 
import sys
import os
from tempfile import TemporaryDirectory
import scrapbook as sb
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages



# for DKN recommender system 
from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources, prepare_hparams
from recommenders.models.deeprec.models.dkn import DKN
from recommenders.models.deeprec.io.dkn_iterator import DKNTextIterator
from recommenders.datasets.download_utils import maybe_download


In [None]:
# Temporary working directory; every DKN resource path below lives under it.
tmpdir = TemporaryDirectory()
data_path = os.path.join(tmpdir.name, "mind-demo-dkn")

# Paths of the files expected inside data_path: model config, the
# train/validation/test files, the news and user-history feature files,
# and the word / entity / context embedding arrays.
yaml_file = os.path.join(data_path, r'dkn.yaml')
train_file = os.path.join(data_path, r'train_mind_demo.txt')
valid_file = os.path.join(data_path, r'valid_mind_demo.txt')
test_file = os.path.join(data_path, r'test_mind_demo.txt')
news_feature_file = os.path.join(data_path, r'doc_feature.txt')
user_history_file = os.path.join(data_path, r'user_history.txt')
wordEmb_file = os.path.join(data_path, r'word_embeddings_100.npy')
entityEmb_file = os.path.join(data_path, r'TransE_entity2vec_100.npy')
contextEmb_file = os.path.join(data_path, r'TransE_context2vec_100.npy')
# Download the demo archive only when the config file is not already present.
if not os.path.exists(yaml_file):
    download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/deeprec/', tmpdir.name, 'mind-demo-dkn.zip')
  



In [None]:
epochs = 10
history_size = 50
batch_size = 100

In [None]:
# Build the DKN hyper-parameters from the YAML config, passing the feature and
# embedding file paths and the epochs / history_size / batch_size values set
# in the previous cell as keyword arguments.
hparams = prepare_hparams(yaml_file,
                          news_feature_file = news_feature_file,
                          user_history_file = user_history_file,
                          wordEmb_file=wordEmb_file,
                          entityEmb_file=entityEmb_file,
                          contextEmb_file=contextEmb_file,
                          epochs=epochs,
                          history_size=history_size,
                          batch_size=batch_size)
print(hparams)

In [None]:
model = DKN(hparams, DKNTextIterator)

In [None]:
print(model.run_eval(valid_file))

In [None]:
history = model.fit(train_file, valid_file)

####  Performance evaluation 

auc :  Model's ability to rank positive items higher than negative items, <br> 
mean_mrr : Model's ability to rank the first relevant item higher than others
<br>
ndcg@5 : calculates the quality of the top 5 recommendations<br>

In [None]:
res = model.run_eval(test_file)
print(res)

In [None]:
sb.glue("res", res)