In [1]:
import pandas as pd #Dataframe Manipulation library
import numpy as np #Data Manipulation library

#sklearn modules for Feature Extraction & Modelling
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

#Libraries for Plotting 
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import joblib
import os
import glob

import re
import string

In [2]:
df = pd.read_json(os.path.join(os.getcwd(),'data','News_Category_Dataset_v3.json'), lines=True)

In [3]:
df['text_combine'] = df['headline'] + " " + df['short_description']
df = df[['text_combine','category']]
df_train = df.iloc[:int(len(df)*0.8)]
df_test = df.iloc[int(len(df)*0.8):len(df)]

In [4]:
df_train.head()

Unnamed: 0,text_combine,category
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
3,The Funniest Tweets From Parents This Week (Se...,PARENTING
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS


In [5]:
import nltk

# nltk.download('wordnet',os.getcwd()) # for download to this path
# nltk.download('punkt',os.getcwd())
nltk.data.path.append('corpora')
nltk.data.path.append('tokenizers')

try:
    from nltk.stem.wordnet import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    def lemmatize(text : str) -> str:
        return lemmatizer.lemmatize(text)
except Exception as e:
    print(f'failed to load WordNetLemmatizer {e}')
    def lemmatize(text : str) -> str:
        return text

In [6]:
try:
    from nltk.tokenize import word_tokenize
except Exception as e:
    print(f'error load nltk tokenize {e}')
    def word_tokenize(text:str)->list:
        return text.split()

In [7]:
def text_cleaning(text:str)->str:
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

def list_to_text(l:list)->str:
    return ' '.join(l)

In [8]:
def preprocess_text(text : str) -> str:
    res = []
    text = text_cleaning(text)
    list_text = word_tokenize(text)
    for word in list_text:
        res.append(lemmatize(word))
    return list_to_text(res)

In [9]:
preprocess_text('Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.')

'over million american roll up sleeve for omicrontargeted covid booster health expert said it is too early to predict whether demand would match up with the million dos of the new booster the u ordered for the fall'

In [10]:
df_train['text_combine_cleaned'] = df_train['text_combine'].apply(lambda x:preprocess_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['text_combine_cleaned'] = df_train['text_combine'].apply(lambda x:preprocess_text(x))


In [11]:
df_train.to_csv(os.path.join(os.getcwd(),'data','local_train_data_clean.csv'))

In [12]:
df_train.head()

Unnamed: 0,text_combine,category,text_combine_cleaned
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,over million american roll up sleeve for omicr...
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,american airline flyer charged banned for life...
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,of the funniest tweet about cat and dog this w...
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,the funniest tweet from parent this week sept ...
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,woman who called cop on black birdwatcher lose...


## feature engineering

### word2vec

In [13]:
import gensim.models
from time import time
from gensim import utils
import multiprocessing

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        for line in df_train['text_combine_cleaned']:
            yield utils.simple_preprocess(line)

cores = multiprocessing.cpu_count() # Count the number of cores in a computer


In [14]:
sentences = MyCorpus()
model_w2v = gensim.models.Word2Vec(workers=cores-1)

In [15]:
t = time()

model_w2v.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.09 mins


In [16]:
t = time()

model_w2v.train(sentences, total_examples=model_w2v.corpus_count, epochs=10, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.91 mins


In [17]:
model_w2v.wv['million']

array([ 0.83572507, -1.017963  ,  1.636769  ,  0.1990658 , -1.3724602 ,
       -0.88695705, -3.222287  , -1.8108013 ,  0.9759952 ,  0.6300123 ,
        0.8913318 ,  1.4861679 , -0.16220964, -0.7392116 ,  0.974985  ,
       -0.62273157,  1.1330959 , -3.6353168 , -0.1328214 ,  2.3720486 ,
       -0.96801776,  2.019842  ,  2.315325  , -1.1727276 ,  2.1405778 ,
        2.0043626 , -1.9028642 ,  0.01106686, -1.6374978 ,  0.8556392 ,
       -1.6430027 ,  0.41253135, -1.9979649 ,  0.6195668 ,  0.24608758,
        0.9664026 , -3.0348446 ,  1.758334  ,  1.650371  , -0.96401733,
        0.65161794,  0.44954973, -2.5018058 ,  0.44874212, -2.3940525 ,
       -0.11602435, -4.312738  , -0.4812549 ,  0.7615709 ,  0.5544056 ,
       -1.1617304 ,  0.45244548, -1.330128  ,  0.84189576,  3.4854383 ,
       -0.32123047, -1.5706046 , -2.1290686 ,  0.09419014,  0.09356831,
       -1.1354595 , -0.99886656, -0.39520872, -1.2296982 ,  0.18212397,
        0.80739874, -0.12394285, -1.1035398 ,  1.7466639 , -0.12

In [18]:
# model.wv['million']
model_w2v.wv.most_similar(positive=["million"])

[('billion', 0.826045572757721),
 ('trillion', 0.6207946538925171),
 ('thousand', 0.6041485667228699),
 ('percent', 0.5907055735588074),
 ('upwards', 0.581097424030304),
 ('dollar', 0.5791403651237488),
 ('hundred', 0.5787340998649597),
 ('taxpayer', 0.5559016466140747),
 ('multimillion', 0.5528634786605835),
 ('uninsured', 0.5505988001823425)]

In [19]:
model_w2v.wv.similarity("million", 'money')

0.43469036

In [20]:
model_w2v.wv.doesnt_match(["million", "money", "europe"])

'europe'

In [21]:
if not os.path.exists(os.path.join(os.getcwd(),'saved_model')):
    os.makedirs(os.path.join(os.getcwd(),'saved_model'))

if not os.path.exists(os.path.join(os.getcwd(),'saved_model','word2vec')):
    os.makedirs(os.path.join(os.getcwd(),'saved_model','word2vec'))

model_w2v.save(os.path.join(os.getcwd(),'saved_model','word2vec','gensim-word2vec-model'))

In [22]:
loaded_model = gensim.models.Word2Vec.load(os.path.join(os.getcwd(),'saved_model','word2vec','gensim-word2vec-model'))

In [23]:
loaded_model.wv.most_similar(positive=["million"])

[('billion', 0.826045572757721),
 ('trillion', 0.6207946538925171),
 ('thousand', 0.6041485667228699),
 ('percent', 0.5907055735588074),
 ('upwards', 0.581097424030304),
 ('dollar', 0.5791403651237488),
 ('hundred', 0.5787340998649597),
 ('taxpayer', 0.5559016466140747),
 ('multimillion', 0.5528634786605835),
 ('uninsured', 0.5505988001823425)]

### doc2vec

In [25]:
tagged_df_train = []
count = 0
for sentence in df_train['text_combine_cleaned']:
    tagged_df_train.append(gensim.models.doc2vec.TaggedDocument(sentence.split(), [count]))
    count+=1

In [26]:
model_d2v = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=10, workers=cores-1)

In [27]:
model_d2v.build_vocab(tagged_df_train)

In [28]:
print(f"Word 'million' appeared {model_d2v.wv.get_vecattr('million', 'count')} times in the training corpus.")

Word 'million' appeared 2593 times in the training corpus.


In [29]:
model_d2v.train(tagged_df_train, total_examples=model_d2v.corpus_count, epochs=model_d2v.epochs)

In [30]:
vector = model_d2v.infer_vector(df_train['text_combine_cleaned'].iloc[0].split())

In [31]:
vector

array([ 0.15462857,  0.1212381 , -0.06696589,  0.15095988, -0.03678479,
        0.04467931, -0.05589783,  0.2544154 , -0.36791766, -0.24292438,
        0.21923864, -0.2611676 ,  0.02897439,  0.05075863, -0.18680346,
       -0.02482542, -0.06713432, -0.08495195, -0.18180478, -0.17106368,
       -0.00327925,  0.0983294 ,  0.03952096,  0.08551869, -0.11946163,
        0.17738105, -0.06116995, -0.07073114,  0.03264059, -0.11202175,
        0.01450013,  0.07382273,  0.14536524,  0.16590743,  0.21189901,
       -0.03710944,  0.20411941,  0.04205662, -0.00130913, -0.10735402,
        0.07143053,  0.00561398, -0.0595555 ,  0.23168136,  0.18159258,
       -0.11564403, -0.1706933 , -0.23234123, -0.06277809,  0.05156521],
      dtype=float32)

In [32]:
ranks = []
second_ranks = []
for doc_id in range(len(tagged_df_train)):
    inferred_vector = model_d2v.infer_vector(tagged_df_train[doc_id].words)
    sims = model_d2v.dv.most_similar([inferred_vector], topn=len(model_d2v.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [None]:
# import collections

# counter = collections.Counter(ranks)
# print(counter)

In [None]:
# print('Document ({}): «{}»\n'.format(doc_id, ' '.join(tagged_df_train[doc_id].words)))
# print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model_d2v)
# for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
#     print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(tagged_df_train[sims[index][0]].words)))