In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS



# Preprocessing

In [2]:
df = pd.read_csv('data/abcnews-date-text.csv', error_bad_lines=False)

In [3]:
df = df.drop('publish_date', axis=1)

In [4]:
df['headline_text'].loc[:5]

0    aba decides against community broadcasting lic...
1       act fire witnesses must be aware of defamation
2       a g calls for infrastructure protection summit
3             air nz staff in aust strike for pay rise
4        air nz strike to affect australian travellers
5                    ambitious olsson wins triple jump
Name: headline_text, dtype: object

## Import des modules nltk

In [5]:
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [6]:
#nltk.download()

In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/thor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/thor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
print(WordNetLemmatizer().lemmatize('eggs', pos='n'))

egg


In [10]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

## Tokenizer nos titres afin d'obtenir une liste de mots

In [11]:
df

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers
...,...
457508,north coast less exposed to personal bankruptcy
457509,nsw economy the worst may be to come
457510,nt govt waging political war against commercial
457511,ny times wins 5 pulitzer awards


In [12]:
words_splitted = []
# for i in df['headline_text'][:10]:
#     words_splitted.append(i.split(' '))

for line in df['headline_text'][:550000]:
#for line in df['headline_text']:
    words_splitted.append(nltk.word_tokenize(line))
    # for i in df['headline_text']:
#     print(i)

In [13]:
final_words = []

for word in words_splitted:
    word_without_sw = []
    for w in word:
        if w not in stop_words and len(w) > 3:
            w = WordNetLemmatizer().lemmatize(w, pos='v').lower()
            w = WordNetLemmatizer().lemmatize(w, pos='n')
            w = WordNetLemmatizer().lemmatize(w, pos='a')
            word_without_sw.append(WordNetLemmatizer().lemmatize(w))
    final_words.append(word_without_sw)
final_words 

[['decide', 'community', 'broadcast', 'licence'],
 ['fire', 'witness', 'must', 'aware', 'defamation'],
 ['call', 'infrastructure', 'protection', 'summit'],
 ['staff', 'aust', 'strike', 'rise'],
 ['strike', 'affect', 'australian', 'traveller'],
 ['ambitious', 'olsson', 'win', 'triple', 'jump'],
 ['antic', 'delight', 'record', 'break', 'barca'],
 ['aussie', 'qualifier', 'stosur', 'waste', 'four', 'memphis', 'match'],
 ['aust', 'address', 'security', 'council', 'iraq'],
 ['australia', 'lock', 'timetable'],
 ['australia', 'contribute', 'million', 'iraq'],
 ['barca', 'take', 'record', 'robson', 'celebrate', 'birthday'],
 ['bathhouse', 'plan', 'move', 'ahead'],
 ['hop', 'launceston', 'cycle', 'championship'],
 ['plan', 'boost', 'paroo', 'water', 'supply'],
 ['blizzard', 'bury', 'unite', 'state', 'bill'],
 ['brigadier', 'dismiss', 'report', 'troop', 'harass'],
 ['british', 'combat', 'troop', 'arrive', 'daily', 'kuwait'],
 ['bryant', 'lead', 'lakers', 'double', 'overtime'],
 ['bushfire', 'vict

In [14]:
df = pd.DataFrame()

In [15]:
df['sentence'] = final_words

In [16]:
#!pip install gensim
from gensim.corpora import Dictionary


In [17]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(df['sentence'])

# Filter out words that occur less than 2 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [18]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x7fc944bbd7f0>

In [19]:
corpus = [dictionary.doc2bow(word) for word in df['sentence']]

In [20]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 7798
Number of documents: 457513


In [21]:
corpus[:5]

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(9, 1), (10, 1), (11, 1), (12, 1)],
 [(13, 1), (14, 1), (15, 1), (16, 1)],
 [(16, 1), (17, 1), (18, 1), (19, 1)]]

## Créer un modèle TF IDF

In [22]:
from gensim import corpora, models

tfidf = models.TfidfModel(corpus)
print(tfidf)

TfidfModel(num_docs=457513, num_nnz=2024696)


In [23]:
corpus_tfidf = tfidf[corpus]
print(corpus_tfidf[0])

[(0, 0.6099944689918242), (1, 0.3818987516207417), (2, 0.48738159179345575), (3, 0.49448890310175697)]


In [24]:
final_words[0]

['decide', 'community', 'broadcast', 'licence']

In [25]:
from pprint import pprint

for doc in corpus_tfidf: 
    pprint(f"Poids des mots {doc}")
    break

('Poids des mots [(0, 0.6099944689918242), (1, 0.3818987516207417), (2, '
 '0.48738159179345575), (3, 0.49448890310175697)]')


## LDA bag of words

In [26]:
import gensim

In [27]:
# LDA multicore 
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
lda_model = gensim.models.LdaMulticore(corpus, 
                                       num_topics=10, 
                                       id2word = dictionary, 
                                       passes = 2)

In [28]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''

#num_topics (int, optional) – The number of topics to be selected, if -1 - all topics will be in result (ordered by significance).
for idx, topic in lda_model.print_topics(-1):
    
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.024*"close" + 0.020*"coast" + 0.017*"kill" + 0.016*"gold" + 0.016*"continue" + 0.015*"force" + 0.014*"storm" + 0.014*"north" + 0.013*"protest" + 0.012*"pakistan"


Topic: 1 
Words: 0.024*"fire" + 0.017*"urge" + 0.016*"test" + 0.013*"break" + 0.012*"resident" + 0.011*"law" + 0.010*"govt" + 0.009*"bill" + 0.008*"public" + 0.008*"blaze"


Topic: 2 
Words: 0.034*"take" + 0.024*"open" + 0.018*"trial" + 0.015*"accuse" + 0.013*"adelaide" + 0.011*"bail" + 0.011*"stand" + 0.010*"team" + 0.009*"stay" + 0.008*"grant"


Topic: 3 
Words: 0.017*"china" + 0.017*"warn" + 0.016*"talk" + 0.015*"kill" + 0.013*"attack" + 0.012*"iraq" + 0.010*"blast" + 0.010*"aust" + 0.009*"time" + 0.009*"rat"


Topic: 4 
Words: 0.025*"water" + 0.021*"council" + 0.019*"plan" + 0.014*"govt" + 0.014*"group" + 0.014*"boost" + 0.013*"mine" + 0.013*"worker" + 0.011*"farmer" + 0.011*"need"


Topic: 5 
Words: 0.040*"court" + 0.033*"charge" + 0.032*"face" + 0.019*"market" + 0.019*"jail" + 0.017*"high" + 0.015*"a

In [29]:
final_words[959]

['yemen', 'chopper', 'crash', 'kill', 'eight', 'troop']

In [30]:
# test

'''
Check which topic our test document belongs to using the LDA Bag of Words model.
'''
document_num = 959
# Our test document is document number 4310

for index, score in sorted(lda_model[corpus[document_num]], key=lambda tup: -1*tup[1]):

    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))




Score: 0.39901524782180786	 
Topic: 0.024*"close" + 0.020*"coast" + 0.017*"kill" + 0.016*"gold" + 0.016*"continue" + 0.015*"force" + 0.014*"storm" + 0.014*"north" + 0.013*"protest" + 0.012*"pakistan"

Score: 0.34340208768844604	 
Topic: 0.075*"police" + 0.028*"crash" + 0.024*"death" + 0.019*"woman" + 0.014*"probe" + 0.014*"road" + 0.012*"find" + 0.012*"driver" + 0.012*"victim" + 0.012*"charge"

Score: 0.1575324982404709	 
Topic: 0.034*"take" + 0.024*"open" + 0.018*"trial" + 0.015*"accuse" + 0.013*"adelaide" + 0.011*"bail" + 0.011*"stand" + 0.010*"team" + 0.009*"stay" + 0.008*"grant"

Score: 0.014298612251877785	 
Topic: 0.017*"china" + 0.017*"warn" + 0.016*"talk" + 0.015*"kill" + 0.013*"attack" + 0.012*"iraq" + 0.010*"blast" + 0.010*"aust" + 0.009*"time" + 0.009*"rat"

Score: 0.014294443652033806	 
Topic: 0.040*"court" + 0.033*"charge" + 0.032*"face" + 0.019*"market" + 0.019*"jail" + 0.017*"high" + 0.015*"assault" + 0.014*"drug" + 0.014*"record" + 0.012*"case"

Score: 0.01429155748337

## LDA Tf Idf

In [31]:
'''
Define lda model using corpus_tfidf, again using gensim.models.LdaMulticore()
'''
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                             num_topics=10, 
                                             id2word = dictionary, 
                                             passes = 2, 
                                             workers=4)

In [32]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model_tfidf.print_topics(-1):
    
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")



Topic: 0 Word: 0.007*"north" + 0.006*"bushfire" + 0.005*"news" + 0.005*"west" + 0.005*"cyclone" + 0.005*"mental" + 0.004*"response" + 0.004*"health" + 0.004*"korea" + 0.004*"emergency"


Topic: 1 Word: 0.020*"close" + 0.007*"price" + 0.005*"swan" + 0.005*"whale" + 0.005*"australia" + 0.005*"aussie" + 0.005*"unveil" + 0.004*"economy" + 0.004*"world" + 0.004*"gippsland"


Topic: 2 Word: 0.010*"govt" + 0.010*"council" + 0.009*"rudd" + 0.009*"plan" + 0.008*"fund" + 0.006*"cut" + 0.006*"water" + 0.006*"union" + 0.006*"job" + 0.005*"urge"


Topic: 3 Word: 0.023*"charge" + 0.018*"court" + 0.015*"murder" + 0.013*"police" + 0.012*"accuse" + 0.011*"assault" + 0.011*"face" + 0.010*"jail" + 0.009*"child" + 0.009*"guilty"


Topic: 4 Word: 0.013*"interview" + 0.009*"climate" + 0.008*"change" + 0.006*"cancer" + 0.006*"govt" + 0.005*"smash" + 0.005*"rural" + 0.005*"patient" + 0.005*"horse" + 0.004*"murray"


Topic: 5 Word: 0.018*"miss" + 0.012*"search" + 0.010*"find" + 0.010*"obama" + 0.008*"body" + 0

In [33]:
final_words[959]

['yemen', 'chopper', 'crash', 'kill', 'eight', 'troop']

In [34]:
# test

'''
Check which topic our test document belongs to using the LDA TF IDF model.
'''
document_num = 959
# Our test document is document number 4310

for index, score in sorted(lda_model[corpus_tfidf[document_num]], key=lambda tup: -1*tup[1]):

    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))




Score: 0.32419654726982117	 
Topic: 0.024*"close" + 0.020*"coast" + 0.017*"kill" + 0.016*"gold" + 0.016*"continue" + 0.015*"force" + 0.014*"storm" + 0.014*"north" + 0.013*"protest" + 0.012*"pakistan"

Score: 0.26579779386520386	 
Topic: 0.075*"police" + 0.028*"crash" + 0.024*"death" + 0.019*"woman" + 0.014*"probe" + 0.014*"road" + 0.012*"find" + 0.012*"driver" + 0.012*"victim" + 0.012*"charge"

Score: 0.20173302292823792	 
Topic: 0.034*"take" + 0.024*"open" + 0.018*"trial" + 0.015*"accuse" + 0.013*"adelaide" + 0.011*"bail" + 0.011*"stand" + 0.010*"team" + 0.009*"stay" + 0.008*"grant"

Score: 0.029767390340566635	 
Topic: 0.017*"china" + 0.017*"warn" + 0.016*"talk" + 0.015*"kill" + 0.013*"attack" + 0.012*"iraq" + 0.010*"blast" + 0.010*"aust" + 0.009*"time" + 0.009*"rat"

Score: 0.029759394004940987	 
Topic: 0.040*"court" + 0.033*"charge" + 0.032*"face" + 0.019*"market" + 0.019*"jail" + 0.017*"high" + 0.015*"assault" + 0.014*"drug" + 0.014*"record" + 0.012*"case"

Score: 0.0297495238482

In [35]:
test="mayor, quit, council, function"

## Test modele

In [36]:
stemmer = SnowballStemmer("english")

### Model tuto (juste verbes)

In [37]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    
    result = []
    
    for token in gensim.utils.simple_preprocess(text) :
        
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            
            # TODO: Apply lemmatize_stemming() on the token, then add to the results list
            # print("###############################")
            # print(lemmatize_stemming(token))
            result.append(lemmatize_stemming(token))
    
    return result

In [38]:
unseen_document = "My favorite sports activities are running and swimming."

# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):

    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))



Score: 0.28808218240737915	 Topic: 0.034*"take" + 0.024*"open" + 0.018*"trial" + 0.015*"accuse" + 0.013*"adelaide"
Score: 0.2759249806404114	 Topic: 0.028*"miss" + 0.025*"hospital" + 0.021*"rise" + 0.019*"die" + 0.017*"search"
Score: 0.2609405517578125	 Topic: 0.024*"close" + 0.020*"coast" + 0.017*"kill" + 0.016*"gold" + 0.016*"continue"
Score: 0.0250107292085886	 Topic: 0.017*"china" + 0.017*"warn" + 0.016*"talk" + 0.015*"kill" + 0.013*"attack"
Score: 0.02500934712588787	 Topic: 0.040*"court" + 0.033*"charge" + 0.032*"face" + 0.019*"market" + 0.019*"jail"
Score: 0.025008687749505043	 Topic: 0.017*"first" + 0.016*"win" + 0.016*"australia" + 0.015*"return" + 0.014*"home"
Score: 0.025007696822285652	 Topic: 0.033*"govt" + 0.021*"fund" + 0.016*"health" + 0.016*"plan" + 0.015*"change"
Score: 0.025005271658301353	 Topic: 0.024*"fire" + 0.017*"urge" + 0.016*"test" + 0.013*"break" + 0.012*"resident"
Score: 0.025005271658301353	 Topic: 0.025*"water" + 0.021*"council" + 0.019*"plan" + 0.014*"go

### MOdel custom avec verbes noms et adverbes

In [39]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    w = text
    w = WordNetLemmatizer().lemmatize(w, pos='v').lower()
    w = WordNetLemmatizer().lemmatize(w, pos='n')
    w = WordNetLemmatizer().lemmatize(w, pos='a')
    return w

# Tokenize and lemmatize
def preprocess(text):
    
    result = []
    
    for token in gensim.utils.simple_preprocess(text) :
        
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            
            # TODO: Apply lemmatize_stemming() on the token, then add to the results list
            result.append(lemmatize_stemming(token))
    
    return result

In [40]:
unseen_document = "Lightning strikes in India kill 38 people"

# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):

    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))



Score: 0.5700391530990601	 Topic: 0.024*"close" + 0.020*"coast" + 0.017*"kill" + 0.016*"gold" + 0.016*"continue"
Score: 0.2965978980064392	 Topic: 0.017*"china" + 0.017*"warn" + 0.016*"talk" + 0.015*"kill" + 0.013*"attack"
Score: 0.01667250134050846	 Topic: 0.040*"court" + 0.033*"charge" + 0.032*"face" + 0.019*"market" + 0.019*"jail"
Score: 0.016671404242515564	 Topic: 0.024*"fire" + 0.017*"urge" + 0.016*"test" + 0.013*"break" + 0.012*"resident"
Score: 0.016670815646648407	 Topic: 0.075*"police" + 0.028*"crash" + 0.024*"death" + 0.019*"woman" + 0.014*"probe"
Score: 0.016670165583491325	 Topic: 0.025*"water" + 0.021*"council" + 0.019*"plan" + 0.014*"govt" + 0.014*"group"
Score: 0.01667007803916931	 Topic: 0.034*"take" + 0.024*"open" + 0.018*"trial" + 0.015*"accuse" + 0.013*"adelaide"
Score: 0.01666966639459133	 Topic: 0.033*"govt" + 0.021*"fund" + 0.016*"health" + 0.016*"plan" + 0.015*"change"
Score: 0.016669200733304024	 Topic: 0.017*"first" + 0.016*"win" + 0.016*"australia" + 0.015*"r

In [41]:
!pip install pyldavis

Collecting pyldavis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.7 MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting numpy>=1.20.0
  Downloading numpy-1.21.0-cp38-cp38-macosx_10_9_x86_64.whl (16.9 MB)
[K     |████████████████████████████████| 16.9 MB 2.5 MB/s eta 0:00:01
Collecting funcy
  Downloading funcy-1.16-py2.py3-none-any.whl (32 kB)
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting pandas>=1.2.0
  Downloading pandas-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl (11.4 MB)
[K     |████████████████████████████████| 11.4 MB 566 kB/s eta 0:00:01
Building wheels for collected packages: pyldavis, sklearn
  Building wheel for pyldavis (PEP 517) ... [?25ldone
[?25h  Created wheel for pyldavis: filename=pyLDAvis-3.3.1-py2.py3-

In [84]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from operator import itemgetter

  and should_run_async(code)


In [102]:
get_max_score_topic = lda_model_tfidf.get_document_topics(corpus_tfidf[0])
max(get_max_score_topic, key=itemgetter(1))

  and should_run_async(code)


(0, 0.45845973)

In [101]:
get_max_score_topics = lda_model_tfidf.get_document_topics(corpus_tfidf[:10])
scores = []
for score in get_max_score_topics:
    scores.append((max(score, key=itemgetter(1))))
scores

  and should_run_async(code)


[(0, 0.4581894),
 (3, 0.3922062),
 (0, 0.26929396),
 (2, 0.6993244),
 (6, 0.40034476),
 (3, 0.694104),
 (1, 0.7143104),
 (8, 0.3521499),
 (2, 0.40077934),
 (1, 0.42290905)]