# Joseph ASSOUMA, Issa DIA, Thomas MARGNAC et Carla ZEIDAN

## Importation des librairies :

In [1]:
import pandas as pd
#import autocorrect
import string
import unidecode
import html
from gensim.models.phrases import Phrases
import nltk
from nltk.corpus import stopwords
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Joseph\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Importation du dataset :

In [2]:
pathToDataset = "News_Category_Dataset_v2.json"
df = pd.read_json(pathToDataset, lines=True, dtype={"headline": str})

## On s'assure de la qualité des données :

In [3]:
def checkQuality(textList):
    """
    Description
    -----------------
    Check the quality of the dataset.
    
    Param
    -----------------
    textList: List of texts of the dataset.
    
    Return
    -----------------
    Boolean : True if quality if OK.
    """
    # Checking if each element is a string
    assert all([isinstance(text, str) for text in textList])
    return True

## Filtering texts from unwanted characters

In [4]:
def unwanted_char_removing(textList):
    """
    Description
    -----------------
    Remove all unwanted characters in sentences.
    
    Param
    -----------------
    textList: List of texts of the dataset.
    
    Return
    -----------------
    unw_char: List of texts of the dataset without unwanted characters.
    """
    unw_char = []
    for text in textList:
        unw_char.append(html.unescape(text))
    return unw_char

## Unifying our texts:

In [5]:
def punctuation_removing(textList):
    """
    Description
    -----------------
    Remove all punctuation in sentences.
    
    Param
    -----------------
    textList: List of texts of the dataset.
    
    Return
    -----------------
    without_punc: List of texts of the dataset without punctuation.
    """
    without_punc = []
    for text in textList:
        tmp = "".join([i for i in text if i not in string.punctuation])
        tmp = tmp.replace("‘","")
        without_punc.append(tmp.replace("’"," "))
    return without_punc

In [6]:
def accent_removing(textList):
    """
    Description
    -----------------
    Remove all accent in sentences.
    
    Param
    -----------------
    textList: List of texts of the dataset.
    
    Return
    -----------------
    without_accent: List of texts of the dataset without accent.
    """
    without_accent = []
    for text in textList:
        without_accent.append(unidecode.unidecode(text))
    return without_accent

In [7]:
def text_to_lowercase(textList):
    """
    Description
    -----------------
    Convert text to lowercase.
    
    Param
    -----------------
    textList: List of texts of the dataset.
    
    Return
    -----------------
    lowercase: List of texts of the dataset in lowercase.
    """
    lowercase = []
    for text in textList:
        lowercase.append(text.lower())
    return lowercase

## Converting texts to list of words

In [8]:
def text_to_list(textList):
    """
    Description
    -----------------
    Convert text to list of words.
    
    Param
    -----------------
    textList: List of texts of the dataset.
    
    Return
    -----------------
    word_list: List of texts of the dataset in list of words.
    """
    word_list = []
    for text in textList:
        word_list.append(text.split())
    return word_list

## Remove useless words

In [9]:
def useless_words_removing(textList):
    """
    Description
    -----------------
    Remove stopwords from texts.
    
    Param
    -----------------
    textList: List of texts of the dataset.
    
    Return
    -----------------
    with_useful_words: List of texts of the dataset without stopwords.
    """
    stopwords_ =set(stopwords.words('english'))
    with_useful_words = []
    for text in textList:
        with_useful_words.append([word for word in text if not word in stopwords_])
    return with_useful_words

## N-gram creation

In [10]:
def n_gram_creation(textList):
    """
    Description
    -----------------
    Find 2-gram and 3-gram in a sentence.
    
    Param
    -----------------
    textList: List of texts of the dataset.
    
    Return
    -----------------
    with_n_gram: List of texts of the dataset with 2-gram and 3-gram.
    """
    bigram = Phrases(textList)
    trigram = Phrases(bigram[textList])
    with_n_gram = []
    for text in textList:
        with_n_gram.append(trigram[bigram[text]])
    return with_n_gram

## Lemmatize

In [11]:
def lemmatization(textList):
    """
    Description
    -----------------
    Lemmatize words.
    
    Param
    -----------------
    textList: List of texts of the dataset.
    
    Return
    -----------------
    lemmatized: List of texts of the dataset with lemmatized words.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for text in textList:
        sentence = []
        for word in text:
            sentence.append(lemmatizer.lemmatize(word))
        lemmatized.append(sentence)
    return lemmatized

## Part-of-speech tagging

In [12]:
def tagging(textList):
    """
    Description
    -----------------
    Identify the role of words in a sentence.
    
    Param
    -----------------
    textList: List of texts of the dataset.
    
    Return
    -----------------
    tagged: List of texts of the dataset with words roles.
    """
    tagged = []
    for text in textList:
        tagged.append(pos_tag(text))
    return tagged

To fix:
* Will Smith

In [13]:
to_watch = 73 # 7, 21, 73
print('unwanted_char_removing')
texts = unwanted_char_removing(df['headline'].to_list())
print(texts[to_watch])
print('punctuation_removing')
texts = punctuation_removing(texts)
print(texts[to_watch])
print('accent_removing')
texts = accent_removing(texts)
print(texts[to_watch])
print('text_to_lowercase')
texts = text_to_lowercase(texts)
print(texts[to_watch])
print('text_to_list')
texts = text_to_list(texts)
print(texts[to_watch])
print('useless_words_removing')
texts = useless_words_removing(texts)
print(texts[to_watch])
print('n_gram_creation')
texts = n_gram_creation(texts)
print(texts[to_watch])
tmp = texts
print('lemmatization')
texts = lemmatization(texts)
print(texts[to_watch])
#print('tagging')
#texts = tagging(texts)
#texts[to_watch]

unwanted_char_removing
North Korea Threatens Again To Call Off Trump Summit, Warns Of ‘Nuclear Showdown’
punctuation_removing
North Korea Threatens Again To Call Off Trump Summit Warns Of Nuclear Showdown 
accent_removing
North Korea Threatens Again To Call Off Trump Summit Warns Of Nuclear Showdown 
text_to_lowercase
north korea threatens again to call off trump summit warns of nuclear showdown 
text_to_list
['north', 'korea', 'threatens', 'again', 'to', 'call', 'off', 'trump', 'summit', 'warns', 'of', 'nuclear', 'showdown']
useless_words_removing
['north', 'korea', 'threatens', 'call', 'trump', 'summit', 'warns', 'nuclear', 'showdown']
n_gram_creation
['north_korea_threatens', 'call', 'trump', 'summit', 'warns', 'nuclear', 'showdown']
lemmatization
['north_korea_threatens', 'call', 'trump', 'summit', 'warns', 'nuclear', 'showdown']


In [14]:
texts

[['2', 'mass_shootings', 'texas', 'last_week', '1', 'tv'],
 ['smith',
  'join',
  'diplo',
  'nicky',
  'jam',
  '2018',
  'world',
  'cup',
  'official',
  'song'],
 ['hugh', 'grant', 'marries', 'first_time', 'age', '57'],
 ['jim_carrey',
  'blast',
  'castrato',
  'adam_schiff',
  'democrat',
  'new',
  'artwork'],
 ['julianna', 'margulies', 'us', 'donald_trump', 'poop', 'bag', 'pick', 'dog'],
 ['morgan_freeman',
  'devastated',
  'sexual_harassment_claims',
  'could',
  'undermine',
  'legacy'],
 ['donald_trump',
  'lovin',
  'new',
  'mcdonalds',
  'jingle',
  'tonight_show',
  'bit'],
 ['watch', 'amazon_prime', 'new', 'week'],
 ['mike',
  'myers',
  'reveals',
  'hed',
  'like',
  'fourth',
  'austin',
  'power',
  'film'],
 ['watch_hulu', 'new', 'week'],
 ['justin_timberlake', 'visit', 'texas', 'school_shooting', 'victim'],
 ['south_korean_president',
  'meet',
  'north_korea',
  'kim_jong_un',
  'talk',
  'trump',
  'summit'],
 ['way',
  'life',
  'risk',
  'remote',
  'oystergr

In [15]:
import itertools
def compute_word_occurences(texts):
    """You have to define this function yourself. """
    
    words = itertools.chain.from_iterable(texts)
    
    word_count = pd.Series(words).value_counts()
    word_count = pd.DataFrame({"Word": word_count.index, "Count": word_count.values})

    return word_count

In [16]:
tex = compute_word_occurences(texts)

In [17]:
tex

Unnamed: 0,Word,Count
0,photo,10909
1,trump,6442
2,video,5440
3,new,5173
4,say,3980
...,...,...
69739,crispness,1
69740,porous,1
69741,overreacting,1
69742,freakier,1


In [20]:
tex['Word']

0               photo
1               trump
2               video
3                 new
4                 say
             ...     
69739       crispness
69740          porous
69741    overreacting
69742        freakier
69743        achiever
Name: Word, Length: 69744, dtype: object

In [19]:
import gensim.corpora as corpora

#create dictionary
dic = corpora.Dictionary(tex['Word'])

TypeError: doc2bow expects an array of unicode tokens on input, not a single string