## Librairies

In [1]:
import pandas as pd
import numpy as np

## Exploring the dataset

In [2]:
df = pd.read_json("News_Category_Dataset_v2.json", lines=True, dtype={"headline": str})

In [3]:
df.head(10)

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26
5,ENTERTAINMENT,Morgan Freeman 'Devastated' That Sexual Harass...,Ron Dicker,https://www.huffingtonpost.com/entry/morgan-fr...,"""It is not right to equate horrific incidents ...",2018-05-26
6,ENTERTAINMENT,Donald Trump Is Lovin' New McDonald's Jingle I...,Ron Dicker,https://www.huffingtonpost.com/entry/donald-tr...,"It's catchy, all right.",2018-05-26
7,ENTERTAINMENT,What To Watch On Amazon Prime That’s New This ...,Todd Van Luling,https://www.huffingtonpost.com/entry/amazon-pr...,There's a great mini-series joining this week.,2018-05-26
8,ENTERTAINMENT,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,Andy McDonald,https://www.huffingtonpost.com/entry/mike-myer...,"Myer's kids may be pushing for a new ""Powers"" ...",2018-05-26
9,ENTERTAINMENT,What To Watch On Hulu That’s New This Week,Todd Van Luling,https://www.huffingtonpost.com/entry/hulu-what...,You're getting a recent Academy Award-winning ...,2018-05-26


- **Constitutions des mots de notre dataset**

In [5]:
#Cette fonction nous permet de recupérer les mots des phrases

def word_split(df):
    new_df = []
    for i in df:
        new_df.append(i.split(" "))
    return new_df

In [7]:
word_dataset = word_split(df["headline"].tolist())
word_dataset

[['There',
  'Were',
  '2',
  'Mass',
  'Shootings',
  'In',
  'Texas',
  'Last',
  'Week,',
  'But',
  'Only',
  '1',
  'On',
  'TV'],
 ['Will',
  'Smith',
  'Joins',
  'Diplo',
  'And',
  'Nicky',
  'Jam',
  'For',
  'The',
  '2018',
  'World',
  "Cup's",
  'Official',
  'Song'],
 ['Hugh',
  'Grant',
  'Marries',
  'For',
  'The',
  'First',
  'Time',
  'At',
  'Age',
  '57'],
 ['Jim',
  'Carrey',
  'Blasts',
  "'Castrato'",
  'Adam',
  'Schiff',
  'And',
  'Democrats',
  'In',
  'New',
  'Artwork'],
 ['Julianna',
  'Margulies',
  'Uses',
  'Donald',
  'Trump',
  'Poop',
  'Bags',
  'To',
  'Pick',
  'Up',
  'After',
  'Her',
  'Dog'],
 ['Morgan',
  'Freeman',
  "'Devastated'",
  'That',
  'Sexual',
  'Harassment',
  'Claims',
  'Could',
  'Undermine',
  'Legacy'],
 ['Donald',
  'Trump',
  'Is',
  "Lovin'",
  'New',
  "McDonald's",
  'Jingle',
  'In',
  "'Tonight",
  "Show'",
  'Bit'],
 ['What',
  'To',
  'Watch',
  'On',
  'Amazon',
  'Prime',
  'That’s',
  'New',
  'This',
  'Week'

- **Total de chaque mot dans le dataset**

In [28]:
from itertools import chain
def count_word(df):
    new_df = chain.from_iterable(df)
    word_count = pd.Series(new_df).value_counts()
    word_count = pd.DataFrame({"Word": word_count.index, "Count": word_count.values})
    return word_count

In [30]:
count_word(word_dataset).head(10)

Unnamed: 0,Word,Count
0,The,47803
1,To,38569
2,A,24839
3,In,24141
4,Of,22956
5,For,18788
6,Is,16823
7,And,15137
8,On,13642
9,With,12556


## Preprocessing

- **Data Quality**

In [36]:
def data_quality(df):
    assert all([isinstance(t, str) for t in df])
    assert all([t != np.nan for t in df])
    return True

In [37]:
def force_format(texts):
    return [str(t) for t in texts]

In [35]:
data_quality(df['headline'])

True

In [42]:
new_df =force_format(df["headline"])
new_df

['There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV',
 "Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song",
 'Hugh Grant Marries For The First Time At Age 57',
 "Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork",
 'Julianna Margulies Uses Donald Trump Poop Bags To Pick Up After Her Dog',
 "Morgan Freeman 'Devastated' That Sexual Harassment Claims Could Undermine Legacy",
 "Donald Trump Is Lovin' New McDonald's Jingle In 'Tonight Show' Bit",
 'What To Watch On Amazon Prime That’s New This Week',
 "Mike Myers Reveals He'd 'Like To' Do A Fourth Austin Powers Film",
 'What To Watch On Hulu That’s New This Week',
 'Justin Timberlake Visits Texas School Shooting Victims',
 "South Korean President Meets North Korea's Kim Jong Un To Talk Trump Summit",
 'With Its Way Of Life At Risk, This Remote Oyster-Growing Region Called In Robots',
 "Trump's Crackdown On Immigrant Parents Puts More Kids In An Already Strained System",
 "'Trump's

- **Filtering texts**

1. remove html balise

In [45]:
from bs4 import BeautifulSoup
def remove_html(df):
    text = BeautifulSoup(df, 'lxml').text
    return text.get_text()

In [48]:
tk = [remove_html(t) for t in new_df]
tk

['There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV',
 "Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song",
 'Hugh Grant Marries For The First Time At Age 57',
 "Jim Carrey Blasts 'Castrato' Adam Schiff And Democrats In New Artwork",
 'Julianna Margulies Uses Donald Trump Poop Bags To Pick Up After Her Dog',
 "Morgan Freeman 'Devastated' That Sexual Harassment Claims Could Undermine Legacy",
 "Donald Trump Is Lovin' New McDonald's Jingle In 'Tonight Show' Bit",
 'What To Watch On Amazon Prime That’s New This Week',
 "Mike Myers Reveals He'd 'Like To' Do A Fourth Austin Powers Film",
 'What To Watch On Hulu That’s New This Week',
 'Justin Timberlake Visits Texas School Shooting Victims',
 "South Korean President Meets North Korea's Kim Jong Un To Talk Trump Summit",
 'With Its Way Of Life At Risk, This Remote Oyster-Growing Region Called In Robots',
 "Trump's Crackdown On Immigrant Parents Puts More Kids In An Already Strained System",
 "'Trump's

2. remove punctuations

In [66]:
import string
def remove_punctuation(df):
    no_punc = ''.join([c for c in df if c not in string.punctuation])
    return no_punc

In [68]:
punct = [remove_punctuation(t) for t in tk]
punct

['There Were 2 Mass Shootings In Texas Last Week But Only 1 On TV',
 'Will Smith Joins Diplo And Nicky Jam For The 2018 World Cups Official Song',
 'Hugh Grant Marries For The First Time At Age 57',
 'Jim Carrey Blasts Castrato Adam Schiff And Democrats In New Artwork',
 'Julianna Margulies Uses Donald Trump Poop Bags To Pick Up After Her Dog',
 'Morgan Freeman Devastated That Sexual Harassment Claims Could Undermine Legacy',
 'Donald Trump Is Lovin New McDonalds Jingle In Tonight Show Bit',
 'What To Watch On Amazon Prime That’s New This Week',
 'Mike Myers Reveals Hed Like To Do A Fourth Austin Powers Film',
 'What To Watch On Hulu That’s New This Week',
 'Justin Timberlake Visits Texas School Shooting Victims',
 'South Korean President Meets North Koreas Kim Jong Un To Talk Trump Summit',
 'With Its Way Of Life At Risk This Remote OysterGrowing Region Called In Robots',
 'Trumps Crackdown On Immigrant Parents Puts More Kids In An Already Strained System',
 'Trumps Son Should Be Conc

- **Unify Text**

In [94]:
def convert_lower(df):
    new_df = []
    for i in df:
        new_df.append(i.split(" "))
    for i in range(len(new_df)):
        for j in range(len(new_df[i])):
            new_df[i][j] = new_df[i][j].lower()
    return new_df

In [95]:
sent = convert_lower(punct)

In [96]:
sent

[['there',
  'were',
  '2',
  'mass',
  'shootings',
  'in',
  'texas',
  'last',
  'week',
  'but',
  'only',
  '1',
  'on',
  'tv'],
 ['will',
  'smith',
  'joins',
  'diplo',
  'and',
  'nicky',
  'jam',
  'for',
  'the',
  '2018',
  'world',
  'cups',
  'official',
  'song'],
 ['hugh',
  'grant',
  'marries',
  'for',
  'the',
  'first',
  'time',
  'at',
  'age',
  '57'],
 ['jim',
  'carrey',
  'blasts',
  'castrato',
  'adam',
  'schiff',
  'and',
  'democrats',
  'in',
  'new',
  'artwork'],
 ['julianna',
  'margulies',
  'uses',
  'donald',
  'trump',
  'poop',
  'bags',
  'to',
  'pick',
  'up',
  'after',
  'her',
  'dog'],
 ['morgan',
  'freeman',
  'devastated',
  'that',
  'sexual',
  'harassment',
  'claims',
  'could',
  'undermine',
  'legacy'],
 ['donald',
  'trump',
  'is',
  'lovin',
  'new',
  'mcdonalds',
  'jingle',
  'in',
  'tonight',
  'show',
  'bit'],
 ['what',
  'to',
  'watch',
  'on',
  'amazon',
  'prime',
  'that’s',
  'new',
  'this',
  'week'],
 ['mike