# Documentation

https://nlp.stanford.edu/IR-book/html/htmledition/tokenization-1.html  
https://nlp.stanford.edu/IR-book/html/htmledition/normalization-equivalence-classing-of-terms-1.html#sec:normalization  
https://stackoverflow.com/questions/39302880/getting-the-root-word-using-the-wordnet-lemmatizer  
http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html  
https://spacy.io/usage/linguistic-features  
https://medium.com/@datamonsters/sentiment-analysis-tools-overview-part-1-positive-and-negative-words-databases-ae35431a470c  
http://www.derczynski.com/sheffield/papers/twitter_pos.pdf  
http://www.aclweb.org/anthology/C14-1166

# Modules

In [8]:
import numpy as np
import json
import csv
import pandas as pd

In [9]:
with open('data/json_datas_sample.json',encoding='iso-8859-1') as f:
    data = json.load(f)

In [10]:
data = pd.read_csv('data/csv_datas_sample.csv',encoding='iso-8859-1',delimiter='\t')

In [11]:
data['tweet_text']

0      L'AcadÃ©mie franÃ§aise rejette le choix de l'a...
1      Jeux olympiques 2016 : Ã  Rio, des installatio...
2      Quand est ce que chacun s'occupe de son domain...
3      #JO2024 : l' #AcadÃ©mieFranÃ§aise peste contre...
4      Mobilisation contre le slogan pourri de #Paris...
5      Ce soir, le partage au â¤ï¸ de la rÃ©union d...
6      Soutenons #Paris2024 avec la #RCValdeLoire le ...
7      @_Stalker_69_ // Et la mÃ¨re Hidalgo qui nous ...
8      https://t.co/Ufnloou7OK via @academie_fr #Pari...
9      @lequipe Soutenir la candidature #Paris2024 n'...
10     LâAcadÃ©mie franÃ§aise exprime Ã  lâunanim...
11     L'acadÃ©mie franÃ§aise pas contente du slogan ...
12     La @FDJ lance un jeu Ã  gratter spÃ©cial #Pari...
13     Service inadÃ©quate ; dÃ©terioration mÃ©caniqu...
14     #JO2024 : l'AcadÃ©mie franÃ§aise s'Ã©nerve con...
15     La #Generation2024 trÃ¨s concentrÃ©e pour la r...
16     Les #TransportsIDF pour les Jeux Olympiques : ...
17     [#PARIS2024] Les partici

In [12]:
from nltk.tokenize import word_tokenize
import nltk

In [13]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /mnt/data/zwanto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /mnt/data/zwanto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /mnt/data/zwanto/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [41]:
data.loc[0,'tweet_text']

"L'AcadÃ©mie franÃ§aise rejette le choix de l'anglais pour le slogan de la candidature de Paris #JO2024 #MadeForSharing https://t.co/7IjwlIvFBw"

In [14]:
tokenizer = word_tokenize(data.loc[0,'tweet_text'],language = 'french')

In [46]:
tokenizer

["L'AcadÃ©mie",
 'franÃ§aise',
 'rejette',
 'le',
 'choix',
 'de',
 "l'anglais",
 'pour',
 'le',
 'slogan',
 'de',
 'la',
 'candidature',
 'de',
 'Paris',
 '#',
 'JO2024',
 '#',
 'MadeForSharing',
 'https',
 ':',
 '//t.co/7IjwlIvFBw']

In [66]:
nltk.pos_tag(tokenizer)

[("L'AcadÃ©mie", 'NNP'),
 ('franÃ§aise', 'NN'),
 ('rejette', 'NN'),
 ('le', 'JJ'),
 ('choix', 'NN'),
 ('de', 'IN'),
 ("l'anglais", 'FW'),
 ('pour', 'FW'),
 ('le', 'FW'),
 ('slogan', 'FW'),
 ('de', 'FW'),
 ('la', 'FW'),
 ('candidature', 'NN'),
 ('de', 'IN'),
 ('Paris', 'NNP'),
 ('#', '#'),
 ('JO2024', 'NNP'),
 ('#', '#'),
 ('MadeForSharing', 'VBG'),
 ('https', 'NN'),
 (':', ':'),
 ('//t.co/7IjwlIvFBw', 'NN')]

In [51]:
from nltk.stem.wordnet import WordNetLemmatizer

In [53]:
stemmer = WordNetLemmatizer()

In [59]:
stemmer.lemmatize(tokenizer[1])

'franÃ§aise'

In [60]:
nltk.pos_tag_sents(tokenizer[1])

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  Searched in:
    - '/mnt/data/zwanto/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/srv/venv/sandbox/nltk_data'
    - '/srv/venv/sandbox/share/nltk_data'
    - '/srv/venv/sandbox/lib/nltk_data'
**********************************************************************


In [1]:
import nltk

In [6]:
nltk.download('stopwords')
nltk.corpus.stopwords.words('french')

[nltk_data] Downloading package stopwords to
[nltk_data]     /mnt/data/zwanto/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['au',
 'aux',
 'avec',
 'ce',
 'ces',
 'dans',
 'de',
 'des',
 'du',
 'elle',
 'en',
 'et',
 'eux',
 'il',
 'je',
 'la',
 'le',
 'leur',
 'lui',
 'ma',
 'mais',
 'me',
 'même',
 'mes',
 'moi',
 'mon',
 'ne',
 'nos',
 'notre',
 'nous',
 'on',
 'ou',
 'par',
 'pas',
 'pour',
 'qu',
 'que',
 'qui',
 'sa',
 'se',
 'ses',
 'son',
 'sur',
 'ta',
 'te',
 'tes',
 'toi',
 'ton',
 'tu',
 'un',
 'une',
 'vos',
 'votre',
 'vous',
 'c',
 'd',
 'j',
 'l',
 'à',
 'm',
 'n',
 's',
 't',
 'y',
 'été',
 'étée',
 'étées',
 'étés',
 'étant',
 'étante',
 'étants',
 'étantes',
 'suis',
 'es',
 'est',
 'sommes',
 'êtes',
 'sont',
 'serai',
 'seras',
 'sera',
 'serons',
 'serez',
 'seront',
 'serais',
 'serait',
 'serions',
 'seriez',
 'seraient',
 'étais',
 'était',
 'étions',
 'étiez',
 'étaient',
 'fus',
 'fut',
 'fûmes',
 'fûtes',
 'furent',
 'sois',
 'soit',
 'soyons',
 'soyez',
 'soient',
 'fusse',
 'fusses',
 'fût',
 'fussions',
 'fussiez',
 'fussent',
 'ayant',
 'ayante',
 'ayantes',
 'ayants',
 'eu'

In [1]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize('Eighty-seven miles to go, yet.  Onward!')


['Eighty', 'seven', 'miles', 'to', 'go', 'yet', 'Onward']

In [2]:
import nltk
def getTerms(sentences):
    tokens = nltk.word_tokenize(sentences)
    words = [w.lower() for w in tokens if w.isalnum()]
    print(tokens)
    print(words)

getTerms("hh, hh3h. wo shi 2 4 A . fdffdf. A&&B ")

['hh', ',', 'hh3h', '.', 'wo', 'shi', '2', '4', 'A', '.', 'fdffdf', '.', 'A', '&', '&', 'B']
['hh', 'hh3h', 'wo', 'shi', '2', '4', 'a', 'fdffdf', 'a', 'b']


In [2]:
from spellchecker import SpellChecker

spell = SpellChecker()

# find those words that may be misspelled
misspelled = spell.unknown(['something', 'is', 'hapenning', 'here'])

for word in misspelled:
    # Get the one `most likely` answer
    print(spell.correction(word))

    # Get a list of `likely` options
    print(spell.candidates(word))

happening
{'penning', 'henning', 'happening'}


In [27]:
spell = SpellChecker(language='fr')
a=tokenizer[2].encode('utf-8')
spell.unknown(['bnjour'])

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 438: ordinal not in range(128)

In [18]:
unicode(tokenizer,'utf-8')

NameError: name 'unicode' is not defined

In [2]:
import enchant
from nltk.metrics import edit_distance

class SpellingReplacer(object):
    def __init__(self, dict_name = 'fr_FR', max_dist = 2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = 2

    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)

        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        else:
            return word

In [3]:
def spell_check(word_list):
    checked_list = []
    for item in word_list:
        replacer = SpellingReplacer()
        r = replacer.replace(item)
        checked_list.append(r)
    return checked_list

In [4]:
spell_check(['bnjour'])

['bonjour']

In [34]:
enchant.list_dicts()

[('en', <Enchant: Aspell Provider>),
 ('en_CA', <Enchant: Aspell Provider>),
 ('en_GB', <Enchant: Aspell Provider>),
 ('en_US', <Enchant: Aspell Provider>)]