# Importation de Data

# <br></br>

In [207]:
import numpy as np
import pandas as pd
# Pour la visualisation
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
# Natural Language Toolkit
from nltk.stem import WordNetLemmatizer


In [208]:
# importation du dataset
df = pd.read_excel("Data-FakeRealCOVID.xlsx", header=0)

In [209]:
#Lire les premiers elements
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [210]:
#Afficher les colonnes
df.columns

Index(['id', 'tweet', 'label'], dtype='object')

In [211]:
#Details sur les donnees
print("lenght : ",len(df))
print("size :",df.size)
print("shape : ",df.shape)

lenght :  6420
size : 19260
shape :  (6420, 3)


In [212]:
#
df.info(verbose  = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6420 entries, 0 to 6419
Columns: 3 entries, id to label
dtypes: int64(1), object(2)
memory usage: 150.6+ KB


In [213]:
#Type de donnees
df.dtypes

id        int64
tweet    object
label    object
dtype: object

In [214]:
#Vérification des données manquantes
df.columns[df.isnull().any()]

Index([], dtype='object')

On a pas de données manquantes 

In [215]:
# voir les valeurs possibles de la colonne label
df['label'].unique()

array(['real', 'fake'], dtype=object)

In [216]:
#Nombre de valeurs possibles pour la colonne label
df['label'].nunique()

2

In [217]:
#Importer Bokeh
output_notebook()

# Afficher les tweets avec leurs labels

# <br></br>

In [218]:
# Visualisation label, index et valeur  

label = df.groupby('label').count()
idx = label.index.tolist()
values= label['tweet'].tolist()
print(idx)
print(values)
label

['fake', 'real']
[3060, 3360]


Unnamed: 0_level_0,id,tweet
label,Unnamed: 1_level_1,Unnamed: 2_level_1
fake,3060,3060
real,3360,3360


In [219]:
#Visualisation de la distibutions de data en fct de label
p = figure(x_range=idx, title="Distribution of data",
           toolbar_location=None, tools="")

p.vbar(x=idx, top=values, width=0.9)
p.xgrid.grid_line_color = None

show(p)

In [220]:
#Reduire les colonnes en tweet et label
data=df[['tweet','label']]
data

Unnamed: 0,tweet,label
0,The CDC currently reports 99031 deaths. In gen...,real
1,States reported 1121 deaths a small rise from ...,real
2,Politically Correct Woman (Almost) Uses Pandem...,fake
3,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,Populous states can generate large case counts...,real
...,...,...
6415,A tiger tested positive for COVID-19 please st...,fake
6416,???Autopsies prove that COVID-19 is??� a blood...,fake
6417,_A post claims a COVID-19 vaccine has already ...,fake
6418,Aamir Khan Donate 250 Cr. In PM Relief Cares Fund,fake


# Préparation des données

<br></br>
## 1. Diviser le dataset 

##### Données d'Apprentissage

In [253]:
from sklearn.model_selection import train_test_split
import numpy

# Split data in training and remaining dataset
data_train,data_remaining,label_train, label_remaining = train_test_split(df['tweet'], 
                         df['label'], 
                         train_size=0.8)
trainnig_data=pd.DataFrame(list(zip(data_train,label_train)), columns=['tweet', 'label'])
trainnig_data.head()


Unnamed: 0,tweet,label
0,An Ayurveda practitioner Devender Sharma gave ...,fake
1,COVID-19 Update We have 3 new confirmed cases ...,real
2,“Since the onset of #COVID19Nigeria we‘ve trai...,real
3,In Kerala India Consumerfed starts 24 hours fr...,fake
4,"""Killing Virus Bacteria UV sterilization +Ozon...",fake


#### Données de Validation, Données de Test

In [254]:
# Now since we want the valid and test size to be equal (10% each of overall data). 
# we have to define valid_size=0.5 (that is 50% of remaining data)
data_valid, data_test, label_valid, label_test = train_test_split(data_remaining, 
                                                                         label_remaining , 
                                                                         test_size=0.5)

In [255]:
# Afficher la taille de chaque data
print(data_train.shape,label_train.shape)
print(data_test.shape,label_test.shape)
print(data_valid.shape,label_valid.shape)

(5136,) (5136,)
(642,) (642,)
(642,) (642,)


## 2. Normaliser les données

In [256]:
#Remplacer real par 1 et fake par 0
trainnig_data['label'] =trainnig_data['label'].replace('real',1)
trainnig_data['label']= trainnig_data['label'].replace('fake',0)
trainnig_data['label']

0       0
1       1
2       1
3       0
4       0
       ..
5131    0
5132    1
5133    1
5134    0
5135    1
Name: label, Length: 5136, dtype: int64

In [257]:
# voir les valeurs possibles de la colonne label
trainnig_data['label'].unique()

array([0, 1], dtype=int64)

In [259]:
#Definir Data_real, Data_fake et Dataset
data_real = trainnig_data[trainnig_data['label']== 1]
data_fake = trainnig_data[trainnig_data['label'] == 0]
trainnig_data = pd.concat([data_real, data_fake])
trainnig_data.head()

Unnamed: 0,tweet,label
1,COVID-19 Update We have 3 new confirmed cases ...,1
2,“Since the onset of #COVID19Nigeria we‘ve trai...,1
7,Many of these deaths are in people under 70 ye...,1
8,WHO is working through our 150 country offices...,1
13,779 new cases of #COVID19Nigeria; Lagos-285 Ri...,1


### 2.1 Nettoyage de Data

In [160]:
# Fonctions pour nettoyage des données
import re

In [161]:
#Supprimer les emojis 
def deEmojify(text):
    return text.encode("ascii", "ignore").decode()

In [162]:
# Séparer les hashtags en des mots 
def clean_hash(text):
    s = ""
    for word in str(text).split():
        if word.startswith("#"):
            word=  " ".join([a for a in re.split('([A-Z][a-z]+)', word) if a])
        s+= word+' '
    return s

In [163]:
# Supprimer les mentions 
def remove_mentions(text):
    return re.sub("@[A-Za-z0-9_]+","", text)

In [164]:
#Supprimer les urls 
def clean_url(text):
    return re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", text)


In [165]:
#Supprimer la ponctuation 
import string
punctuations = string.punctuation
def clean_punctuation(text):
    trs = str.maketrans('', '', punctuations)
    return text.translate(trs)

In [166]:
#Supprimer les nombres 
def clean_numbers(text):
    return re.sub('[0-9]+', '', text)

Supprimer les stopwords 

In [167]:
#Génération de la liste des "mots vide"(stopwords) avec nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords= stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [168]:
#Afficher les stopwords de l'anglais
print(*stopwords)

i me my myself we our ours ourselves you you're you've you'll you'd your yours yourself yourselves he him his himself she she's her hers herself it it's its itself they them their theirs themselves what which who whom this that that'll these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don don't should should've now d ll m o re ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't won won't wouldn wouldn't


In [169]:
#Ajouter les stopword dans une liste
STOPWORDS= set(stopwords)

In [170]:
#Supprimer les stopwords 
def clean_stopword(text):
    s = ""
    for word in str(text).split():
        if word not in STOPWORDS:
             s+=word+" "
    return s

In [260]:
#Appliquer toutes ces oprerations sur notre Dataset
trainnig_data['tweet'] = trainnig_data['tweet'].apply(lambda text: deEmojify(text))
trainnig_data['tweet']= trainnig_data['tweet'].apply(lambda text: clean_hash(text))
trainnig_data['tweet'] = trainnig_data['tweet'].apply(lambda text: remove_mentions(text))
trainnig_data['tweet']= trainnig_data['tweet'].apply(lambda text: clean_url(text))
trainnig_data['tweet']= trainnig_data['tweet'].str.lower()
trainnig_data['tweet']= trainnig_data['tweet'].apply(lambda text: clean_stopword(text))
trainnig_data['tweet'] = trainnig_data['tweet'].apply(lambda text: clean_punctuation(text))
trainnig_data['tweet']= trainnig_data['tweet'].apply(lambda text: clean_numbers(text))
trainnig_data

Unnamed: 0,tweet,label
1,covid update new confirmed cases covid one ca...,1
2,since onset covid nigeria weve trained health...,1
7,many deaths people years age occur low middle...,1
8,working country offices support response covi...,1
13,new cases covid nigeria lagos rivers fct edo...,1
...,...,...
5122,everyone stock essentials milk ration medicine...,0
5128,word coronavirus disinfecting spray reaffirms ...,0
5130,coronavirus vaccines change recipients dna the...,0
5131,year old canadian music awards show junos canc...,0


### 2.2 Indexation : Méthode Bag of words

#### Tokenisation des tweets avec NLTK

In [153]:
import nltk

In [154]:
#punkt:divise un texte en une liste de phrases
#wordnet:you can lemmatize sentences using the WordNetLemmatizer class
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [139]:
from nltk.tokenize import word_tokenize

In [262]:
trainnig_data['tweet_tokenized'] = trainnig_data['tweet'].apply(nltk.word_tokenize)
trainnig_data['tweet_tokenized'].head()
#Autre affichage
t=pd.DataFrame(list(zip(trainnig_data['tweet_tokenized'],trainnig_data['label'])), columns=['tweet', 'label'])
t.head()

Unnamed: 0,tweet,label
0,"[covid, update, new, confirmed, cases, covid, ...",1
1,"[since, onset, covid, nigeria, weve, trained, ...",1
2,"[many, deaths, people, years, age, occur, low,...",1
3,"[working, country, offices, support, response,...",1
4,"[new, cases, covid, nigeria, lagos, rivers, fc...",1


###### Back of words

In [263]:
from sklearn.feature_extraction.text import CountVectorizer

In [266]:
#
target=pd.DataFrame()
target['tweet'] = trainnig_data['tweet_tokenized'].apply(lambda x: " ".join(x))
target.tweet[1]

'covid update new confirmed cases covid one case linked cluster auckland household contact previously reported case two imported cases total number active cases imported cases miq'

In [267]:
#affecter une frequence
coun_vect = CountVectorizer()
count_matrix = coun_vect.fit_transform(target.tweet)
count_array = count_matrix.toarray()
df = pd.DataFrame(data=count_array,columns = coun_vect.get_feature_names())
df.to_csv("matrice.",index=True,header=True)
df.head()

Unnamed: 0,aa,aaj,aamir,aampe,aampes,aaradhya,ababa,abacha,abakaliki,abandoned,...,zone,zonecityspecific,zones,zoo,zoodirector,zookeepers,zoology,zoom,zooming,zydus
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [268]:
#
df['covid']

0       2
1       2
2       1
3       1
4       1
       ..
5131    0
5132    0
5133    1
5134    0
5135    2
Name: covid, Length: 5136, dtype: int64

### 2.3 Ponderation

### 2.4  Reduire le dimension de la matrice

#### Stemming avec nltk

Processus de réduction d'un mot à un ou plusieurs racines \ tries to reduce a word to its dictionary form


In [269]:
#Exemple de tweet_tokenized
text = trainnig_data['tweet_tokenized'][6]
print(text)

['covid', 'lowered', 'death', 'rate', 'chicago']


In [270]:
from nltk.stem import PorterStemmer

In [272]:
#Stemming function
ps = PorterStemmer()

def stemming(token):
    l=[]
    for e in token:
        l.append(ps.stem(e))
    return l


In [276]:
trainnig_data['tweet_tokenized'] = trainnig_data['tweet_tokenized'].apply(lambda t: stemming(t))
trainnig_data['tweet_tokenized'].head()
#Autre affichage
t=pd.DataFrame(list(zip(trainnig_data['tweet_tokenized'],trainnig_data['label'])), columns=['tweet', 'label'])
t.head()

Unnamed: 0,tweet,label
0,"[covid, updat, new, confirm, case, covid, one,...",1
1,"[sinc, onset, covid, nigeria, weve, train, hea...",1
2,"[mani, death, peopl, year, age, occur, low, mi...",1
3,"[work, countri, offic, support, respon, covid,...",1
4,"[new, case, covid, nigeria, lago, river, fct, ...",1


### Lemmatization avec NLTK

Processus consistant à regrouper les différentes formes infléchies d'un mot afin qu'elles puissent être analysées comme un seul élément

In [277]:
from nltk.stem import WordNetLemmatizer

In [279]:
#Lemmatization
lm = WordNetLemmatizer()

def lemmatizing(token):
    l=[]
    for e in token:
        l.append(lm.lemmatize(e))
    return l


In [280]:
trainnig_data['tweet_tokenized'] = trainnig_data['tweet_tokenized'].apply(lambda t: lemmatizing(t))
trainnig_data['tweet_tokenized'].head()

1     [covid, updat, new, confirm, case, covid, one,...
2     [sinc, onset, covid, nigeria, weve, train, hea...
7     [mani, death, peopl, year, age, occur, low, mi...
8     [work, countri, offic, support, respon, covid,...
13    [new, case, covid, nigeria, lago, river, fct, ...
Name: tweet_tokenized, dtype: object

In [282]:
#Apres le stemming et lemmatization
t=pd.DataFrame()
t['tweet'] = trainnig_data['tweet_tokenized'].apply(lambda x: " ".join(x))
c_vect = CountVectorizer()
count_ma = c_vect.fit_transform(t.tweet)
count_a = count_ma.toarray()
d = pd.DataFrame(data=count_a,columns = c_vect.get_feature_names())
d.to_csv("matrice.",index=True,header=True)
d.head()

Unnamed: 0,aa,aaj,aamir,aamp,aaradhya,ababa,abacha,abakaliki,abandon,abat,...,zomato,zombi,zone,zonecityspecif,zoo,zoodirector,zookeep,zoolog,zoom,zydu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### TF-IDF

tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.[1]:8 It is often used as a weighting factor in information retrieval and text mining. The tf-idf value increases proportionally to the number of times a word appears in the document, but is offset by the frequency of the word in the corpus, which helps to adjust for the fact that some words appear more frequently in general.

Variations of the tf–idf weighting scheme are often used by search engines as a central tool in scoring and ranking a document's relevance given a user query. tf–idf can be successfully used for stop-words filtering in various subject fields including text summarization and classification.

Let us start with the "Term Frequency" - TF

In [285]:
tokens_list = trainnig_data['tweet_tokenized'].tolist()

In [291]:
tokens_list

[['covid',
  'updat',
  'new',
  'confirm',
  'case',
  'covid',
  'one',
  'case',
  'link',
  'cluster',
  'auckland',
  'household',
  'contact',
  'previou',
  'report',
  'case',
  'two',
  'import',
  'case',
  'total',
  'number',
  'activ',
  'case',
  'import',
  'case',
  'miq'],
 ['sinc',
  'onset',
  'covid',
  'nigeria',
  'weve',
  'train',
  'health',
  'worker',
  'infect',
  'prevent',
  'amp',
  'control',
  'ipc',
  'weve',
  'develop',
  'onlin',
  'cour',
  'ipc',
  'reduc',
  'risk',
  'covid',
  'transmiss',
  'health',
  'care',
  'set',
  'take',
  'respon'],
 ['mani',
  'death',
  'peopl',
  'year',
  'age',
  'occur',
  'low',
  'middleincom',
  'countri',
  'beat',
  'nc',
  'd',
  'covid'],
 ['work',
  'countri',
  'offic',
  'support',
  'respon',
  'covid',
  'support',
  'continu',
  'essenti',
  'health',
  'servic',
  'engag',
  'commun',
  'ensur',
  'demand',
  'servic',
  'maintain'],
 ['new',
  'case',
  'covid',
  'nigeria',
  'lago',
  'river',
 

In [300]:
# Let us get the frequency count

from nltk.corpus import stopwords
stop = stopwords.words('english')
stop.extend(('.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}','/','-'))
frequency_words = {}
for data in tokens_list:
    data= str(data)
    data = data.replace("[" , "")
    data = data.replace("]","")
    data = data.replace("'","")
    data_list = data.split(',')
    print(data_list)
    for token in data_list:
        token= str(token)
        token = token.rstrip()
        token = token.lstrip()
        if token not in stop:
            if token in frequency_words:
                count = frequency_words[token]
                count = count + 1
                frequency_words[token] = count
            else:
                frequency_words[token] = 1

['covid', ' updat', ' new', ' confirm', ' case', ' covid', ' one', ' case', ' link', ' cluster', ' auckland', ' household', ' contact', ' previou', ' report', ' case', ' two', ' import', ' case', ' total', ' number', ' activ', ' case', ' import', ' case', ' miq']
['sinc', ' onset', ' covid', ' nigeria', ' weve', ' train', ' health', ' worker', ' infect', ' prevent', ' amp', ' control', ' ipc', ' weve', ' develop', ' onlin', ' cour', ' ipc', ' reduc', ' risk', ' covid', ' transmiss', ' health', ' care', ' set', ' take', ' respon']
['mani', ' death', ' peopl', ' year', ' age', ' occur', ' low', ' middleincom', ' countri', ' beat', ' nc', ' d', ' covid']
['work', ' countri', ' offic', ' support', ' respon', ' covid', ' support', ' continu', ' essenti', ' health', ' servic', ' engag', ' commun', ' ensur', ' demand', ' servic', ' maintain']
['new', ' case', ' covid', ' nigeria', ' lago', ' river', ' fct', ' edo', ' enugu', ' delta', ' ebonyi', ' oyo', ' kaduna', ' ogun', ' ondo', ' imo', ' 

['new', ' case', ' record', ' today', ' total', ' activ', ' covid', ' case', ' gautam', ' buddh', ' nagar', ' uttar', ' pradesh', ' covid', ' noida', ' covid', ' covid', ' covid', ' updat', ' coronaviru', ' india', ' coronaviru']
['break', ' peopl', ' may', ' coronaviru', ' requir', ' selfisol', ' law', ' refu', ' face', ' fine', ' england', ' read', ' here']
['latest', ' updat', ' directorgen', ' health', ' polic', ' commiss', ' new', ' case', ' covid', ' report', ' today', ' mean', ' new', ' zealand', ' combin', ' total', ' confirm', ' probabl', ' case', ' confirm', ' case', ' covid']
['india', ' fight', ' corona', ' regularli', ' engag', ' state', ' ut', ' exhibit', ' either', ' sudden', ' surg', ' covid', ' case', ' high', ' caseload', ' report', ' higher', ' mortal', ' district', ' stay', ' safe', ' india', ' win', ' detail']
['india', ' fight', ' corona', ' corona', ' viru', ' updat', ' lakh', ' averag', ' test', ' everi', ' day', ' india', ' scale', ' covid', ' test', ' test', '

['number', ' patient', ' current', ' hospit', ' covid', ' fallen', ' peak', ' juli']
['probabl', ' still', ' see', ' weekend', ' lag', ' effect', ' last', ' sever', ' week', ' tuesday', ' number', ' tell', ' u', ' lot', ' great', ' day', ' total', ' test', ' posit', ' rate', ' stay', ' lower']
['prof', ' peter', ' horbi', ' talk', ' kay', ' burley', ' new', ' treatment', ' triall', ' hospitali', ' covid', ' patient', ' includ', ' veri', ' promi', ' monoclon', ' antibodi', ' treatment', ' involv', ' give', ' patient', ' laboratorymad', ' antibodi', ' kay', ' burley', ' latest']
['there', ' great', ' support', ' church', ' faith', ' base', ' organi', ' alert', ' level', ' understand', ' import', ' practi', ' faith', ' limit', ' gather', ' current', ' appli', ' public', ' privat', ' gather', ' review', ' may']
['black', ' man', ' my', ' greatest', ' risk', ' death', ' longer', ' covid', ' color', ' skin', ' said']
['anoth', ' reason', ' casual', ' attitud', ' peopl', ' toward', ' covid', 

['week', ' well', ' reach', ' million', ' regist', ' case', ' covid', ' thousand', ' death', ' behind', ' statist', ' great', ' deal', ' pain', ' suffer', ' everi', ' life', ' lost', ' matter']
['two', ' case', ' consid', ' recov', ' covid', ' bring', ' total', ' number', ' activ', ' case']
['state', ' report', ' almost', ' k', ' new', ' case', ' day', ' averag', ' risen', ' day', ' row', ' region', ' dynam', ' drive', ' trend']
['group', ' pupil', ' sent', ' home', ' posit', ' test', ' rather', ' someon', ' develop', ' symptom', ' pm', ' say']
['august', ' nation', ' forecast', ' suggest', ' new', ' death', ' report', ' week', ' end', ' septemb', ' data', ' predict', ' total', ' report', ' covid', ' death', ' u', ' septemb', ' learn', ' more']
['thursday', ' mark', ' six', ' month', ' sinc', ' declar', ' covid', ' public', ' health', ' emerg', ' intern', ' concern']
['three', ' stori', ' morn', ' pm', ' consid', ' new', ' coronaviru', ' measur', ' london', ' mayor', ' insist', ' wait'

['covid', ' coronaviru', ' coronavirusupd', ' think', ' ever', ' shake', ' hand', ' anyon']
['got', ' hydroxychloroquin', ' chloroquin', ' discus', ' someon', ' today', ' feel', ' like', ' work', ' peopl', ' recov', ' faster', ' dissent', ' told', ' believ', ' everyth', ' read', ' safe', ' didnt', ' kill', ' trump', ' remind', ' trump', ' never', ' covid']
['a', ' vaccin', ' coronaviru', ' develop']
['multipl', ' studi', ' said', ' decad', ' mask', ' work', ' viru']
['feder', ' deputi', ' equiv', ' feder', ' repr', ' brazil', ' osmar', ' terra', ' claim', ' itali', ' lockdown', ' polici', ' made', ' covid', ' case', ' shot', ' up']
['good', ' true', ' somendfindout', ' u', ' n', ' mni', ' cointrir', ' wth', ' strongest', ' intel', ' agenc', ' cudnt', ' detect', ' fact', ' who', ' cheat', ' worldi', ' corona', ' viru', ' bacteri', ' infect', ' patient', ' cure', ' one', ' day', ' see', ' video', ' share', ' fact', ' b', ' verifi']
['recent', ' manufactur', ' gain', ' wipe', ' covid', ' 

['man', ' beaten', ' deoria', ' india', ' lockdown', ' time']
['u', ' presid', ' donald', ' trump', ' said', ' white', ' hou', ' brief', ' chloroquin', ' hydroxychloroquin', ' often', ' use', ' treat', ' malaria', ' approv', ' food', ' drug', ' administr', ' fda', ' treatment', ' covid']
['covid', ' coronaviru', ' coronavirusupd', ' case', ' total', ' test', ' india', ' apr', ' reason', ' good', ' news', ' tb', ' vaccin', ' bcg', ' effect', ' time', ' travel', ' restrict', ' lockdown', ' let', ' build', ' gain', ' stay', ' home']
['say', ' covid', ' remain', ' air', ' eight', ' hour', ' everyon', ' requir', ' wear', ' mask', ' everywh']
['covid', ' mean', ' see', ' sheep', ' surrend', ' latin']
['video', ' claim', ' itali', ' discov', ' covid', ' bacteria', ' viru']
['chloroquin', ' describ', ' wonder', ' drug', ' corona', ' viru', ' yet', ' coordin', ' campaign', ' it', ' would', ' big', ' pharma', ' cheap', ' effect', ' drug', ' big', ' pharma', ' puppet']
['said', ' game', ' plan', 

['dr', ' uma', ' kumar', ' aiim', ' gave', ' interview', ' coronaviru']
['covid', ' cure', ' hot', ' water', ' bake', ' soda']
['mitch', ' mcconnel', ' tri', ' push', ' candid', ' through', ' may', ' relea', ' bioengin', ' corona', ' viru', ' fromi', ' dunnogyna']
['news', ' govern', ' coronaviru', ' strategi', ' enter', ' hope', ' sort', ' phase']
['biggest', ' covid', ' expert', ' nanavati', ' hospit', ' dr', ' ansari', ' dr', ' limay', ' inform', ' follow', ' four', ' treatment', ' commonli', ' given', ' patient']
['fal', ' claim', ' report', ' never', ' ask', ' coronaviru', ' may', ' press', ' confer', ' transcript', ' show', ' report', ' ask', ' mani', ' pandemicrel', ' question', ' unemploy', ' reopen', ' plan', ' test', ' outbreak']
['hairstylist', ' iowa', ' expo', ' client', ' coronaviru', ' coronaviru', ' hairstylist', ' placebopil']
['horrif', ' handl', ' respect', ' bier', ' citizen', ' ie', ' realiti', ' hw', ' corona', ' downgrad', ' digniti', ' shud', ' b', ' appli', ' b

In [301]:
frequency_words['tweet']

29

In [302]:
df_tfidf = pd.DataFrame(data=list(frequency_words.items()),columns=['word','tf'])


In [303]:
df_tfidf.head()

Unnamed: 0,word,tf
0,covid,3302
1,updat,516
2,new,882
3,confirm,429
4,case,1645


In [304]:
df_tfidf.sort_values(ascending=False, by = "tf", inplace=True)

In [305]:
df_tfidf.head()

Unnamed: 0,word,tf
0,covid,3302
4,case,1645
289,coronaviru,1461
87,test,1310
2,new,882


In [315]:
#Let us get in how many documents (each tweet) does the word occur
def get_documents_count(row):
    document_counter = 0
    word = row['word']
    for document in tokens_list :
        document= str(document)
        document = document.replace("'",'')
        document = document.replace("[",'')
        document = document.replace("]",'')
        document = document.split(',')
        document = map(str.strip,document)
        if word in document:
            document_counter = document_counter + 1
    return document_counter

In [316]:
df_tfidf['document_count'] = df_tfidf.apply(get_documents_count,axis=1)

In [317]:
df_tfidf.head()

Unnamed: 0,word,tf,document_count
0,covid,3302,2545
4,case,1645,1013
289,coronaviru,1461,1232
87,test,1310,837
2,new,882,737


In [318]:
df_tfidf.tail()

Unnamed: 0,word,tf,document_count
2440,zydu,1,1
2437,marchapril,1,1
2436,westcoast,1,1
5091,abc,1,1
3832,accordingli,1,1


In [319]:
# we already have the count of all the documents
total_docs = df.shape[0]

In [320]:
total_docs

5136

###### Let us compute the tf-idf

Term Frequency = tf

Inverse Document Frequency = idf

idf = log(total_docs/number of documents that contain the word)

tf-idf = tf . idf

In [322]:
import math

In [323]:
def compute_tfidf(row):
    idf = math.log10(total_docs/row['document_count'])
    return row['tf'] * idf

In [324]:
df_tfidf['tfidf'] = df_tfidf.apply(compute_tfidf,axis=1)

In [325]:
df_tfidf.head()

Unnamed: 0,word,tf,document_count,tfidf
0,covid,3302,2545,1006.902728
4,case,1645,1013,1159.750612
289,coronaviru,1461,1232,905.840903
87,test,1310,837,1032.14842
2,new,882,737,743.664939


In [326]:
df_tfidf.tail()

Unnamed: 0,word,tf,document_count,tfidf
2440,zydu,1,1,3.710625
2437,marchapril,1,1,3.710625
2436,westcoast,1,1,3.710625
5091,abc,1,1,3.710625
3832,accordingli,1,1,3.710625


In [327]:
df_tfidf.sort_values(by='tfidf',ascending=True,inplace=True)

In [328]:
df_tfidf.head()

Unnamed: 0,word,tf,document_count,tfidf
3832,accordingli,1,1,3.710625
7524,ree,1,1,3.710625
7523,selfm,1,1,3.710625
7522,kem,1,1,3.710625
7521,penali,1,1,3.710625


In [329]:
df_tfidf.replace(to_replace=0.0,value=0.1,inplace=True)

In [330]:
df_tfidf.tail()

Unnamed: 0,word,tf,document_count,tfidf
120,india,856,525,847.838649
289,coronaviru,1461,1232,905.840903
0,covid,3302,2545,1006.902728
87,test,1310,837,1032.14842
4,case,1645,1013,1159.750612


In [331]:
df_tfidf.set_index('word', inplace=True)

In [332]:
df_tfidf.head()

Unnamed: 0_level_0,tf,document_count,tfidf
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accordingli,1,1,3.710625
ree,1,1,3.710625
selfm,1,1,3.710625
kem,1,1,3.710625
penali,1,1,3.710625


In [333]:
word_tfidf = df_tfidf['tfidf'].to_dict()

In [335]:
word_tfidf

{'accordingli': 3.710625015060797,
 'ree': 3.710625015060797,
 'selfm': 3.710625015060797,
 'kem': 3.710625015060797,
 'penali': 3.710625015060797,
 'watsapp': 3.710625015060797,
 'phonesert': 3.710625015060797,
 'sensor': 3.710625015060797,
 'uschin': 3.710625015060797,
 'avigan': 3.710625015060797,
 'testament': 3.710625015060797,
 'throughw': 3.710625015060797,
 'witherspoon': 3.710625015060797,
 'dua': 3.710625015060797,
 'shetti': 3.710625015060797,
 'teargas': 3.710625015060797,
 'abysm': 3.710625015060797,
 'kalyan': 3.710625015060797,
 'lake': 3.710625015060797,
 'ghaziabad': 3.710625015060797,
 'dallascowboy': 3.710625015060797,
 'cowboy': 3.710625015060797,
 'atampt': 3.710625015060797,
 'lid': 3.710625015060797,
 'bay': 3.710625015060797,
 'masjid': 3.710625015060797,
 'trchnolog': 3.710625015060797,
 'mila': 3.710625015060797,
 'forti': 3.710625015060797,
 'hkon': 3.710625015060797,
 'chuki': 3.710625015060797,
 'av': 3.710625015060797,
 'ki': 3.710625015060797,
 'pucho': 3

# Réseau de neurones

# <br></br>

# Entraîner le réseau de neuronne

# <br></br>

# Tester le modèle

# <br></br>