# Projet IA
## Vérification de la véracité des informations concernant COVID19. 

### Importation des dépendances 

In [1]:
import numpy as np
import pandas as pd
# Visualisation
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
# Natural Language Toolkit
from nltk.stem import WordNetLemmatizer



In [2]:
# importation du dataset
df = pd.read_excel("Data-FakeRealCOVID.xlsx", header=0)

## Exploration du Dataset

In [3]:
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [4]:
df.columns

Index(['id', 'tweet', 'label'], dtype='object')

In [5]:
print("lenght : ",len(df))
print("size :",df.size)
print("shape : ",df.shape)

lenght :  6420
size : 19260
shape :  (6420, 3)


In [6]:
df.info(verbose  = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6420 entries, 0 to 6419
Columns: 3 entries, id to label
dtypes: int64(1), object(2)
memory usage: 150.6+ KB


In [7]:
df.dtypes

id        int64
tweet    object
label    object
dtype: object

In [8]:
#Vérification des données manquantes
df.columns[df.isnull().any()]

Index([], dtype='object')

On a pas de données manquantes

In [9]:
# voir les valeurs possible de la collone label
df['label'].unique()

array(['real', 'fake'], dtype=object)

In [10]:
df['label'].nunique()


2

In [11]:
output_notebook()

In [12]:
# Visualisation 

label = df.groupby('label').count()
idx = label.index.tolist()
values= label['tweet'].tolist()

In [13]:
p = figure(x_range=idx, title="Distribution of data",
           toolbar_location=None, tools="")

p.vbar(x=idx, top=values, width=0.9)

p.xgrid.grid_line_color = None


show(p)

## Prétraitement des données

In [14]:
data=df[['tweet','label']]


In [15]:
data['label'] = data['label'].replace('real',1)
data['label'] = data['label'].replace('fake',0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = data['label'].replace('real',1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = data['label'].replace('fake',0)


In [16]:
data['label'].unique()

array([1, 0], dtype=int64)

In [17]:
data_real = data[data['label'] == 1]
data_fake = data[data['label'] == 0]
dataset = pd.concat([data_real, data_fake])

In [18]:
# Fonctions pour nettoyage des données
import re

## supprimer les emojis 
def deEmojify(text):
    return text.encode("ascii", "ignore").decode()
## séparer les hashtags en des mots 
def clean_hash(text):
    s = ""
    for word in str(text).split():
        if word.startswith("#"):
            word=  " ".join([a for a in re.split('([A-Z][a-z]+)', word) if a])
        s+= word+' '
    return s
## supprimer les mentions 
def remove_mentions(text):
    return re.sub("@[A-Za-z0-9_]+","", text)
## supprimer les urls 
def clean_url(text):
    return re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", text)
## supprimer la ponctuation 
import string
punctuations = string.punctuation
def clean_punctuation(text):
    trs = str.maketrans('', '', punctuations)
    return text.translate(trs)
## supprimer les nombres 
def clean_numbers(text):
    return re.sub('[0-9]+', '', text)

In [19]:
## supprimer les stop words 
# géneration de la liste des "mots vide"(stopwords) avec nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords= stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\botai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
print(*stopwords)

i me my myself we our ours ourselves you you're you've you'll you'd your yours yourself yourselves he him his himself she she's her hers herself it it's its itself they them their theirs themselves what which who whom this that that'll these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don don't should should've now d ll m o re ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't won won't wouldn wouldn't


In [21]:
STOPWORDS= set(stopwords)

In [22]:
## supprimer les stopwords 
def clean_stopword(text):
    s = ""
    for word in str(text).split():
        if word not in STOPWORDS:
             s+=word+" "
    return s

In [23]:
dataset['tweet'] = dataset['tweet'].apply(lambda text: deEmojify(text))
dataset['tweet'] = dataset['tweet'].apply(lambda text: clean_hash(text))
dataset['tweet'] = dataset['tweet'].apply(lambda text: remove_mentions(text))
dataset['tweet'] = dataset['tweet'].apply(lambda text: clean_url(text))
dataset['tweet'] = dataset['tweet'].str.lower()
dataset['tweet'] = dataset['tweet'].apply(lambda text: clean_stopword(text))
dataset['tweet'] = dataset['tweet'].apply(lambda text: clean_punctuation(text))
dataset['tweet'] = dataset['tweet'].apply(lambda text: clean_numbers(text))
dataset


Unnamed: 0,tweet,label
0,cdc currently reports deaths general discrepa...,1
1,states reported deaths small rise last tuesda...,1
3,india fights corona covid testing laborator...,1
4,populous states generate large case counts loo...,1
5,covid act found on average person illinois cov...,1
...,...,...
6413,states like new york illinois california vocal...,0
6415,tiger tested positive covid please stay away p...,0
6416,autopsies prove covid is blood clot pneumonia ...,0
6417,a post claims covid vaccine already developed ...,0


### Tokenisation des tweets avec NLTK

In [44]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\botai\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\botai\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [56]:
from nltk.tokenize import word_tokenize
dataset['tweet_tokenized'] = dataset['tweet'].apply(nltk.word_tokenize)
dataset['tweet_tokenized'].head()

0    [cdc, currently, reports, deaths, general, dis...
1    [states, reported, deaths, small, rise, last, ...
3    [india, fights, corona, covid, testing, labora...
4    [populous, states, generate, large, case, coun...
5    [covid, act, found, on, average, person, illin...
Name: tweet_tokenized, dtype: object

## Stemming avec nltk
 processus de réduction d'un mot à un ou plusieurs racines.

In [57]:
text = dataset['tweet_tokenized'][6]
print(text)


['tested', 'positive', 'covid', 'symptoms', 'stay', 'home', 'away', 'people', 'learn', 'cdcs', 'recommendations', 'around', 'others', 'covid', 'infection']


In [58]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def stemming(token):
    l=[]
    for e in token:
        l.append(ps.stem(e))
    return l
dataset['tweet_tokenized']= dataset['tweet_tokenized'].apply(lambda t: stemming(t))
dataset['tweet_tokenized'].head()

0    [cdc, current, report, death, gener, discrep, ...
1    [state, report, death, small, rise, last, tues...
3    [india, fight, corona, covid, test, laboratori...
4    [popul, state, gener, larg, case, count, look,...
5    [covid, act, found, on, averag, person, illino...
Name: tweet_tokenized, dtype: object

## Lemmatization avec NLTK
processus consistant à regrouper les différentes formes infléchies d'un mot afin qu'elles puissent être analysées comme un seul élément

In [59]:
from nltk.stem import WordNetLemmatizer
lm = WordNetLemmatizer()

def lemmatizing(token):
    l=[]
    for e in token:
        l.append(lm.lemmatize(e))
    return l
dataset['tweet_tokenized'] = dataset['tweet_tokenized'].apply(lambda t: lemmatizing(t))
dataset['tweet_tokenized'].head()

0    [cdc, current, report, death, gener, discrep, ...
1    [state, report, death, small, rise, last, tues...
3    [india, fight, corona, covid, test, laboratori...
4    [popul, state, gener, larg, case, count, look,...
5    [covid, act, found, on, averag, person, illino...
Name: tweet_tokenized, dtype: object

## Méthode Bag of words:

In [84]:
from sklearn.feature_extraction.text import CountVectorizer
target=pd.DataFrame()
target['tweet'] = dataset['tweet_tokenized'].apply(lambda x: " ".join(x))
target.tweet[1]

'state report death small rise last tuesday southern state report death'

In [102]:
coun_vect = CountVectorizer(max_features=13)

In [94]:
count_matrix = coun_vect.fit_transform(target.tweet)
count_array = count_matrix.toarray()
df = pd.DataFrame(data=count_array,columns = coun_vect.get_feature_names())
df.to_csv("matrice.",index=True,header=True)
df.head()

Unnamed: 0,aa,aaj,aamir,aamp,aaradhya,ababa,abacha,abakaliki,abandon,abat,...,zombi,zone,zonecityspecif,zoo,zoodirector,zookeep,zoolog,zoom,zydu,zyphr
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [93]:
df['covid']

0       0
1       0
2       1
3       0
4       2
       ..
6415    1
6416    1
6417    1
6418    1
6419    0
Name: covid, Length: 6420, dtype: int64

In [103]:
count_matrix = coun_vect.fit_transform(target.tweet)
count_array = count_matrix.toarray()
df = pd.DataFrame(data=count_array,columns = coun_vect.get_feature_names())
df.head()

Unnamed: 0,case,corona,coronaviru,covid,death,india,new,number,peopl,report,state,test,updat
0,0,0,0,0,3,0,0,0,1,1,0,0,0
1,0,0,0,0,2,0,0,0,0,2,2,0,0
2,0,1,0,1,0,3,0,0,0,0,0,2,0
3,3,0,0,0,0,0,1,0,0,0,2,0,0
4,0,0,0,2,0,0,0,0,1,0,0,0,0
