In [1]:
import pandas as pd
import numpy as np
import plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
import warnings

py.offline.init_notebook_mode(connected = True)
warnings.filterwarnings('ignore')

In [78]:
df = pd.read_excel("data.xlsx")

In [79]:
df.head()

Unnamed: 0,class,preprocessed-text,source,text,title
0,-1,,algerie360,Yazid AlilatIl a évoqué à la radio nationale é...,Médicaments: Des anticancéreux bientôt produit...
1,-1,,AlgeriePart,"VEON, a publié aujourd’hui les chiffres de l’e...",La descente aux enfers de l’ex-leader de la té...
2,-1,,AlgeriePart,"VEON, a publié aujourd’hui les chiffres de l’e...",
3,-1,,AlgeriePart,Une vidéo inédite obtenue par Algériepart retr...,Vidéo inédite. La colère des employés de djezz...
4,-1,,AlgeriePart,Une grosse pagaille a secoué lundi l’aéroport ...,


In [80]:
df = df.replace('elwatan','Elwatan')
df_agg = df.groupby('source').count()[['class']]

In [81]:
df_agg.columns = ['count']
df_agg['sources'] = ['Algerie Part', 'El Watan', 'Algerie360']
df_agg['percentage'] = (df_agg['count']*100)/df_agg['count'].sum()
df_agg['percentage'] = df_agg['percentage'].round(2)

In [91]:
layout = go.Layout(
    title = "Ditribution des Sources",
    yaxis = go.layout.YAxis(
        tickfont = dict(
            size = 18
        ),
        automargin = True
    )
)

data = [go.Bar(
            x = df_agg['percentage'],
            y = df_agg['sources'],
            text = df_agg['percentage'],
            textposition = 'auto',
            orientation = 'h'
)]

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig, filename='horizontal-bar')

In [129]:
layout = go.Layout(
    title = "Ditribution des Sources",
    xaxis = go.layout.XAxis(
        tickfont = dict(
            size = 18
        ),
        automargin = True
    )
)

data = [go.Bar(
            x = df_agg['sources'],
            y = df_agg['percentage'],
            text = df_agg['percentage'],
            textposition = 'auto',
            orientation = 'v'
)]

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig, filename='horizontal-bar')

In [136]:
layout = go.Layout(
    title = "Ditribution des Articles par Classe",
    xaxis = go.layout.XAxis(
        tickfont = dict(
            size = 18
        ),
        automargin = True
    )
)
X = ['Positive', 'Négative', 'Neutre']
v1 = len(df[df['class']==1])*100/len(df) 
v2 = len(df[df['class']==-1])*100/len(df) 
v3 = len(df[df['class']==0])*100/len(df)  
data = [go.Bar(
            x = X,
            y = [v1,v2,v3],
            text = [round(v1,2),round(v2,2),round(v3,2)],#df_agg['percentage'],
            textposition = 'auto',
            orientation = 'v'
)]

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig, filename='horizontal-bar')

In [128]:
layout = go.Layout(
    title='<b>Ditribution des Sources</b>',
    yaxis=dict(
        title='<i>(Pourcentage)</i>'
    ),
    barmode='stack'
)
sources = ['Algerie360', 'El Watan', 'Algerie Part']
p1 = len(df[(df['class'] == 1) & (df['source']=='algerie360')])
p2 = len(df[(df['class'] == 1) & (df['source']=='AlgeriePart')])
p3 = len(df[(df['class'] == 1) & (df['source']=='Elwatan')])

n1 = len(df[(df['class'] == -1) & (df['source']=='algerie360')])
n2 = len(df[(df['class'] == -1) & (df['source']=='AlgeriePart')])
n3 = len(df[(df['class'] == -1) & (df['source']=='Elwatan')])

nn1 = len(df[(df['class'] == 0) & (df['source']=='algerie360')])
nn2 = len(df[(df['class'] == 0) & (df['source']=='AlgeriePart')])
nn3 = len(df[(df['class'] == 0) & (df['source']=='Elwatan')])

trace1 = go.Bar(
    x=sources,
    y=[p1, p2, p3],
    name='positive',
    marker=dict(
        color = 'green'
    ),
    text = [n1, n2, n3],
    textposition = 'auto'
    
)
trace2 = go.Bar(
    x=sources,
    y=[n1, n2, n3],
    name='negative',
     marker=dict(
        color = 'red'
    ),
    text = [20, 15, 30],
    textposition = 'auto'
)
trace3 = go.Bar(
    x=sources,
    y=[nn1, nn2, nn3],
    name='neutre',
     marker=dict(
        color = 'orange'
    ),
    text = [45, 15, 30],
    textposition = 'auto'
)


data = [trace1, trace2, trace3]
layout = layout

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig, filename='stacked-bar')

In [159]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

In [164]:
!cd word_cloud 
!pip install --upgrade pip

[33mCache entry deserialization failed, entry ignored[0m
Collecting pip
  Downloading https://files.pythonhosted.org/packages/5c/e0/be401c003291b56efc55aeba6a80ab790d3d4cece2778288d65323009420/pip-19.1.1-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 116kB/s ta 0:00:011
[?25hInstalling collected packages: pip
  Found existing installation: pip 9.0.1
    Uninstalling pip-9.0.1:
      Successfully uninstalled pip-9.0.1
Successfully installed pip-19.1.1


In [161]:
_text = " ".join(review for review in df.text)
print ("There are {} words in the combination of all articles.".format(len(_text)))

There are 15137688 words in the combination of all articles.


In [None]:
stopwords = set(STOPWORDS)
stopwords.update(["avec", "leur","sur", "le", "dans", "la", "pour", "que", "et", "de", "ce", "en", "cette", 
                  "les", "plus", "ou", "par", "qui", "sont", "dont", "ainsi", "est", "se", "ces"])

wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(_text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()