In [None]:
!pip install plotly
!pip install dash
!pip install jupyter_dash
!pip install dash_bootstrap_components
!pip install dash_bootstrap_templates

In [None]:
import csv
import re
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.util import ngrams

# Download the NLTK resources listed below, in this order.
for resource_name in (
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'stopwords',
    'punkt',
    'vader_lexicon',
):
    nltk.download(resource_name)

# English stop-word list kept at module level.
sw = stopwords.words('english')

In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache so the stop-word corpus is read only once, no matter how
# many documents are cleaned.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def process_text(text):
    """Clean one document for n-gram counting.

    Steps, in order: delete ASCII punctuation, tokenize with ``word_tokenize``,
    lowercase every token, drop English stop words, and join the remaining
    tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The lowercase, stop-word-free tokens joined by single spaces.
    """
    # NOTE(review): punctuation is removed before stop-word filtering, so a
    # contraction such as "don't" becomes "dont" and may no longer match the
    # stop-word list - confirm whether that is intended.
    stop_words = _english_stop_words()
    without_punctuation = text.translate(_PUNCTUATION_TABLE)

    # Lowercase each token once, then filter on the lowercase form.
    lowered_tokens = (token.lower() for token in word_tokenize(without_punctuation))
    return ' '.join(token for token in lowered_tokens if token not in stop_words)

In [None]:
def gram_count(file_name, n_gram):
    """Count n-gram frequencies over the ``text`` column of a CSV file.

    Parameters
    ----------
    file_name : str or path-like
        CSV file, read with ISO-8859-1 encoding; it must have a ``text`` column.
    n_gram : int
        Size of the n-grams to count (1 = unigrams, 2 = bigrams, ...).

    Returns
    -------
    collections.Counter
        Maps each n-gram (a tuple of ``n_gram`` tokens) to its frequency.
    """
    df = pd.read_csv(file_name, encoding='ISO-8859-1')

    # Missing cells are read as NaN, which str() would turn into the literal
    # token "nan" and inflate its count; drop them before cleaning. Empty
    # strings are skipped as before.
    documents = df['text'].dropna()

    # All cleaned documents are concatenated into one string, so n-grams can
    # span the boundary between two adjacent documents.
    processed_text = ' '.join(
        process_text(str(text)) for text in documents if text != ''
    )

    # Tokenize the combined text and filter stop words once more.
    stop_words = set(stopwords.words('english'))
    words = [
        word
        for word in word_tokenize(processed_text)
        if word.lower() not in stop_words
    ]

    return Counter(ngrams(words, n_gram))

In [None]:
# Placeholder: set this to the folder that holds the CSV files. It is prepended
# to each file name by plain string concatenation, so it needs a trailing
# path separator.
path = "--filepath--"

In [None]:
# 1-, 2- and 3-gram counters for DataSet_Misinfo_FAKE.csv.
fake_1gram, fake_2gram, fake_3gram = [
    gram_count(path + "DataSet_Misinfo_FAKE.csv", size) for size in (1, 2, 3)
]

In [None]:
# 1-, 2- and 3-gram counters for DataSet_Misinfo_TRUE.csv.
true_1gram, true_2gram, true_3gram = [
    gram_count(path + "DataSet_Misinfo_TRUE.csv", size) for size in (1, 2, 3)
]

In [None]:
# 1-, 2- and 3-gram counters for EXTRA_RussianPropagandaSubset.csv.
propa_1gram, propa_2gram, propa_3gram = [
    gram_count(path + "EXTRA_RussianPropagandaSubset.csv", size) for size in (1, 2, 3)
]

In [None]:
def _ngram_label(key):
    """Turn an n-gram key (e.g. a tuple of tokens) into a clean, space-separated label."""
    # str() of a tuple contains parentheses, quotes and commas; replacing every
    # non-alphanumeric character with a space removes them, together with any
    # symbols inside the tokens themselves.
    label = re.sub(r'[^A-Za-z0-9]', ' ', str(key))
    # Collapse the resulting runs of spaces and trim both ends so labels do not
    # carry the padding left behind by the tuple punctuation.
    return re.sub(r'\s+', ' ', label).strip()


def sort_n_gram(n_gram):
    """Convert an n-gram counter into a frequency table sorted by count.

    Parameters
    ----------
    n_gram : Mapping
        Maps each n-gram key (such as a tuple of tokens) to its frequency,
        e.g. a ``collections.Counter``.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` (cleaned label, ASCII letters and digits separated by
        single spaces) and ``count``, ordered by ``count`` descending. N-grams
        whose label is empty after cleaning are dropped.
    """
    df = pd.DataFrame.from_dict(n_gram, orient='index', columns=['Frequency'])

    # Most frequent first; the former index (the n-gram keys) becomes a column.
    df = df.sort_values(by=['Frequency'], ascending=False).reset_index()
    df.columns = ['word', 'count']

    df['word'] = df['word'].map(_ngram_label)

    # Drop n-grams made only of symbols, and return a fresh frame with a clean
    # 0..n-1 index so callers can add columns without a copy warning.
    return df[df['word'] != ''].reset_index(drop=True)

In [None]:
# Replace each fake-news counter with its sorted frequency table.
fake_1gram, fake_2gram, fake_3gram = [
    sort_n_gram(counts) for counts in (fake_1gram, fake_2gram, fake_3gram)
]

In [None]:
# Replace each true-news counter with its sorted frequency table.
true_1gram, true_2gram, true_3gram = [
    sort_n_gram(counts) for counts in (true_1gram, true_2gram, true_3gram)
]

In [None]:
# Replace each propaganda counter with its sorted frequency table.
propa_1gram, propa_2gram, propa_3gram = [
    sort_n_gram(counts) for counts in (propa_1gram, propa_2gram, propa_3gram)
]

In [None]:
# Lookup of the sorted tables: df[dataset][n] with dataset in
# {"fake", "true", "propa"} and n in {1, 2, 3}.
df = {
    "fake": {1: fake_1gram, 2: fake_2gram, 3: fake_3gram},
    "true": {1: true_1gram, 2: true_2gram, 3: true_3gram},
    "propa": {1: propa_1gram, 2: propa_2gram, 3: propa_3gram},
}

In [None]:
# 1-gram table of the "fake" dataset, used by the preview chart in the next cell.
test = df["fake"][1]

In [None]:
# Preview chart: the first 20 rows of `test` (the fake-news 1-gram table) as
# horizontal bars, largest count at the top. `px` is imported in the
# notebook's top import cell, so this cell runs on a fresh kernel.
px.bar(
    test[:20],
    y='word',
    x='count',
    title='Top 20 unigrams - fake news',
).update_layout(barmode='stack', yaxis={'categoryorder': 'total ascending'})

In [None]:
import matplotlib.pyplot as plt
from jupyter_dash import JupyterDash
from dash import dcc, html
from dash.dependencies import Output, Input
from dash.exceptions import PreventUpdate
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd
import os
import dash_bootstrap_components as dbc
from dash_bootstrap_templates import load_figure_template
from nltk.sentiment.vader import SentimentIntensityAnalyzer


# URL of the dbc.min.css stylesheet from the dash-bootstrap-templates repository;
# passed to the app below as an external stylesheet.
dbc_css = "https://cdn.jsdelivr.net/gh/AnnMarieW/dash-bootstrap-templates/dbc.min.css"
# Single SentimentIntensityAnalyzer (NLTK VADER) instance used by get_sentiment.
sia = SentimentIntensityAnalyzer()

def get_sentiment(ngram):
    """Return the 'compound' polarity score that the shared analyzer gives `ngram`."""
    scores = sia.polarity_scores(ngram)
    return scores['compound']

# Dash application for use inside the notebook, loaded with the LUMEN Bootstrap
# theme plus the dbc stylesheet defined above.
app = JupyterDash(__name__, external_stylesheets=[dbc.themes.LUMEN, dbc_css])

# Load the "LUMEN" figure template - presumably so Plotly figures match the
# Bootstrap theme; confirm against the dash_bootstrap_templates docs.
load_figure_template("LUMEN")

def _dataset_options():
    """Return the dataset radio-button options (a fresh list on every call)."""
    return [
        {"label": "True News", "value": "true"},
        {"label": "Fake News", "value": "fake"},
        {"label": "Russian Propaganda", "value": "propa"},
    ]


def _labelled_dropdown(label, dropdown_id, options, value):
    """Return a flex container holding a text label followed by a dropdown."""
    return html.Div(
        [
            html.P(label),
            dcc.Dropdown(
                id=dropdown_id,
                options=options,
                value=value,
                className="dbc",
            ),
        ],
        style={'display': 'flex'},
    )


def _control_row(radio_id, top_id, gram_id, top_options):
    """Build one selector row: dataset radio buttons, "Top" and "N-Gram" dropdowns.

    Both tabs use the same row; only the component ids and the choices offered
    by the "Top" dropdown differ.
    """
    return dbc.Row([
        dbc.Col(
            dbc.RadioItems(
                id=radio_id,
                value="fake",
                inline=True,
                options=_dataset_options(),
            ),
            width=6,
        ),
        dbc.Col(_labelled_dropdown("Top: ", top_id, top_options, 10), width=2),
        dbc.Col(_labelled_dropdown("N-Gram: ", gram_id, [1, 2, 3], 1)),
    ])


# Page layout: a header banner and two tabs. Component ids are the contract
# with the callbacks defined below and must not change.
app.layout = dbc.Container(
    [
        html.H2(),  # empty heading kept as a vertical spacer
        dbc.Alert(html.H2("Misinformation & Fake News text Analysis"), color="primary"),

        dbc.Tabs(
            [
                # Tab 1: most frequent n-grams of the selected dataset.
                dbc.Tab(
                    label="Top Words",
                    tab_id="gen",
                    children=[
                        dbc.Row([
                            _control_row(
                                "type_radio",
                                "top-dropdown",
                                "gram-dropdown",
                                [5, 10, 15, 20, 30, 40, 50],
                            ),
                            html.H1(),  # spacer
                            dbc.Col(dbc.Card(dcc.Graph(id="top_word"))),
                        ])
                    ],
                ),
                # Tab 2: sentiment of the n-grams (bar chart plus pie chart).
                dbc.Tab(
                    label="Sentiment Analysis",
                    tab_id="sent",
                    children=[
                        dcc.Markdown(id='title2'),
                        dbc.Row([
                            _control_row(
                                "type_radio2",
                                "top-dropdown2",
                                "gram-dropdown2",
                                [5, 10, 15],
                            ),
                            html.H1(),  # spacer
                            dbc.Col(
                                dbc.Card(dcc.Graph(id="sent_bar")),
                                width=7,
                            ),
                            dbc.Col(
                                dbc.Card(dcc.Graph(id="sent_pie")),
                                width=5,
                            ),
                        ])
                    ],
                ),
            ],
            id="tabs",
            active_tab="gen",
        ),
        html.Div(id="tab-content", className="p-4"),
    ]
)

@app.callback(
    Output("top_word", "figure"),
    [Input("type_radio", "value"), Input("top-dropdown", "value"), Input("gram-dropdown","value")],
)
def render_top_word(dataset,top_n,n_gram):
    """Draw the `top_n` most frequent n-grams of one dataset as horizontal bars.

    Parameters
    ----------
    dataset : str
        Key into the global `df` lookup ("true", "fake" or "propa").
    top_n : int
        Number of rows to show, taken from the top of the sorted table.
    n_gram : int
        N-gram size (1, 2 or 3), the second-level key of `df`.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    dash.exceptions.PreventUpdate
        When any input is None, so the current figure is left untouched.
    """
    # A cleared dropdown sends None: the table lookup would raise KeyError and a
    # None slice bound would plot every row. Keep the current figure instead.
    if dataset is None or top_n is None or n_gram is None:
        raise PreventUpdate

    top_df = df[dataset][n_gram][:top_n]

    fig = px.bar(
        top_df,
        y='word',
        x='count',
    ).update_layout(barmode='stack', yaxis={'categoryorder': 'total ascending'})
    return fig

@app.callback(
    Output("sent_bar", "figure"),
    [Input("type_radio2", "value"), Input("top-dropdown2", "value"), Input("gram-dropdown2","value")],
)
def render_sent_bar(dataset,top_n,n_gram):
    """Plot the most positive and most negative n-grams among the 500 most frequent.

    Parameters
    ----------
    dataset : str
        Key into the global `df` lookup ("true", "fake" or "propa").
    top_n : int
        Number of n-grams to show for each polarity.
    n_gram : int
        N-gram size (1, 2 or 3), the second-level key of `df`.

    Returns
    -------
    plotly.graph_objects.Figure
        Horizontal bars: green for positive scores, red for negative scores.

    Raises
    ------
    dash.exceptions.PreventUpdate
        When any input is None (e.g. a cleared dropdown).
    """
    if dataset is None or top_n is None or n_gram is None:
        raise PreventUpdate

    # Score only the 500 most frequent n-grams. The explicit copy lets the
    # sentiment column be added without a SettingWithCopyWarning and without
    # touching the shared table.
    df_filtered = df[dataset][n_gram][:500].copy()
    df_filtered['sentiment'] = df_filtered['word'].apply(get_sentiment)

    # Strongest scores on each side; exactly-zero scores appear in neither.
    df_top_positive = df_filtered[df_filtered['sentiment'] > 0].nlargest(top_n, 'sentiment')
    df_top_negative = df_filtered[df_filtered['sentiment'] < 0].nsmallest(top_n, 'sentiment')

    # Display names keyed by the radio-button values (the fake-news value is "fake").
    dataset_labels = {
        "true": "True News",
        "fake": "Fake News",
        "propa": "Russian Propaganda",
    }
    name = dataset_labels.get(dataset, dataset)

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=df_top_positive['word'],
        x=df_top_positive['sentiment'],
        name='Positive',
        orientation='h',
        marker=dict(
            color='green'
        )
    ))
    # Reversed so that the negative bars are added mildest first.
    fig.add_trace(go.Bar(
        y=df_top_negative[::-1]['word'],
        x=df_top_negative[::-1]['sentiment'],
        name='Negative',
        orientation='h',
        marker=dict(
            color='red'
        ),
    ))

    fig.update_layout(
        title=f'Strongest positive and negative {n_gram}-grams: {name}',
        xaxis_title='Sentiment Score',
        yaxis_title=f'{n_gram}-Gram',
        barmode='stack',
    )
    return fig

@app.callback(
    Output("sent_pie", "figure"),
    [Input("type_radio2", "value"), Input("gram-dropdown2","value")],
)
def render_sent_pie(dataset,n_gram):
    """Show how many distinct n-grams fall into each sentiment category.

    Parameters
    ----------
    dataset : str
        Key into the global `df` lookup ("true", "fake" or "propa").
    n_gram : int
        N-gram size (1, 2 or 3), the second-level key of `df`.

    Returns
    -------
    plotly.graph_objects.Figure
        Donut chart with one slice per category returned by `sent_catigorize`.

    Raises
    ------
    dash.exceptions.PreventUpdate
        When any input is None.
    """
    if dataset is None or n_gram is None:
        raise PreventUpdate

    # Work on derived Series only: the shared table in `df` is read, never
    # modified, so other callbacks keep seeing the original columns.
    ngram_table = df[dataset][n_gram]
    categories = ngram_table['word'].apply(get_sentiment).apply(sent_catigorize)

    # One row per category with the number of n-grams in it; sorted by
    # category name. Missing categories (None) are not counted.
    category_counts = (
        categories.value_counts()
        .sort_index()
        .rename_axis('category')
        .reset_index(name='count')
    )

    # Display names keyed by the radio-button values (the fake-news value is "fake").
    dataset_labels = {
        "true": "True News",
        "fake": "Fake News",
        "propa": "Russian Propaganda",
    }
    name = dataset_labels.get(dataset, dataset)

    fig = px.pie(
        category_counts,
        values="count",
        names='category',
        hole=0.6,
        title=f'Sentiment of {n_gram}-grams: {name}',
    )
    return fig

def sent_catigorize(score):
    """Map a sentiment score to a category label.

    Returns "Positive" for scores above zero, "Negative" for scores below
    zero and "Neutral" otherwise.
    """
    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    # The neutral label must be returned explicitly; a bare expression here
    # would make the function return None.
    return "Neutral"


# Start the Dash server on port 8664 with debug enabled. mode="inline" is
# presumably what renders the app in the cell output - confirm against the
# jupyter_dash docs.
app.run_server(debug=True, mode="inline", port=8664)

Dash is running on http://127.0.0.1:8664/



INFO:dash.dash:Dash is running on http://127.0.0.1:8664/



<IPython.core.display.Javascript object>