In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd
from gensim import corpora, models
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Ensure required NLTK resources are available.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize (it superseded the pickled 'punkt' models); 'punkt' is still
# requested so that older releases keep working.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
import sqlite3
import pandas as pd

def connect_to_db(db_path):
    """Open the SQLite database at ``db_path`` and return the connection.

    The caller owns the returned connection and is responsible for closing it.
    """
    return sqlite3.connect(db_path)

def load_table_to_dataframe(conn, table_name):
    """Load every row of a table from the SQLite database into a DataFrame.

    Args:
        conn: An open database connection accepted by ``pd.read_sql``.
        table_name: Name of the table, optionally schema-qualified
            (``"main.article"``). A name that is already wrapped in double
            quotes is used verbatim.

    Returns:
        A DataFrame holding the result of ``SELECT *`` on the table.
    """
    name = str(table_name)
    already_quoted = len(name) >= 2 and name[0] == '"' and name[-1] == '"'
    if not already_quoted:
        # Identifiers cannot be bound as SQL parameters, so quote them instead:
        # this keeps reserved words ("order") and names containing spaces valid,
        # and stops the name from being read as arbitrary SQL.
        name = '.'.join(
            '"' + part.replace('"', '""') + '"' for part in name.split('.')
        )
    return pd.read_sql(f"SELECT * FROM {name}", conn)

def create_text_column(df, columns=('title', 'meta_description', 'description', 'body')):
    """Add a 'TEXT' column that joins several text columns with single spaces.

    Missing values are replaced by an empty string *before* joining, so a row
    with one missing field keeps the text of its other fields (concatenating
    first would turn the whole row into NaN).

    Args:
        df: DataFrame containing every column named in ``columns``. It is
            modified in place.
        columns: Column names to join, in order.

    Returns:
        The same DataFrame, with the 'TEXT' column added.
    """
    text = None
    for column in columns:
        part = df[column].astype(object)
        # Blank out missing entries, then force everything to str so the
        # concatenation below cannot fail on non-string values.
        part = part.where(part.notna(), '').astype(str)
        text = part if text is None else text + ' ' + part

    df['TEXT'] = text if text is not None else ''
    return df

# DB path
db_path = r"../data/articles.sqlite"

# Table to load
table_name = "article"

# Connect to the database and load the table into a DataFrame.
# The connection is closed in `finally` so it is released even when the
# query fails (for example, when the table does not exist).
conn = connect_to_db(db_path)
try:
    df = load_table_to_dataframe(conn, table_name)
finally:
    conn.close()

# Create the 'TEXT' column
df = create_text_column(df)

# Display the resulting DataFrame
print(df.columns)
print(df.head())

In [None]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd
from gensim import corpora, models
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from dateutil import parser
import os

# Ensure required NLTK resources are available.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize (it superseded the pickled 'punkt' models); 'punkt' is still
# requested so that older releases keep working.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('spanish'))
lemmatizer = WordNetLemmatizer()

# Preprocess text
def preprocess_text(text):
    """Turn raw text into a list of lower-cased, filtered, lemmatized tokens.

    Tokens are kept only when they are purely alphabetic and are not in the
    module-level ``stop_words`` set; each kept token is passed through the
    module-level ``lemmatizer``.

    Args:
        text: The document text. Anything that is not a non-empty string
            (None, NaN, '') yields an empty token list instead of raising.

    Returns:
        list[str]: The processed tokens, in document order.
    """
    if not isinstance(text, str) or not text:
        return []

    # The stop-word list is Spanish, so split sentences with the Spanish Punkt
    # model rather than the English default.
    tokens = word_tokenize(text.lower(), language='spanish')

    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so it
    # probably leaves most Spanish tokens unchanged - confirm this is intended.
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

# Custom function to parse the Spanish date strings
def parse_spanish_date(date_str):
    """Parse a date whose month may be spelled out in Spanish.

    Spanish month names are translated to their English names (which the
    dateutil parser recognises as months) rather than to bare numbers: with
    ``dayfirst=True`` a numeric replacement would turn "enero 12, 2023" into
    "01 12, 2023" and be read as 1 December. The connectors "de"/"del"
    ("12 de enero de 2023") are removed because the parser does not know them.

    Args:
        date_str: The raw value. None/NaN yields ``pd.NaT``; datetime values
            are passed through; anything else is converted with ``str``.

    Returns:
        A timezone-naive datetime, or ``pd.NaT`` when the value is missing,
        blank, or cannot be parsed. Timezone information is dropped without
        converting the clock time.
    """
    # Function-scope imports keep this block self-contained.
    import re
    from datetime import datetime

    # Handle None or NaN values
    if pd.isnull(date_str):
        return pd.NaT

    # Already a datetime (pd.Timestamp included): only the timezone is dropped.
    if isinstance(date_str, datetime):
        if date_str.tzinfo is not None:
            return date_str.replace(tzinfo=None)
        return date_str

    months = {
        "enero": "january",
        "febrero": "february",
        "marzo": "march",
        "abril": "april",
        "mayo": "may",
        "junio": "june",
        "julio": "july",
        "agosto": "august",
        "septiembre": "september",
        "setiembre": "september",  # accepted spelling variant
        "octubre": "october",
        "noviembre": "november",
        "diciembre": "december"
    }

    text = str(date_str).strip().lower()
    if not text:
        return pd.NaT

    # Match whole words only: the name must not be glued to other letters, but
    # may touch digits or punctuation ("12enero2023", "12/enero/2023").
    not_letter_before = r"(?<![a-záéíóúüñ])"
    not_letter_after = r"(?![a-záéíóúüñ])"

    text = re.sub(
        not_letter_before + "(" + "|".join(months) + ")" + not_letter_after,
        lambda match: months[match.group(1)],
        text,
    )
    text = re.sub(not_letter_before + r"del?" + not_letter_after, " ", text)

    # Now try to parse the date using dateutil.parser
    try:
        parsed_date = parser.parse(text, dayfirst=True)
    except (ValueError, OverflowError):
        # Unparseable text, or numbers too large for a date: Not a Time.
        return pd.NaT

    # Remove timezone info if present
    if parsed_date.tzinfo is not None:
        parsed_date = parsed_date.replace(tzinfo=None)

    return parsed_date

# Apply the custom parsing function to the 'date' column
df['DATE'] = df['date'].apply(parse_spanish_date)

preprocessed_texts = df['TEXT'].apply(preprocess_text)

# Define the path to save the LDA model
lda_model_path = "lda_model_interactive.gensim"

# Load a previously trained model when one exists.
lda_model = None
dictionary = None
if os.path.exists(lda_model_path):
    lda_model = models.LdaModel.load(lda_model_path)
    print("LDA model loaded from disk.")
    # Reuse the vocabulary stored with the model: a dictionary rebuilt from
    # the current texts may assign different ids to the same tokens, and the
    # bag-of-words vectors would then no longer match what the model learned.
    if isinstance(lda_model.id2word, corpora.Dictionary):
        dictionary = lda_model.id2word

# No cached vocabulary available: create a dictionary from the preprocessed text
if dictionary is None:
    dictionary = corpora.Dictionary(preprocessed_texts)

# Create a bag-of-words corpus
corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]

# Train and save the model only when nothing could be loaded
if lda_model is None:
    # A fixed random_state makes the trained topics reproducible across runs.
    lda_model = models.LdaModel(
        corpus, num_topics=7, id2word=dictionary, passes=100, random_state=42
    )
    lda_model.save(lda_model_path)
    print("LDA model trained and saved to disk.")

# Label topics with the 2 most frequent bigrams
def identify_topics_by_frequent_bigrams(lda_model, corpus, preprocessed_texts, num_bigrams=2):
    """Label each topic with the most frequent bigrams of its documents.

    Every document is assigned to the topic with the highest probability in
    ``lda_model[doc]``; the label of a topic is built from the ``num_bigrams``
    most common adjacent token pairs across the documents assigned to it.

    Args:
        lda_model: Object exposing ``num_topics`` and ``lda_model[doc]``, the
            latter returning ``(topic_id, probability)`` pairs.
        corpus: Sequence of documents in the form ``lda_model`` accepts.
        preprocessed_texts: Token lists, in the same order as ``corpus``. Any
            iterable works; a pandas Series is read by position, not by label.
        num_bigrams: How many bigrams to put in each label.

    Returns:
        tuple: ``(topic_labels, grouped_documents)`` where ``topic_labels``
        maps topic id to a unique, non-empty label and ``grouped_documents``
        maps topic id to the list of positions of its documents.
    """
    # Positions from enumerate(corpus) are used as indices below, so take the
    # texts by position; indexing a Series would look up index *labels*.
    texts = list(preprocessed_texts)

    grouped_documents = {topic: [] for topic in range(lda_model.num_topics)}
    for doc_id, doc in enumerate(corpus):
        topic_probs = lda_model[doc]
        if not topic_probs:
            # No topic reported for this document: leave it ungrouped.
            continue
        dominant_topic = max(topic_probs, key=lambda pair: pair[1])[0]
        grouped_documents[dominant_topic].append(doc_id)

    # Extract the most frequent bigrams for each topic
    topic_labels = {}
    used_labels = set()
    for topic, documents in grouped_documents.items():
        bigram_freq = Counter(
            bigram
            for doc_id in documents
            for bigram in zip(texts[doc_id], texts[doc_id][1:])
        )
        label = ' / '.join(
            ' '.join(bigram) for bigram, _ in bigram_freq.most_common(num_bigrams)
        )
        if not label:
            # Topic without documents (or without any bigram).
            label = f"Topic {topic}"
        if label in used_labels:
            # Labels are used as lookup keys by callers, so keep them unique.
            label = f"{label} (topic {topic})"
        used_labels.add(label)
        topic_labels[topic] = label

    return topic_labels, grouped_documents

topic_labels, grouped_documents = identify_topics_by_frequent_bigrams(lda_model, corpus, preprocessed_texts)

# Prepare data for plotting: one row per (document, topic) pair.
# The corpus is paired with the DataFrame rows by position; using the index
# label as a list position would break for any non-default index.
topic_data = []
for doc_id, doc_bow, doc_date in zip(df.index, corpus, df['DATE']):
    for topic_id, prob in lda_model.get_document_topics(doc_bow):
        topic_data.append({
            "Document": doc_id,
            "Topic": topic_labels[topic_id],
            "Probability": prob,
            "Date": doc_date
        })

# Naming the columns keeps the frame well-formed even when there are no rows.
topic_df = pd.DataFrame(topic_data, columns=["Document", "Topic", "Probability", "Date"])

# Ensure the Date column is in datetime format
topic_df['Date'] = pd.to_datetime(topic_df['Date'])

# Group by month and calculate the average topic distribution.
# Rows without a usable date are left out of the averages; otherwise they
# would be collected under a month literally called "NaT".
topic_df['Month'] = topic_df['Date'].dt.to_period('M').astype(str)
monthly_avg_topic_df = (
    topic_df[topic_df['Date'].notna()]
    .groupby(['Month', 'Topic'])
    .agg({'Probability': 'mean'})
    .reset_index()
)

# Create the Dash application
app = dash.Dash(__name__)

# Page layout: a heading, a topic selector, the chart, and a container for
# the word list of the selected topic. The selector uses the topic label as
# both the visible text and the value, and starts on topic 0.
app.layout = html.Div(
    children=[
        html.H1(children="Interactive Topic Exploration"),
        dcc.Dropdown(
            id='topic-dropdown',
            options=[
                {'label': label, 'value': label}
                for label in map(topic_labels.__getitem__, range(lda_model.num_topics))
            ],
            value=topic_labels[0],
            clearable=False,
        ),
        dcc.Graph(id='topic-graph'),
        html.Div(id='topic-words'),
    ]
)

# Callback to update the graph and the list of words for a selected topic
@app.callback(
    [Output('topic-graph', 'figure'),
     Output('topic-words', 'children')],
    [Input('topic-dropdown', 'value')]
)
def update_graph(selected_topic):
    """Rebuild the chart and the word list for the topic chosen in the dropdown.

    Args:
        selected_topic: The label of the selected topic (the dropdown value).

    Returns:
        tuple: A bar figure of the monthly average probability for the topic,
        and a Div listing the topic's ten highest-weighted words.
    """
    is_selected = monthly_avg_topic_df['Topic'] == selected_topic
    monthly_rows = monthly_avg_topic_df[is_selected]

    figure = px.bar(
        monthly_rows,
        x='Month',
        y='Probability',
        title=f'Average Monthly Distribution for Topic {selected_topic}',
        labels={"Probability": "Average Probability"},
    )

    # Map the label back to its topic id (first topic carrying that label).
    known_ids = list(topic_labels.keys())
    known_labels = list(topic_labels.values())
    topic_id = known_ids[known_labels.index(selected_topic)]

    word_items = [
        html.Li(f"{word}: {round(prob, 3)}")
        for word, prob in lda_model.show_topic(topic_id, topn=10)
    ]
    words_panel = html.Div([
        html.H3("Most Frequent Words"),
        html.Ul(word_items),
    ])

    return figure, words_panel

# Run the app
if __name__ == '__main__':
    # Newer Dash releases expose `app.run` and have deprecated/removed
    # `app.run_server`; fall back to the old name only when `run` is missing.
    run_app = getattr(app, 'run', None) or app.run_server
    run_app(debug=True)
