In [1]:
"""Database connection configuration."""
import os
from urllib.parse import quote

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# The password comes from the environment so no credential is stored in the notebook.
_password = os.getenv("POSTGRES_PASSWORD")

# Percent-encode the password so reserved URL characters (e.g. '@', '/', ':')
# cannot corrupt the connection string. When the variable is unset, leave the
# password out entirely instead of sending the literal text "None".
_credentials = "postgres" if _password is None else f"postgres:{quote(_password, safe='')}"
DATABASE_URL = f"postgresql+psycopg2://{_credentials}@localhost/gazzetta"

# Engine and session factory bound to it; sessions neither autocommit nor autoflush.
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

def get_db():
    """Yield a new ``SessionLocal`` session and close it afterwards.

    This is a generator: it yields exactly one session, and the ``finally``
    clause closes that session when the generator is resumed, closed, or an
    exception is thrown into it.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        # Runs on normal completion and on error alike.
        session.close()

In [2]:
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from data_collection.db.models import Article, Blogger, Category, StancePrediction

# Build the connection URL. The password is read from the environment and
# percent-encoded so reserved URL characters cannot corrupt the URL; when the
# variable is unset the password is omitted instead of sending the text "None".
_password = os.getenv("POSTGRES_PASSWORD")
_credentials = "postgres" if _password is None else f"postgres:{quote(_password, safe='')}"
DATABASE_URL = f"postgresql+psycopg2://{_credentials}@localhost/gazzetta"
engine = create_engine(DATABASE_URL)

# Articles with blogger name, aggregated category names and stance predictions.
# The stance_predictions columns are part of the GROUP BY, so an article with
# several distinct predictions produces one row per prediction.
query = """
SELECT
    a.id,
    a.title,
    a.content,
    a.article_url,
    a.published_date,
    b.name as blogger_name,
    string_agg(DISTINCT c.name, ', ') as categories,
    sp.target_club,
    sp.stance,
    sp.justification
FROM articles a
LEFT JOIN bloggers b ON a.blogger_id = b.id
LEFT JOIN article_categories ac ON a.id = ac.article_id
LEFT JOIN categories c ON ac.category_id = c.id
LEFT JOIN stance_predictions sp ON a.id = sp.article_id
GROUP BY
    a.id,
    a.title,
    a.content,
    a.article_url,
    a.published_date,
    b.name,
    sp.target_club,
    sp.stance,
    sp.justification
"""

# Read into DataFrame
df = pd.read_sql_query(query, engine)

# Convert datetime columns
df['published_date'] = pd.to_datetime(df['published_date'])

# Split the aggregated category string back into a list. Rows without any
# category become [''] (a list holding one empty string).
# NOTE(review): assumes no category name itself contains ', ' -- confirm.
df['categories'] = df['categories'].fillna('').str.split(', ')

# Display info about the DataFrame. DataFrame.info() prints its report and
# returns None, so it is called directly rather than wrapped in print().
print("\nDataFrame Info:")
df.info()

print("\nSample of the data:")
print(df.head())

# Flatten the per-row category lists; the '' placeholder for "no category"
# is excluded from the count below.
category_names = df['categories'].explode()

# Count distinct article ids rather than rows, because a single article can
# occupy several rows (one per stance prediction).
print("\nTotal articles:", df['id'].nunique())
print("Unique bloggers:", df['blogger_name'].nunique())
print("Total categories:", category_names[category_names != ''].nunique())


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12448 entries, 0 to 12447
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              12448 non-null  int64         
 1   title           12448 non-null  object        
 2   content         12448 non-null  object        
 3   article_url     12448 non-null  object        
 4   published_date  12448 non-null  datetime64[ns]
 5   blogger_name    12448 non-null  object        
 6   categories      12448 non-null  object        
 7   target_club     105 non-null    object        
 8   stance          105 non-null    object        
 9   justification   105 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 972.6+ KB
None

Sample of the data:
   id                                              title  \
0   1                  Τάραξε λίγο τα νερά ο Ολυμπιακός…   
1   2  Ο Μεντιλίμπαρ πλέον τον ξέ

In [8]:
# Per-blogger tally of each stance label; rows without a stance are dropped first.
(
    df.dropna(subset=["stance"])
    .groupby("blogger_name")["stance"]
    .value_counts()
)

blogger_name           stance  
Βασίλης Σαμπράκος      ουδέτερη     5
Γιάννης Σερέτης        αρνητική     1
                       θετική       1
                       ουδέτερη     1
Γιώργος Κούβαρης       ουδέτερη    30
                       αρνητική     1
Γιώργος Τσακίρης       ουδέτερη     2
Δημήτρης Τομαράς       ουδέτερη    17
                       θετική       9
                       αρνητική     2
Κώστας Νικολακόπουλος  θετική       4
                       αρνητική     3
                       ουδέτερη     3
Νίκος Αθανασίου        ουδέτερη     8
Νίκος Παπαδογιάννης    αρνητική     1
                       ουδέτερη     1
Σταύρος Σουντουλίδης   ουδέτερη    13
Χρύσανθος Τσαλτίδης    ουδέτερη     2
                       αρνητική     1
Name: count, dtype: int64