In [None]:
# Just testing data extraction from a copy and pasted article


In [2]:
import pandas as pd
from ollama import Client
from pydantic import BaseModel, Field
import json
from typing import Optional, List

ollama_client = Client(host='http://localhost:11434')

class ArticleClassification(BaseModel):
    people: Optional[List[str]] = Field(None, description="List of proper nouns for people mentioned", min_items=0)
    organizations: Optional[List[str]] = Field(None, description="List of organizations mentioned", min_items=0)
    date: Optional[str] = Field(None, description="Relevant date mentioned (e.g., '2025-05-30' or 'May 2025')", max_length=50)
    location: Optional[str] = Field(None, description="Primary location mentioned (e.g., city, state, country)", max_length=100)
    main_topic: str = Field(..., description="Main topic of the article (2-3 words)", min_length=2, max_length=50)

def process_text_with_ollama(content, client):
    """
    Process input text and return a DataFrame with extracted information.
    
    Args:
        content: String representing the input text
        client: Ollama client instance
    
    Returns:
        pandas.DataFrame: DataFrame with extracted information
    """
    if not content or len(content.strip()) < 50:
        print("Error: Input text is too short or empty.")
        return pd.DataFrame()
    
    template = f"""
    You are an expert at extracting key information from any text. Analyze the input text and provide a structured JSON output with:
    - "people": List of proper nouns for individuals mentioned (e.g., full names like 'Taylor Swift', 'Scooter Braun'), null if none
    - "organizations": List of organizations, companies, or institutions mentioned (e.g., 'Tesla', 'CNN', 'Big Machine'), null if none
    - "date": The most relevant date mentioned (preferably in YYYY-MM-DD format like '2025-05-30' or descriptive like 'May 2025'), null if none
    - "location": The primary location mentioned (e.g., city, state, country like 'New York, NY'), null if no clear location
    - "main_topic": Main topic of the text (2-3 words, specific and descriptive, e.g., 'music rights', 'climate summit')
    Prioritize the date and location tied to the primary event or focus of the text. Include all relevant people and organizations mentioned.
    Return ONLY the JSON object, wrapped in triple backticks (```json\n{{}}\n```).
    Examples:
    ```json
    {{
        "people": ["Taylor Swift", "Scooter Braun"],
        "organizations": ["Big Machine", "CNN"],
        "date": "2025-05-25",
        "location": null,
        "main_topic": "music rights"
    }}
    ```
    ```json
    {{
        "people": ["Greta Thunberg"],
        "organizations": ["Greenpeace", "United Nations"],
        "date": "2025-05-28",
        "location": "London, UK",
        "main_topic": "climate summit"
    }}
    ```
    Content: {content[:2000]}
    """
    try:
        response = client.generate(
            model='mistral:7b-instruct-v0.3-q4_0',
            prompt=template,
            options={"temperature": 0.3}  # Lowered temperature for more precise outputs
        )
        raw_response = response['response'].strip()
        if raw_response.startswith('```json'):
            raw_response = raw_response[7:].rsplit('```', 1)[0].strip()
        print(f"Raw LLM response: {raw_response[:200]}...")
        result = json.loads(raw_response)
        classification = ArticleClassification.model_validate(result)
        # Convert to DataFrame, flattening lists
        result_dict = classification.dict()
        result_dict['people'] = ', '.join(result_dict['people'] or []) or None
        result_dict['organizations'] = ', '.join(result_dict['organizations'] or []) or None
        return pd.DataFrame([result_dict])
    except Exception as e:
        print(f"Error processing content: {str(e)}")
        return pd.DataFrame()

# Example usage with your provided text
if __name__ == "__main__":
    sample_text = """
    Taylor Swift has bought back the rights to her first six albums, ending a long-running battle over the ownership of her music.

"All of the music I've ever made now belongs to me," said the star, announcing the news on her official website. "I've been bursting into tears of joy... ever since I found out this is really happening."

The saga began in June 2019, when music manager Scooter Braun bought Swift's former record label Big Machine and, with it, all of the songs from Taylor Swift, Fearless, Speak Now, Red, 1989 and Reputation.

Swift had personal objections to the deal, blaming Braun for complicity in the "incessant, manipulative bullying" against her by Kanye West, one of his clients.

ADVERTISING

On her website, Swift said that reclaiming the rights to her music had, for a long time, seemed unimaginable.

"To say this is my greatest dream come true is actually being pretty reserved about it," she added, thanking fans for their support as the drama played out.

"I can't thank you enough for helping to reunite me with this art that I have dedicated my life to, but have never owned until now.

"I almost stopped thinking it could ever happen, after 20 years of having the carrot dangled and then yanked away," she wrote.

"But that's all in the past now."

In the music industry, the owner of a master recording controls the way it is distributed and licenced. The artist still earns royalties, but controlling the masters offers protection over how the work is used in future.

Reputation (Taylor's Version) delayed?
TAS Rights Management Taylor Swift sits cross-legged on the floor, holding copies of her first six albumsTAS Rights Management
Swift posed with vinyl copies of her first six albums to celebrate the news
Swift responded to the original sale of her masters by vowing to re-record those records, effectively diminishing the value of those master tapes, and putting ownership back in her hands.

To date, she has released four re-recorded albums - known as "Taylor's Versions" - with dozens of bonus tracks and supplementary material.

In her letter, the star told fans she had yet to complete the project, after "hitting a stopping point" while trying to remake 2017's Reputation album - which dealt with public scrutiny of her private life, and the fall-out of her feud with Kanye West.

"The Reputation album was so specific to that time in my life," she explained. "All that defiance, that longing to be understood while feeling purposefully misunderstood...

"To be perfectly honest, it's the one album in those first six that I thought couldn't be improved by re-doing it... so I kept putting it off."

Last week, the star previewed the new version of Reputation's first single, Look What You Made Me Do, in an episode of The Handmaid's Tale - but her letter suggested that a full re-recording would be delayed or even scrapped.

However, she promised that vault tracks from the record would be released at a future date, if fans were "into the idea".

She also confirmed that she had re-recorded her self-titled debut, adding: "I really love how it sounds now".

"Those two albums can still have their moments to re-emerge when the time is right," she added.

"But if it happens, it won't be from a place of sadness and longing for what I wish I could have. It will just be a celebration now."

What is a master recording?
As the name suggests, a master recording is the original recorded performance of a song. Whoever owns it controls all the rights to exploit the music.

That includes distributing it to streaming services, pressing new physical CDs and vinyl, creating box sets, or licensing songs to movies or video games.

Swift, as the writer or co-writer of her music, always maintained her publishing rights, which meant she was able to veto attempts to license songs like Shake It Off and Love Story to other companies.

"I do want my music to live on. I do want it to be in movies. I do want it to be in commercials. But I only want that if I own it," she told Billboard in 2019.

It is not known how much it cost Swift to acquire her masters, but the catalogue previously sold for $300m (£222m) in 2020.

The BBC understands that rumours she paid between $600m to $1bn are inaccurately high.

Getty Images Taylor Swift dances in an ornate cream ball gown during a date on her 2024 Eras TourGetty Images
Revisiting her old songs for the "Taylor's Version" project helped to inspire the career-spanning Eras Tour
How did the sale of Taylor Swift's masters happen?
When 14-year-old Taylor Swift moved to Nashville in 2004 to chase her dream of becoming a country pop star, she signed a record deal with Big Machine.

Label boss Scott Borchetta gave the unproven singer a big cash advance in exchange for having ownership of the master recordings to her first six albums "in perpetuity".

This was fairly common practice in the era before streaming, when artists needed record label backing to get played on the radio, and for the manufacture and distribution of CDs.

Swift's deal with Big Machine expired in 2018, at which point she left and signed with Republic Records and Universal Music Group (UMG).

A year later, Borchetta sold his label to Scooter Braun's Ithaca Holdings.

Swift said she only learned about the deal when it was announced; characterising it as an act of aggression that "stripped me of my life's work".

She labelled Braun - who rose to prominence as the manager of Justin Bieber and Ariana Grande - as "the definition of toxic male privilege in our industry".

She also expressed frustration that she had been unable to make a counter offer for her music.

"I spent 10 years of my life trying rigorously to purchase my masters outright and was then denied that opportunity," she told Billboard, adding that: "Artists should maybe have the first right of refusal to buy."

Was Taylor Swift really banned from playing her hits?
What is the Swift vs Braun dispute all about
Taylor Swift's Red: The stories behind the songs
Taylor Swift releases a 'perfect replica' of Fearless
Braun later told Variety that the dispute had "gotten out of hand" after he and his family received death threats.

The music mogul later sold his stake in Swift's back catalogue to Shamrock Holdings, a Los Angeles investment fund founded by the Disney family in 1978, in November 2020.

The multi-million dollar deal left Swift feeling betrayed again.

"This is the second time my music had been sold without my knowledge," she said in a social media post.

While she was "open to the possibility of a partnership with Shamrock", she subsequently learnt that, under the terms of the sale, Braun would "continue to profit off my old music" for years.

"I simply cannot in good conscience bring myself to be involved in benefiting Scooter Braun's interests," she wrote in a letter to the company, which she posted on X.

Getty Images Taylor Swift performs in a bejewelled top during the Z100 Jungle Ball in 2012Getty Images
Ownership of the masters means that Swift can now choose to license original recordings to films and TV shows, in addition to the re-recordings
She began releasing her re-recorded albums in 2021, starting with her breakthrough, coming-of-age album Fearless.

Produced with forensic attention to detail, they were often indistinguishable from the originals - albeit with slightly cleaner mixes, and greater separation between the instruments.

But the big attraction was the bonus tracks, including the unabridged, 10-minute version of her break-up ballad All Too Well - described by Variety magazine as the "holy grail" of the star's back catalogue.

The song went on to top the US charts, and made number three in the UK - where it is the longest song ever to reach the top five.

In the meantime, the singer continued to release original material, including the Grammy Award-winning albums Folklore and Midnights.

In 2023, Forbes magazine reported that Swift had become the first musician to make $1 billion (£740 million) solely from songwriting and performing.

Half of her fortune came from music royalties and touring, while the rest came from the increasing value of her music catalogue, including her re-recordings.

Revisiting the old material also inspired Swift's career-spanning Eras tour, which made more than $2 billion (£1.48 billion) in ticket sales across 2023 and 2024.

In her letter, Swift said the success of the Eras tour "is why I was able to buy back my music".

She added that she was heartened to see her struggle inspiring other artists.

"Every time a new artist tells me they negotiated to own their master recordings in their record contract because of this fight, I'm reminded of how important it was for all of this to happen.

"Thank you being curious about something that used to be thought of as too industry-centric for broad discussion.

"You'll never know how much it means to me that you cared. Every single bit of it counted, and ended us up here."
    """

    # Process text and get DataFrame
    df = process_text_with_ollama(sample_text, ollama_client)
    print("\nResulting DataFrame:")
    print(df)

Raw LLM response: {
        "people": ["Taylor Swift", "Scooter Braun"],
        "organizations": ["Big Machine"],
        "date": "2019-06",
        "location": null,
        "main_topic": "music rights"
    }...

Resulting DataFrame:
                        people organizations     date location    main_topic
0  Taylor Swift, Scooter Braun   Big Machine  2019-06     None  music rights


C:\Users\cha\AppData\Local\Temp\ipykernel_7512\4274856445.py:74: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  result_dict = classification.dict()
