In [1]:
import ipynb.fs.full.times_reader as feedreader
import ipynb.fs.full.openai_models as openai
import ipynb.fs.full.gemini_models as gemini
import ipynb.fs.full.data_operations as data
import ipynb.fs.full.database_operations as database
import pandas
import pprint
import json

In [2]:
def main_with_gsheets():
    """Classify feed headlines per city and hand the positive ones to the sheet writer.

    For each city in a fixed list this:
      1. reloads the feed links and reads the feed entries for the city,
      2. sends the ``sub-site``/``title`` columns to ``gemini.Gemini_Models.classify_headlines``,
      3. parses the JSON response into a frame, joins it onto the entries,
         keeps rows whose ``classification`` is True, de-duplicates on ``link_id``,
      4. passes the result to ``data.gspread_Operator().write_headlines``.

    Any ``Exception`` raised while handling one city is printed (with traceback)
    and the loop moves on to the next city. ``KeyboardInterrupt``/``SystemExit``
    are deliberately NOT swallowed, so the run can be interrupted.

    Returns:
        None
    """
    import traceback

    cities = ['Mumbai', 'Delhi', 'Bangalore']
    # Responses that carry no classified headlines at all.
    empty_responses = ('', '{}', '[]')

    for city in cities:
        try:
            feedreader.toi_feed_reader.load_links()

            print(f'Trying feed for city: {city}...')
            feed = feedreader.toi_feed_reader(city)
            entries = feed.get_feed_entries()
            print(f'Found {entries.shape[0]} news articles for {city}. Now processing to analyse...')

            llm = gemini.Gemini_Models()
            response = llm.classify_headlines(entries[['sub-site', 'title']], silent_mode=False)
            if response.strip() in empty_responses:
                continue

            classified_hl = pandas.DataFrame(json.loads(response), columns=['title', 'classification', 'explanation'])

            # DataFrame.join aligns on the index, so this relies on the model
            # returning one row per headline in the same order as `entries`.
            # NOTE(review): confirm that ordering guarantee against the model wrapper.
            itemized_hl = entries.join(classified_hl, lsuffix='_orig', rsuffix='_copy')

            # Element-wise comparison against True (not `is True`) to build a mask.
            # drop_duplicates is chained instead of run inplace on the filtered
            # slice, which avoids mutating a possible view of `itemized_hl`.
            itemized_hl = (
                itemized_hl[itemized_hl['classification'] == True]
                .drop_duplicates(subset=['link_id'], keep='first')
            )

            writer = data.gspread_Operator()
            writer.write_headlines(itemized_hl)

        except Exception as e:
            # Best-effort per city: report and fall through to the next iteration.
            # There is intentionally no `finally: continue` here: a `continue`
            # inside `finally` discards any in-flight exception, including
            # KeyboardInterrupt, making the loop impossible to stop.
            print(f'Caught exception {e} in code flow...')
            traceback.print_exc()

In [3]:
def main_with_databases():
    """Classify not-yet-seen feed headlines and persist the results in MySQL.

    For each sub-site in a fixed list this:
      1. reloads the feed links and reads the feed entries for the sub-site,
      2. looks up which ``link_id`` values already exist in ``fact_feed_table``
         and keeps only the entries that are not there yet,
      3. sends the remaining ``link_id``/``sub_site_name``/``title`` columns to
         ``gemini.Gemini_Models.classify_headlines``,
      4. merges the parsed JSON response back onto the entries by ``link_id``,
         keeps rows whose ``classification`` is True, de-duplicates on ``link_id``,
      5. inserts those rows into ``fact_classified_articles`` and records every
         newly seen entry in ``fact_feed_table`` so it is not processed again.

    If the model returns an empty response, nothing is written for that
    sub-site (so the same entries are picked up again on the next run).

    Any ``Exception`` raised while handling one sub-site is printed (with
    traceback) and the loop moves on. ``KeyboardInterrupt``/``SystemExit`` are
    deliberately NOT swallowed, so the run can be interrupted.

    Returns:
        None
    """
    import traceback

    # Writing all of this with databases instead of google_sheets because:
    # 1. Processing with local databases is faster
    # 2. Storing data is much easier with a local database because of limitations in write ops in Google Sheets

    sub_sites = ['Hyderabad']
    # Responses that carry no classified headlines at all.
    empty_responses = ('', '{}', '[]')

    for sub_site in sub_sites:
        try:
            feedreader.toi_feed_reader.load_links()

            print(f'Trying feed for sub_site: {sub_site}...')
            feed = feedreader.toi_feed_reader(sub_site)
            entries = feed.get_feed_entries()
            print(f'Found {entries.shape[0]} news articles for {sub_site}. Now processing to dedupe...')

            cnxn = database.mysql_Database('timely_feeds')
            existing_links = cnxn.search_values_in_table('fact_feed_table', 'link_id', entries['link_id'], ['link_id'])

            # Keep only the entries whose link_id has not been recorded yet.
            uniques = entries[~entries['link_id'].isin(existing_links['link_id'])]
            if uniques.shape[0] == 0:
                print(f'No new articles found for this subsite: {sub_site}')
                continue

            # There is at least one article worth testing.
            llm = gemini.Gemini_Models()
            response = llm.classify_headlines(uniques[['link_id', 'sub_site_name', 'title']], silent_mode=False)
            if response.strip() in empty_responses:
                continue

            classified_hl = pandas.DataFrame(json.loads(response), columns=['link_id', 'title', 'classification', 'explanation'])
            # Inner merge on link_id; columns that clash (e.g. title) keep the
            # feed's version unsuffixed and the model's copy gets '_copy'.
            itemized_hl = pandas.merge(uniques, classified_hl, on='link_id', how='inner', suffixes=('', '_copy'))

            # Element-wise comparison against True (not `is True`) to build a mask.
            # drop_duplicates is chained instead of run inplace on the filtered
            # slice, which avoids mutating a possible view of `itemized_hl`.
            itemized_hl = (
                itemized_hl[itemized_hl['classification'] == True]
                .drop_duplicates(subset=['link_id'], keep='first')
            )

            # Write these articles in the database. Skipped when nothing was
            # classified as relevant: there are no rows to write, and an error
            # raised here would also prevent the feed-table bookkeeping below.
            # NOTE(review): this selects a 'links' column, while the manual insert
            # cell further down this notebook uses 'link' — confirm the real name.
            if itemized_hl.shape[0] > 0:
                cnxn.insert_values_in_table(
                    'fact_classified_articles',
                    itemized_hl[['site_name', 'sub_site_name', 'link_id', 'links', 'title', 'link_date', 'classification', 'explanation']],
                )

            # Write the id of all feed gathered articles to the feed_table database so as to not read them again
            cnxn.insert_values_in_table(
                'fact_feed_table',
                uniques[['link_id', 'site_name', 'sub_site_name', 'link_date']],
            )

        except Exception as e:
            # Best-effort per sub-site: report and fall through to the next one.
            # There is intentionally no `finally: continue` here: a `continue`
            # inside `finally` discards any in-flight exception, including
            # KeyboardInterrupt, making the loop impossible to stop.
            print(f'Caught exception {e} in code flow...')
            traceback.print_exc()

In [5]:
# Entry point: run the database-backed pipeline when this notebook/module is
# executed directly rather than imported.
if __name__ == "__main__":
    # main_with_databases() has no return statement, so `hl` is bound to None.
    hl = main_with_databases()

Trying feed for sub_site: Hyderabad...
Found 20 news articles for Hyderabad. Now processing to dedupe...
No new articles found for this subsite: Hyderabad


# Manual insert of two hand-written, already-classified articles into
# `fact_classified_articles`. Every run of this cell issues the insert again.
cnxn = database.mysql_Database('timely_feeds')

# One record per article; key order defines the column order of the frame.
item = pandas.DataFrame([
    {
        'site_name': 'TOI',
        'sub_site_name': 'Delhi',
        'link_id': '120831219',
        'link': 'https://timesofindia.indiatimes.com/city/delhi/dda-razes-illegal-farmhouses-on-encroached-land-in-sainik-farms/articleshow/120831219.cms',
        'title': 'DDA razes illegal farmhouses on encroached land in Sainik Farms',
        'link_date': '2025-05-02T22:58:48+05:30',
        'classification': 'TRUE',
        'explanation': 'Headline reports DDA action against illegal structures on encroached land.',
    },
    {
        'site_name': 'TOI',
        'sub_site_name': 'Mumbai',
        'link_id': '120834074',
        'link': 'https://timesofindia.indiatimes.com/city/mumbai/hc-declares-citys-biggest-dumping-ground-a-protected-forest/articleshow/120834074.cms',
        'title': "HC declares city's biggest dumping ground a 'protected forest'",
        'link_date': '2025-05-03T01:04:28+05:30',
        'classification': 'TRUE',
        'explanation': 'Court order changes land status, affecting potential development.',
    },
])

cnxn.insert_values_in_table(
    'fact_classified_articles',
    item[['site_name', 'sub_site_name', 'link_id', 'link', 'title', 'link_date', 'classification', 'explanation']],
)

In [6]:
# Inspect the link ids currently served by the Hyderabad feed.
# load_links() is called on the class before a reader is constructed, mirroring
# the pipeline functions above.
feedreader.toi_feed_reader.load_links()

feed = feedreader.toi_feed_reader("Hyderabad")
entries = feed.get_feed_entries()

# Last expression of the cell: displayed as the cell output.
entries["link_id"]

0     120854598
1     120854658
2     120854596
3     120854561
4     120854559
5     120850644
6     120850552
7     120847436
8     120847012
9     120846984
10    120846861
11    120833519
12    120833478
13    120833039
14    120833033
15    120833005
16    120832998
17    120832990
18    120832985
19    120832983
Name: link_id, dtype: object