In [1]:
import os
import time
import warnings
from datetime import datetime
from urllib.parse import quote_plus

import cloudscraper
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from sqlalchemy import types, create_engine
from ulid import ULID
# NOTE: silences every warning category for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
## load env
# `__file__` is quoted on purpose: the name is not defined inside a notebook,
# so the literal resolves against the current working directory instead.
notebook_stub = os.path.abspath('__file__')
BASE_DIR = os.path.dirname(notebook_stub)
dotenv_path = os.path.join(BASE_DIR, '.env')
load_dotenv(dotenv_path)

True

## Retrieve section data from MySQL

In [3]:
# MySQL connection settings, read from the process environment.
MYSQL_DB_USER = os.getenv("MYSQL_DB_USER")
MYSQL_DB_PASSWORD = os.getenv("MYSQL_DB_PASSWORD")
MYSQL_DB_HOST = os.getenv("MYSQL_DB_HOST")
# Fall back to MySQL's standard port (3306) so that an unset variable does not
# crash with the opaque "int() argument must be ... not 'NoneType'" TypeError.
MYSQL_DB_PORT = int(os.getenv("MYSQL_DB_PORT", "3306"))
MYSQL_DB_NAME = os.getenv("MYSQL_DB_NAME")

In [4]:
# Raw PyMySQL connection used for the read query below.
# `password` / `database` are the supported keyword names; `passwd` / `db`
# are deprecated aliases kept by PyMySQL only for backwards compatibility.
conn = pymysql.connect(host=MYSQL_DB_HOST,
                       port=MYSQL_DB_PORT,
                       user=MYSQL_DB_USER,
                       password=MYSQL_DB_PASSWORD,
                       database=MYSQL_DB_NAME)

In [5]:
## fetch the id and url of the 'Abroad, Ghanaians' section
section_data = pd.read_sql_query(
    sql="Select id,url from sections where section = 'Abroad, Ghanaians'",
    con=conn,
)

## Getting story URLs

In [6]:
# column layout shared by every story-URL frame built in this notebook
df_columns = [
    'sectionID',
    'url',
    'storyExtracted',
]

In [7]:
def get_story_urls(row, delay_seconds=20):
    """Scrape one section page and list the story links found on it.

    Parameters
    ----------
    row : mapping-like (e.g. one row of ``section_data``)
        Must provide ``"id"`` (written to the ``sectionID`` column) and
        ``"url"`` (the page to fetch; also used as the prefix of every
        story link found on that page).
    delay_seconds : float, optional
        Pause applied once after the page has been fetched and parsed, to
        throttle successive requests. Defaults to 20.

    Returns
    -------
    pandas.DataFrame
        One row per story link, with the columns named in ``df_columns``
        and ``storyExtracted`` set to ``"PENDING"``. When no link is found
        the frame is empty but still carries those columns.
    """
    scraper = cloudscraper.create_scraper()
    page = scraper.get(row["url"])
    soup = BeautifulSoup(page.content, 'html.parser')

    # Collect plain records and build the frame once at the end, instead of
    # re-concatenating a DataFrame for every link.
    records = []
    for container in soup.find_all("div", class_="afcon-news list"):
        for item in container.find_all("li"):
            anchor = item.find("a")
            href = anchor.get("href") if anchor is not None else None
            if not href:
                # <li> without a usable link: nothing to record.
                continue
            records.append([row["id"], row["url"] + href, "PENDING"])

    # Throttle once per fetched page, whether or not any container matched.
    time.sleep(delay_seconds)

    return pd.DataFrame(records, columns=df_columns)

In [8]:
# scrape every section row; each call returns a frame of story links
url_df = section_data.apply(get_story_urls, axis=1)

In [9]:
# empty accumulator that already carries the expected column layout
article_urls_df = pd.DataFrame(data=None, columns=df_columns)

In [10]:
# Stack the per-section frames with a single concat. The result is rebuilt
# from an empty, correctly-columned frame every time, so re-running this cell
# does not duplicate rows, and the cost no longer grows quadratically with
# the number of sections.
article_urls_df = pd.concat(
    [pd.DataFrame(columns=df_columns), *url_df],
    ignore_index=True,
)

## Add ULID to table

In [11]:
## helper that tags a row with a freshly generated ulid
def generate_ulid(row):
    """Store a new ULID, converted to an integer, under ``row["id"]`` and return the row."""
    fresh_id = int(ULID())
    row["id"] = fresh_id
    return row

In [12]:
# give every story row its own id
article_urls_df = article_urls_df.apply(generate_ulid, axis=1)

In [13]:
# move the generated id to the front, ahead of the scraped fields
article_urls_df = article_urls_df.loc[:, ["id", "url", "sectionID", "storyExtracted"]]

In [14]:
# stamp every row with the time this batch was generated
article_urls_df = article_urls_df.assign(dateGenerated=datetime.now())

## Push to db

In [20]:
# Percent-encode the credentials so that characters such as '@', ':' or '/'
# in the user name or password cannot corrupt the connection URL.
mysql_engine = create_engine(
    f"mysql+pymysql://{quote_plus(MYSQL_DB_USER)}:{quote_plus(MYSQL_DB_PASSWORD)}"
    f"@{MYSQL_DB_HOST}:{MYSQL_DB_PORT}/{MYSQL_DB_NAME}"
)

In [33]:
## create df schema
# Map each column to its SQL type by name rather than by position, so the
# mapping stays correct if the frame's column order changes and cannot be
# silently truncated by zip() when the two sequences differ in length.
df_schema = {
    "id": types.VARCHAR(length=100),
    "url": types.VARCHAR(length=1000),
    "sectionID": types.VARCHAR(length=100),
    "storyExtracted": types.VARCHAR(length=100),
    "dateGenerated": types.TIMESTAMP,
}

In [35]:
# append this batch of story URLs to the existing table
article_urls_df.to_sql(
    name="story_urls",
    con=mysql_engine,
    if_exists="append",
    index=False,
    dtype=df_schema,
)

27

## Extracting Stories from urls

In [23]:
# story_df_columns = ['Section','Header','Date','Title','Information']
# stories_data = pd.DataFrame(columns = story_df_columns)

In [24]:
# def get_abroad_gh_articles(section_title,url):
#     df = pd.DataFrame()
#     scraper = cloudscraper.create_scraper()
#     page = scraper.get(url)
#     soup = BeautifulSoup(page.content, 'html.parser')
#     page_stories = soup.find(id = "medsection1")
#     try:
#         div_class = page_stories.find("div", class_ = "article-left-col")       
#     except AttributeError:
#         print('cannot find the target div: article-left-col')
#         file_object = open("C:\\Users\\annieboadu\\Documents\\ghana-web-data-main\\error_urls\\article urls.txt", 'a')
#         file_object.write(f'\n\n{section_title},{url}')
#         file_object.close()
#         return None
#     else:
#         header = div_class.find('p', class_ = "floatLeft").text
#         date = div_class.find('a', id = "date").text
#         title = div_class.find('h1').text
#         info = div_class.find('p',id = "article-123").text
#         temp_row = pd.DataFrame(data = [[section_title, header, date, title, info]], columns = story_df_columns)
#         df = pd.concat([df,temp_row],ignore_index =True)
#     return df

In [36]:
# abroad_gh_final_df = article_urls_df.apply(lambda row:get_abroad_gh_articles(row['Section Title'],row['Url']), axis = 1)

In [26]:
# abroad_stories_urls_df = pd.DataFrame(columns = story_df_columns)

In [37]:
# for row in abroad_gh_final_df:
#     abroad_stories_urls_df = pd.concat([abroad_stories_urls_df,row])

In [28]:
# abroad_stories_urls_df.reset_index(inplace=True)

In [29]:
# del abroad_stories_urls_df["index"]