In [1]:
import csv
import json
import re
import time
from urllib.parse import urljoin, urlparse

import idna
import numpy as np
import pandas as pd
import requests
from scrapy.http import TextResponse
from urllib3.exceptions import LocationValueError

from geonamescache.mappers import country
# Translates a country name into its ISO code.
mapper = country(from_key='name', to_key='iso')

# The API key is kept in a local file so it is not hard-coded in the notebook.
# Strip surrounding whitespace: a trailing newline in the key file would
# otherwise be concatenated into the request URL and corrupt the `key` value.
with open('google.key', 'r') as f:
    APIKEY = f.read().strip()

AMP_BATCHGET_URL = 'https://acceleratedmobilepageurl.googleapis.com/v1/ampUrls:batchGet?key=' + APIKEY

In [2]:
# Headers sent with every page request; the User-Agent string identifies the
# client as Chrome on an Android 7.0 device.
USER_AGENT = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 '
        'Chrome/60.0.3112.107 Mobile Safari/537.36'
    ),
}

def getArticleLinks(url):
    """Fetch ``url`` and return the links on that page that look like articles.

    A link qualifies when its ``href`` contains more than five hyphens
    (typical of headline slugs) and none of the characters ``? # & =``.

    Parameters
    ----------
    url : str
        Address of the page to scan.

    Returns
    -------
    pandas.Series
        De-duplicated absolute ``http``/``https`` URLs, in page order. The
        series is empty when the page cannot be fetched or parsed, or when
        no link qualifies.
    """
    no_links = pd.Series([], dtype=object)

    try:
        r = requests.get(url, headers=USER_AGENT, timeout=4)
    except (requests.exceptions.RequestException,
            idna.core.IDNAError,
            UnicodeError,  # also covers UnicodeEncodeError
            LocationValueError):
        # Unreachable or malformed hosts simply yield no links.
        return no_links

    try:
        response = TextResponse(r.url, body=r.text, encoding='utf-8')
        hrefs = response.xpath('//a[contains(@href, "-")]/@href').extract()
    except UnicodeEncodeError:
        return no_links

    links = []
    for href in hrefs:
        href = href.strip()
        hyphens = href.count('-')
        symbols = sum(href.count(ch) for ch in '?#&=')

        # More than 5 hyphens and no query/fragment characters: very likely
        # a news article rather than a navigation or tracking link.
        if hyphens <= 5 or symbols > 0:
            continue

        # Resolve against the final (post-redirect) page URL. This handles
        # root-relative ("/a-b"), protocol-relative ("//host/a-b") and
        # already-absolute links, and avoids the "http://domain//path"
        # results that plain string concatenation produced.
        try:
            absolute = urljoin(r.url, href)
            scheme = urlparse(absolute).scheme
        except ValueError:
            # e.g. a malformed IPv6 literal inside the href
            continue

        # Drop mailto:, javascript:, tel: and similar non-web targets.
        if scheme in ('http', 'https'):
            links.append(absolute)

    return pd.Series(links, dtype=object).drop_duplicates(keep='first')

def getAMPUrl(link):
    """Return the AMP address(es) advertised by the page at ``link``.

    The page is downloaded and every ``<link>`` element whose ``rel``
    attribute contains "amphtml" is located.

    Returns a list with the ``href`` value of each such element (empty when
    there is none), or ``None`` when the HTTP request fails.
    """
    try:
        page = requests.get(link, headers=USER_AGENT, timeout=4)
    except requests.exceptions.RequestException:
        return None

    document = TextResponse(page.url, body=page.text, encoding='utf-8')
    amp_hrefs = document.xpath('//link[contains(@rel, "amphtml")]/@href')
    return amp_hrefs.extract()

def getHTTPCode(link):
    """Return the HTTP status code obtained by requesting ``link`` with GET.

    ``None`` is returned when the request raises a ``requests`` exception
    (connection failure, timeout after 4 seconds, invalid URL, ...).
    """
    try:
        reply = requests.get(link, headers=USER_AGENT, timeout=4)
    except requests.exceptions.RequestException:
        status = None
    else:
        status = reply.status_code
    return status

def getSampleLinksByCC(df, cc, n):
    """Return up to ``n`` rows of ``df`` whose ``cc`` column equals ``cc``.

    When fewer than ``n`` rows match, all of them are returned in their
    original order; otherwise ``n`` matching rows are drawn at random.
    """
    matching = df.loc[df['cc'] == cc]
    return matching.sample(n=n) if len(matching) >= n else matching
    
def getDomain(url):
    """Return the network-location part of ``url`` (host, plus any port or
    ``user@`` prefix); an empty string when the URL has no ``//`` section."""
    return str(urlparse(url).netloc)

def split_dataframe(df, chunk_size):
    """Split ``df`` into consecutive pieces of at most ``chunk_size`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    chunk_size : int
        Maximum number of rows per piece; must be at least 1.

    Returns
    -------
    list of pandas.DataFrame
        The pieces in row order. Every piece has ``chunk_size`` rows except
        possibly the last one. An empty frame yields an empty list.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1 (the previous implementation
        looped forever in that case).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Positional slicing keeps the final piece even when the row count is an
    # exact multiple of chunk_size; the earlier head/tail loop dropped it.
    return [df.iloc[start:start + chunk_size]
            for start in range(0, len(df), chunk_size)]

def getAMPUrls(df_urls, timeout=10):
    """Look up the AMP versions of a batch of URLs with one batchGet request.

    Parameters
    ----------
    df_urls : pandas.DataFrame
        Frame with a ``url`` column; missing values in it are ignored.
    timeout : float, optional
        Seconds to wait for the API before giving up (default 10). Without
        a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    object or None
        The decoded JSON response body, or ``None`` when there is no URL to
        look up, the request fails, or the response is not valid JSON.
    """
    urls = df_urls['url'].dropna().astype(str).tolist()
    if not urls:
        # Nothing to ask for; do not spend API quota on an empty request.
        return None

    # Only content negotiation headers are sent. A hard-coded session cookie
    # belonging to an unrelated site used to be included here; the API is
    # authenticated by the key in the URL, so it has been dropped.
    headers = {'accept': 'application/json',
               'content-type': 'application/json'}

    body = {'lookupStrategy': 'IN_INDEX_DOC',
            'urls': urls}

    try:
        r = requests.post(AMP_BATCHGET_URL, data=json.dumps(body),
                          headers=headers, timeout=timeout)
        return json.loads(r.text)
    except requests.exceptions.RequestException:
        return None
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return None
    
    

In [7]:
# Load the raw media-source list and add a 'cc' column obtained by passing
# each row's 'country' value through `mapper`.
df_domains = pd.read_csv('data/mediasources.csv', encoding='latin1')
df_domains['cc'] = df_domains['country'].map(mapper)

In [8]:
# Keep only the name, link and country-code columns, discard rows with
# missing values and exact duplicates, then save the result pipe-separated.
df_domains = (
    df_domains
    .loc[:, ['name', 'link', 'cc']]
    .dropna()
    .drop_duplicates()
)
df_domains.to_csv('data/domains.csv', sep='|', encoding='utf-8', index=False)

### Get links from ABYZNEWS for each country

In [9]:
# Create an empty frame with the expected columns and write it out, which
# leaves data/links.csv containing only the header row.
df_links = pd.DataFrame(
    columns=[
        'cc',
        'name',
        'domain',
        'url',
        'ori_amp_url',
        'amp_viewer_url',
        'amp_cdn_url',
    ]
)
df_links.to_csv('data/links.csv', sep=',', encoding='utf-8', index=False)

In [52]:
# Reload the domain list from disk, drop duplicate rows and derive each
# source's network location from its link.
# NOTE(review): this reads data/domains2.csv, while the cleaning cell writes
# data/domains.csv - confirm which file is intended.
df_domains = pd.read_csv('data/domains2.csv', sep='|', encoding='utf-8').drop_duplicates()
df_domains['domain'] = df_domains['link'].map(getDomain)
# Optional shuffle of the rows (disabled):
#df_domains = df_domains.sample(frac=1)

In [11]:
#df_domains = df_domains.drop_duplicates()
#len(df_domains.domain.drop_duplicates())

28155

In [60]:
# Crawl every source once and append up to `sample_size` article links per
# domain to data/links3.csv as quoted, pipe-separated rows:
#   "cc"|"name"|"domain"|"url"
sample_size = 10
domains = set()  # domains already handled; a set gives O(1) membership tests

# Append mode keeps the rows written by earlier runs. The `with` block
# guarantees the file is closed even if a request or write raises.
with open("data/links3.csv", "a", encoding="utf-8", newline="") as f:
    #f.write("cc|name|domain|url\n")

    # csv.writer reproduces the previous hand-built format and additionally
    # escapes quote characters that occur inside a value.
    writer = csv.writer(f, delimiter="|", quoting=csv.QUOTE_ALL,
                        lineterminator="\n")

    for _, row in df_domains.iterrows():
        url = row['link']

        # Skip source URLs that carry query strings, fragments or list
        # separators.
        symbols = sum(url.count(ch) for ch in '?#&=,;')
        if symbols > 0:
            continue

        # Only the first URL of each domain is crawled. Checking this before
        # fetching avoids a network request for every repeated domain.
        domain = row['domain']
        if domain in domains:
            continue
        domains.add(domain)

        links = getArticleLinks(url)
        if links is None or links.size < 1:
            continue

        if len(links) <= sample_size:
            sample_links = links
        else:
            sample_links = links.sample(sample_size)

        #print("Domain="+domain + "   Length=" + str(len(sample_links)))

        # Iterate the values directly: Series.iteritems() no longer exists
        # in pandas 2, and the index is not needed here.
        for article_url in sample_links:
            writer.writerow([str(row['cc']),
                             str(row['name']),
                             str(domain),
                             str(article_url)])

        # Flush after each source so partial results survive an interruption.
        f.flush()

#         df_links = df_links.append({'name': row['name'], 
#                              'cc': row['cc'],
#                              'domain': domain,
#                              'url': url,
#                              'ori_amp_url': None, 
#                              'amp_viewer_url': None, 
#                              'amp_cdn_url' : None}, 
#                             ignore_index=True)
    
#     df_links.to_csv('data/links.csv', mode='a', header=False, index=False)

In [None]:
# Display the sources whose country code is 'CA'.
df_domains[df_domains['cc'] == 'CA']

In [11]:
# Load the collected links (semicolon-separated); rows that cannot be parsed
# are skipped instead of aborting the read.
# NOTE(review): `error_bad_lines` is deprecated since pandas 1.3 and removed
# in 2.0 (replacement: on_bad_lines='skip') - confirm the pandas version.
df = pd.read_csv(
    'data/links2.csv',
    sep=';',
    encoding='utf-8',
    error_bad_lines=False,
)

In [12]:
# Number of link rows loaded.
df.shape[0]

164

In [13]:
# Break the links into batches of 50 rows, one batch per API request.
# NOTE(review): 50 is presumably the batchGet per-request URL limit - confirm.
chunks = split_dataframe(df, chunk_size=50)

In [264]:
# Probe the API with a single batch before running the full loop.
# The first batch is used: the previous index (100) is out of range for the
# handful of 50-row batches produced from the loaded links.
res = getAMPUrls(chunks[0])

In [None]:
# Show the raw lookup result held in `res`.
print(res)

In [14]:
# Look up the AMP version of every collected link, batch by batch, and append
# the results to data/amp_urls1.csv as quoted, semicolon-separated rows:
#   "originalUrl";"ampUrl";"cdnAmpUrl"
# URLs for which the API reports an error are written with empty AMP fields.
with open("data/amp_urls1.csv", "a", encoding="utf-8", newline="") as f:
    #f.write("originalUrl;ampUrl;cdnAmpUrl\n")

    # csv.writer reproduces the previous hand-built format and additionally
    # escapes quote characters that occur inside a value.
    writer = csv.writer(f, delimiter=";", quoting=csv.QUOTE_ALL,
                        lineterminator="\n")

    for chunk in chunks:
        res = getAMPUrls(chunk)

        # An API error body looks like {'error': {'code': 429, ...}} when the
        # quota is exhausted: wait, then retry this batch once.
        error = res.get('error') if isinstance(res, dict) else None
        if isinstance(error, dict) and error.get('code') == 429:
            print("Resources exceeded")
            time.sleep(100)
            res = getAMPUrls(chunk)

        # Request failed (None) or the body is not a JSON object: skip batch.
        if not isinstance(res, dict):
            continue

        urls = res.get('ampUrls') or []
        errors = res.get('urlErrors') or []

        for u in list(urls) + list(errors):
            # Error entries carry no ampUrl/cdnAmpUrl; write empty fields.
            writer.writerow([u.get('originalUrl', ''),
                             u.get('ampUrl', ''),
                             u.get('cdnAmpUrl', '')])

        # Flush after each batch so partial results survive an interruption.
        f.flush()

In [255]:
# Decode the JSON text held in `test` into Python objects.
# NOTE(review): `test` is not assigned in any visible cell, so this only runs
# with leftover kernel state - confirm where it comes from (presumably a raw
# API response string).
a = json.loads(test)

In [257]:
# Display the code stored under 'error' -> 'code' in the decoded object.
a['error']['code']

429

In [30]:
# Reload the collected links (semicolon-separated, UTF-8).
# A stray positional `codec` argument after the keyword arguments made this
# cell a SyntaxError (and `codec` is not defined anywhere); it is removed.
df = pd.read_csv("data/links2.csv", sep=";", encoding='utf8')

In [31]:
# Take the URL at row position 162 and trim surrounding whitespace.
url = df['url'].iloc[162].strip()

In [40]:
import urllib3

AttributeError: module 'urllib3' has no attribute 'unquote'