In [2]:
import pandas as pd
from random import randint
import urllib3
from bs4 import BeautifulSoup
import shutil
import os
from time import sleep
import uuid

In [3]:
# Scrape the infoplease "major cities" page (latitude, longitude, time zone).
# pd.read_html returns a list with one DataFrame per HTML table it finds.
largest_cities = pd.read_html("https://www.infoplease.com/world/geography/major-cities-latitude-longitude-and-corresponding-time-zones")

In [4]:
# Keep only the first table found on the page; it is used as the city list from here on.
df = largest_cities[0]

In [5]:
# Replace the scraped header with short names. lat1/log1 hold whole degrees,
# lat2/log2 hold a 'minutes hemisphere' string; 'log*' are the longitude columns.
# city_row_to_coordinates reads these columns by position, not by name.
df.columns = ['City','lat1','lat2','log1','log2','time']

In [6]:
# Schema check: the degree columns should parse as integers while the
# 'minutes hemisphere' columns stay as strings (object dtype).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   City    120 non-null    object
 1   lat1    120 non-null    int64 
 2   lat2    120 non-null    object
 3   log1    120 non-null    int64 
 4   log2    120 non-null    object
 5   time    120 non-null    object
dtypes: int64(2), object(4)
memory usage: 5.8+ KB


In [7]:
def city_row_to_coordinates(row):
  """Turn one scraped city row into a name plus signed decimal-degree coordinates.

  Args:
    row: Positional sequence laid out as (city name, latitude degrees,
      latitude 'minutes hemisphere', longitude degrees,
      longitude 'minutes hemisphere', ...). Later entries are ignored.

  Returns:
    Tuple (name, lat, lon). lat is positive for hemisphere 'N' and negative
    otherwise; lon is positive for hemisphere 'W' and negative otherwise.
  """
  def signed_degrees(degrees_field, minutes_field, positive_letter):
    # minutes_field looks like '28 N': arc minutes, one space, hemisphere letter.
    degrees = int(degrees_field)
    minutes_text, letter = minutes_field.split(' ')
    magnitude = degrees + int(minutes_text)/60
    if letter == positive_letter:
      return magnitude
    return -1*magnitude

  city = row[0]
  latitude = signed_degrees(row[1], row[2], 'N')
  longitude = signed_degrees(row[3], row[4], 'W')  # west counts as positive here
  return city, latitude, longitude


def draw_coordinates(df):
  """Pick one random row of the city table and convert it to coordinates.

  Args:
    df: City DataFrame whose columns follow the positional layout expected
      by city_row_to_coordinates.

  Returns:
    Tuple (name, lat, lon) as produced by city_row_to_coordinates.
  """
  sampled = df.sample(n=1)
  # .values gives a 2-D array; its only row is the sampled city.
  picked_row = sampled.values[0]
  return city_row_to_coordinates(picked_row)

In [8]:
# Smoke test: two independent random draws, each printed as
# (city name, latitude, longitude) in signed decimal degrees.
print(draw_coordinates(df))
print(draw_coordinates(df))

('Bristol, England', 51.46666666666667, 2.5833333333333335)
('Marseilles, France', 43.333333333333336, -5.333333333333333)


In [9]:
def prepare_random_link(df, size=200, alt=384400):
  """Build a fourmilab viewer URL for a random city and a random UTC moment.

  Args:
    df: City table handed to draw_coordinates.
    size: Value for the imgsize query parameter.
    alt: Value for the alt query parameter; the default 384400 is the
      average Earth-Moon distance in km.

  Returns:
    The request URL as a string.
  """
  _city, lat, lon = draw_coordinates(df)

  # Days are drawn from 1-28 only (presumably so any month/day pair is a valid
  # date). Days of the month are therefore not uniformly covered, which is
  # fine: what should be spread out is the moon phase, not the calendar day.
  year, month, day = randint(2000, 2019), randint(1, 12), randint(1, 28)
  hour, minute, second = randint(0, 23), randint(0, 59), randint(0, 59)
  utc_stamp = f'{year}-{month}-{day}+{hour}:{minute}:{second}'

  # NOTE(review): lat/lon are signed, yet the hemisphere parameters are fixed
  # to North/West - confirm the server interprets negative values as intended.
  return (
      'http://www.fourmilab.ch/cgi-bin/Earth'
      f'?imgsize={size}&opt=-l'
      f'&lat={lat:.4f}&ns=North'
      f'&lon={lon:.4f}&ew=West'
      f'&alt={alt}&img=LRO_100m.evif'
      f'&date=1&utc={utc_stamp}'
  )

In [10]:
# Smoke test: print two randomly generated viewer URLs.
print(prepare_random_link(df))
print(prepare_random_link(df))

http://www.fourmilab.ch/cgi-bin/Earth?imgsize=200&opt=-l&lat=-1.4667&ns=North&lon=48.4833&ew=West&alt=384400&img=LRO_100m.evif&date=1&utc=2007-11-8+7:53:39
http://www.fourmilab.ch/cgi-bin/Earth?imgsize=200&opt=-l&lat=43.1667&ns=North&lon=-132.0000&ew=West&alt=384400&img=LRO_100m.evif&date=1&utc=2019-5-27+16:18:23


In [11]:
def _moon_table_value(moon_data, label):
  """Return the 'value' cell of the first row whose 'col_name' equals label.

  Raises:
    IndexError: if no row carries that label.
  """
  matches = moon_data.loc[moon_data['col_name'] == label, 'value']
  # Positional access on purpose: a filtered row keeps its original index
  # label, so label-based [0] only works when it is the table's first row.
  return matches.values[0]


def download_data(link, data_folder='data'):
  """Fetch one viewer page, save the image it embeds and return its metadata.

  Args:
    link: URL of the page to fetch (as built by prepare_random_link).
    data_folder: Directory the image is written to; created if missing.

  Returns:
    Dict with keys 'filename' (random UUID-based .jpg name, relative to
    data_folder), 'age', 'phase', 'subtends' (cells read from the first
    table on the page) and 'link' (the URL that was requested).

  Raises:
    IndexError: if the table lacks a row for one of the expected labels.
  """
  http = urllib3.PoolManager()  # one pool serves both the page and the image
  r = http.request('GET', link)

  # The first table on the page is read as rows of (label, value, extra).
  moon_data = pd.read_html(r.data)[0]
  moon_data.columns = ['col_name', 'value', 'useless']

  age = _moon_table_value(moon_data, 'Age of Moon:')
  phase = _moon_table_value(moon_data, 'Phase:')
  subtends = _moon_table_value(moon_data, 'Moon subtends:')

  # The first <img> on the page is taken to be the rendered picture. Its src
  # is prefixed with the host, so it is expected to be a site-relative path.
  soup = BeautifulSoup(r.data, 'html.parser')
  img_link = 'http://www.fourmilab.ch' + soup.img.get("src")

  filename = str(uuid.uuid4()) + '.jpg'  # random filename

  os.makedirs(data_folder, exist_ok=True)
  target_path = os.path.join(data_folder, filename)

  # Stream the image straight to disk instead of loading it into memory.
  with http.request('GET', img_link, preload_content=False) as resp, open(target_path, 'wb') as out_file:
    shutil.copyfileobj(resp, out_file)

  return {'filename': filename,
          'age': age,
          'phase': phase,
          'subtends': subtends,
          'link': link}

  

In [12]:
def generate_data(df, n=10, data_folder='data'):
  """Download n random samples and append their metadata to data.csv.

  Images and the semicolon-separated data.csv both live in data_folder. The
  CSV header is written only when the file does not exist yet, so repeated
  calls keep extending the same dataset.

  Args:
    df: City table handed to prepare_random_link.
    n: Number of samples to download.
    data_folder: Output directory; created if missing.
  """
  os.makedirs(data_folder, exist_ok=True)

  data_file = os.path.join(data_folder, 'data.csv')
  columns = ('filename', 'age', 'phase', 'subtends', 'link')

  if not os.path.exists(data_file):
    with open(data_file, 'w') as f:
      f.write(';'.join(columns) + '\n')

  for i in range(n):
    print(f'downoading file no {i}')
    sleep(1)  # pause between requests
    random_link = prepare_random_link(df)
    # Forward data_folder so the image lands next to the CSV that references it.
    downloaded_data = download_data(random_link, data_folder)

    # Re-open per record so rows already fetched survive a later failure.
    # Fields are picked by header name (not dict order) and stringified so a
    # non-string cell cannot break the join.
    with open(data_file, 'a') as f:
      f.write(';'.join(str(downloaded_data[column]) for column in columns) + '\n')


In [13]:
cd "/content/drive/My Drive/Colab Notebooks/moon_phases"

/content/drive/My Drive/Colab Notebooks/moon_phases


In [14]:
# Download 10 samples into the default 'data' folder (relative to the current
# working directory). Each iteration sleeps 1 s and issues two HTTP requests.
generate_data(df, n=10)

downoading file no 0
downoading file no 1
downoading file no 2
downoading file no 3
downoading file no 4
downoading file no 5
downoading file no 6
downoading file no 7
downoading file no 8
downoading file no 9
