In [2]:
import pandas as pd
import numpy as np

In [3]:
# Use geopy's Nominatim geocoder (OpenStreetMap data, not Google Maps) to
# determine the latitude and longitude of all the locations.
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
# The user agent should identify this application to the Nominatim service
# rather than name an unrelated product.
geolocator = Nominatim(user_agent="weatherAUS-location-geocoding")

In [6]:
# Read the weather observations and convert the Date column to datetimes
data = pd.read_csv('weatherAUS.csv').assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)

In [9]:
# Percentage of missing values in each column, ordered from least to most missing
null_counts = data.isnull().sum()
percent_missing = null_counts * 100 / len(data)
missing_value_df = pd.DataFrame(
    {'column_name': data.columns, 'percent_missing': percent_missing}
).sort_values('percent_missing')

In [10]:
# Display the per-column missing-value percentages (sorted ascending)
missing_value_df

Unnamed: 0,column_name,percent_missing
Date,Date,0.0
Location,Location,0.0
MaxTemp,MaxTemp,0.866905
MinTemp,MinTemp,1.020899
Temp9am,Temp9am,1.214767
WindSpeed9am,WindSpeed9am,1.214767
Humidity9am,Humidity9am,1.824557
WindSpeed3pm,WindSpeed3pm,2.105046
Rainfall,Rainfall,2.241853
RainToday,RainToday,2.241853


In [7]:
# Count how many rows the dataset holds for each location
data['Location'].value_counts()

Canberra            3436
Sydney              3344
Hobart              3193
Melbourne           3193
Brisbane            3193
Perth               3193
Adelaide            3193
Darwin              3193
Albury              3040
Wollongong          3040
AliceSprings        3040
MountGambier        3040
Albany              3040
Townsville          3040
Cairns              3040
Launceston          3040
GoldCoast           3040
MountGinini         3040
Bendigo             3040
Ballarat            3040
Newcastle           3039
Tuggeranong         3039
Penrith             3039
Williamtown         3009
PerthAirport        3009
Mildura             3009
PearceRAAF          3009
Dartmoor            3009
NorfolkIsland       3009
Woomera             3009
MelbourneAirport    3009
Richmond            3009
Moree               3009
Portland            3009
Sale                3009
Nuriootpa           3009
Cobar               3009
Watsonia            3009
Witchcliffe         3009
CoffsHarbour        3009


In [4]:
# Inspect the distinct location names and how many of them there are
unique_locations = data['Location'].unique()
print(unique_locations, "\n")
print("Number of unique cities: ", len(unique_locations))

['Albury' 'BadgerysCreek' 'Cobar' 'CoffsHarbour' 'Moree' 'Newcastle'
 'NorahHead' 'NorfolkIsland' 'Penrith' 'Richmond' 'Sydney' 'SydneyAirport'
 'WaggaWagga' 'Williamtown' 'Wollongong' 'Canberra' 'Tuggeranong'
 'MountGinini' 'Ballarat' 'Bendigo' 'Sale' 'MelbourneAirport' 'Melbourne'
 'Mildura' 'Nhil' 'Portland' 'Watsonia' 'Dartmoor' 'Brisbane' 'Cairns'
 'GoldCoast' 'Townsville' 'Adelaide' 'MountGambier' 'Nuriootpa' 'Woomera'
 'Albany' 'Witchcliffe' 'PearceRAAF' 'PerthAirport' 'Perth' 'SalmonGums'
 'Walpole' 'Hobart' 'Launceston' 'AliceSprings' 'Darwin' 'Katherine'
 'Uluru'] 

Number of unique cities:  49


In [5]:
# Empty containers that will collect the latitude and longitude of every city
latitude, longitude = [], []

In [6]:
def findGeocode(city, max_attempts=3):
    """Look up ``city`` with the module-level ``geolocator``.

    Parameters
    ----------
    city : str
        Free-text place name passed to ``geolocator.geocode``.
    max_attempts : int, optional
        How many times to try the request when the geocoder times out
        (values below 1 are treated as 1).

    Returns
    -------
    Whatever ``geolocator.geocode`` returns; the calling loop treats ``None``
    as "location not found".

    Raises
    ------
    GeocoderTimedOut
        If every attempt times out.
    """
    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        try:
            return geolocator.geocode(city)
        except GeocoderTimedOut:
            # A single slow response should not abort the whole run; only give
            # up (and surface the error) once the last attempt has failed.
            if attempt == attempts:
                raise

In [7]:
# Search strings sent to the geocoder, one per dataset location and in the same
# order as data['Location'].unique(). Some names carry an ",Australia" suffix;
# a later cell strips that suffix again.
# NOTE(review): 'Nhil' mirrors the dataset's spelling; the town is presumably
# spelled 'Nhill' - confirm the geocoder resolves it to the intended place.
MapList = ['Albury','Badgerys Creek','Cobar','Coffs Harbour','Moree,Australia','Newcastle,Australia','Norah Head','Norfolk Island','Penrith,Australia','Richmond,Australia','Sydney','Sydney Airport','Wagga Wagga',
'Williamtown','Wollongong','Canberra','Tuggeranong','Mount Ginini','Ballarat','Bendigo','Sale,Australia','Melbourne Airport','Melbourne','Mildura','Nhil,Australia','Portland,Australia',
'Watsonia','Dartmoor,Australia','Brisbane','Cairns','GoldCoast,Australia','Townsville','Adelaide','Mount Gambier','Nuriootpa','Woomera','Albany,Australia','Witchcliffe','Pearce RAAF',
'Perth Airport','Perth','Salmon Gums','Walpole,Australia','Hobart','Launceston','Alice Springs,Australia','Darwin','Katherine','Uluru']

In [8]:
# Start from empty lists so that re-running this cell does not append a second
# copy of every coordinate.
latitude.clear()
longitude.clear()

for city in MapList:
    # Geocode each city exactly once; every call is a network request.
    loc = findGeocode(city)
    if loc is not None:
        # coordinates returned from the function are stored in two separate lists
        latitude.append(loc.latitude)
        longitude.append(loc.longitude)
    else:
        # if no coordinate is found for a city, insert NaN to mark the missing value
        latitude.append(np.nan)
        longitude.append(np.nan)

In [9]:
# Build a dataframe with one row per city and its geocoded coordinates.
# "City" stays a regular column (position 0): the former
# `Map_csv.set_index('City')` call discarded its result and had no effect.
Map_csv = pd.DataFrame(
    {"City": MapList, "Latitude": latitude, "Longitude": longitude}
)

In [10]:
# Inspect the geocoded latitude/longitude of each city
Map_csv

Unnamed: 0,City,Latitude,Longitude
0,Albury,-36.080477,146.91628
1,Badgerys Creek,-33.881667,150.744163
2,Cobar,-31.498333,145.834444
3,Coffs Harbour,-30.296241,153.113529
4,"Moree,Australia",-29.46172,149.840715
5,"Newcastle,Australia",-32.919295,151.779535
6,Norah Head,-33.281667,151.567778
7,Norfolk Island,-29.028958,167.958729
8,"Penrith,Australia",-33.751079,150.694168
9,"Richmond,Australia",-20.569655,142.92836


In [11]:
# "GoldCoast,Australia" is geocoded incorrectly (it picks up Goldcoast in
# San Diego, USA), so its coordinates are set manually.
# The row is selected by city name rather than by a hard-coded position, so the
# fix keeps targeting the right row if MapList changes. Both spellings are
# accepted so the cell still works after the ",Australia" suffix is stripped.
# NOTE(review): the displayed result for "Richmond,Australia" (-20.57, 142.93)
# also looks suspicious for this dataset - verify it is the intended Richmond.
gold_coast_rows = Map_csv["City"].isin(["GoldCoast,Australia", "GoldCoast"])
assert gold_coast_rows.sum() == 1, "expected exactly one Gold Coast row in Map_csv"
Map_csv.loc[gold_coast_rows, ["Latitude", "Longitude"]] = [-27.9539379, 153.089157]

In [12]:
# Drop the literal ",Australia" text from city names such as "XXXX,Australia"
Map_csv["City"] = Map_csv["City"].str.replace(',Australia', '', regex=False)
Map_csv

Unnamed: 0,City,Latitude,Longitude
0,Albury,-36.080477,146.91628
1,Badgerys Creek,-33.881667,150.744163
2,Cobar,-31.498333,145.834444
3,Coffs Harbour,-30.296241,153.113529
4,Moree,-29.46172,149.840715
5,Newcastle,-32.919295,151.779535
6,Norah Head,-33.281667,151.567778
7,Norfolk Island,-29.028958,167.958729
8,Penrith,-33.751079,150.694168
9,Richmond,-20.569655,142.92836


### Now we compute the distance between all the combinations of these locations

In [13]:
# Use the haversine formula to compute the great-circle distance between two points from their latitude and longitude coordinates

from math import sin, cos, sqrt, atan2, radians # import math tools

def separation(lat1, lon1, lat2, lon2, radius=6373.0):
    """Return the great-circle distance between two points on a sphere.

    Uses the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.
    radius : float, optional
        Radius of the sphere; the default is an approximate Earth radius in
        kilometres, so the result is in kilometres.

    Returns
    -------
    float
        Distance along the sphere's surface, in the units of ``radius``.
        NaN inputs yield NaN.
    """
    phi1 = radians(lat1)
    lam1 = radians(lon1)
    phi2 = radians(lat2)
    lam2 = radians(lon2)

    dlon = lam2 - lam1
    dlat = phi2 - phi1

    a = sin(dlat / 2)**2 + cos(phi1) * cos(phi2) * sin(dlon / 2)**2
    # Rounding can push `a` marginally outside [0, 1] (e.g. for near-antipodal
    # points), which would make sqrt() raise. Explicit comparisons are used
    # instead of min()/max() so that a NaN input still propagates as NaN.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius * c

In [14]:
from itertools import combinations  # yields every unordered pair of list elements

# One (city, latitude, longitude) tuple per location, in Map_csv row order.
# Carrying plain scalars avoids re-filtering the dataframe for every pair and
# avoids passing single-element Series into the math functions.
city_coords = list(
    Map_csv[["City", "Latitude", "Longitude"]].itertuples(index=False, name=None)
)

# Collect all rows first and build the dataframe once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
records = [
    {'Start': start, 'End': end, 'Distance(km)': separation(lat1, lon1, lat2, lon2)}
    for (start, lat1, lon1), (end, lat2, lon2) in combinations(city_coords, 2)
]
dist = pd.DataFrame(records, columns=['Start', 'End', 'Distance(km)'])

# The index keeps each pair's position in combination order; rows are sorted
# from the closest pair to the farthest before saving.
dist = dist.sort_values('Distance(km)')
dist.to_csv("Final_Distance.csv")
dist[0:20]

Unnamed: 0,Start,End,Distance(km)
1131,Perth Airport,Perth,10.200295
435,Sydney,Sydney Airport,11.044316
237,Newcastle,Williamtown,13.018746
615,Canberra,Tuggeranong,13.749246
54,Badgerys Creek,Penrith,15.242336
828,Melbourne,Watsonia,15.621495
798,Melbourne Airport,Melbourne,19.946071
802,Melbourne Airport,Watsonia,22.553459
1121,Pearce RAAF,Perth Airport,30.032594
648,Tuggeranong,Mount Ginini,31.366682
