In [17]:
import pandas as pd
from nltk.metrics.distance import edit_distance, binary_distance
from tqdm import tqdm_notebook as tqdm

In [2]:
# Reference table of correctly spelt city names.
correct_cities_path = './Correct_cities.csv'
correct_df = pd.read_csv(correct_cities_path)

In [3]:
# Table of misspelt city names that need to be matched.
misspelt_cities_path = './Misspelt_cities.csv'
incorrect_df = pd.read_csv(misspelt_cities_path)

In [4]:
# Peek at the first two rows of the reference table.
correct_df.iloc[:2]

Unnamed: 0,name,country,id
0,les Escaldes,Andorra,3040051
1,Andorra la Vella,Andorra,3041563


In [5]:
# Peek at the first two rows of the misspelt table.
incorrect_df.iloc[:2]

Unnamed: 0,misspelt_name,country
0,Hfjdúszoposzló,Hungary
1,Otrajnyy,Russia


.

**Sorting the DataFrames according to Country names**

In [6]:
# Order the reference rows by country; the name is rebound to the sorted frame
# rather than mutating the existing one in place.
correct_df = correct_df.sort_values(by='country')

In [7]:
# Order the misspelt rows by country; the name is rebound to the sorted frame
# rather than mutating the existing one in place.
incorrect_df = incorrect_df.sort_values(by='country')

**Checking for Duplicate City Names**

In [8]:
# Dimensions of the reference table as a (rows, columns) tuple.
correct_df.shape

(23018, 3)

In [9]:
# Number of distinct values in the 'name' column, to compare against the row count.
correct_df['name'].nunique()

21940

i.e., out of 23018 cities, there are only 21940 unique city names. <br>

In [10]:
# Number of rows whose city name already appears on another row: total rows
# minus distinct names. Derived from the frame itself instead of hand-copied
# literals, so the figure stays correct if the input file changes.
total_duplicates = len(correct_df) - correct_df['name'].nunique()
total_duplicates

1078

In [11]:
# First five rows whose name was already seen on an earlier row.
is_repeated_name = correct_df['name'].duplicated()
correct_df.loc[is_repeated_name].head(5)

Unnamed: 0,name,country,id
163,Mercedes,Argentina,3430709
139,San Pedro,Argentina,3428577
218,Santo Tomé,Argentina,3835793
493,Cranbourne,Australia,2170078
479,Epping,Australia,2167279


In [13]:
# Every reference row named exactly 'Mercedes'.
correct_df.loc[correct_df['name'].eq('Mercedes')]

Unnamed: 0,name,country,id
162,Mercedes,Argentina,3430708
163,Mercedes,Argentina,3430709
4047,Mercedes,Costa Rica,3622881
15567,Mercedes,Philippines,1699833
22447,Mercedes,Uruguay,3441684


There are many duplicate city names; some countries even have more than one city with the same name. <br>
E.g., there are 2 cities named **Mercedes** in **Argentina**, and likewise 2 cities named **San Pedro** in **Argentina**.

In [14]:
# Every reference row named exactly 'San Pedro'.
correct_df.loc[correct_df['name'].eq('San Pedro')]

Unnamed: 0,name,country,id
226,San Pedro,Argentina,3836772
139,San Pedro,Argentina,3428577
4029,San Pedro,Costa Rica,3621717
14150,San Pedro,Mexico,3985129
15457,San Pedro,Philippines,1688749
21977,San Pedro,United States,5392528


In [15]:
# Every reference row named exactly 'Barcelona'.
correct_df.loc[correct_df['name'].eq('Barcelona')]

Unnamed: 0,name,country,id
6415,Barcelona,Spain,3128760
22655,Barcelona,Venezuela,3648559


In [16]:
# Every reference row named exactly 'Valencia'.
correct_df.loc[correct_df['name'].eq('Valencia')]

Unnamed: 0,name,country,id
15392,Valencia,Philippines,1680116
5921,Valencia,Spain,2509954
22569,Valencia,Venezuela,3625549


## Naive Approach

##### Naive Approach is to sort cities by their country names and find the correct city name with the least edit_distance

This is an exhaustive search approach: for each misspelt name it calculates the Levenshtein edit distance to every correct city name of the same country. <br>
But even with this approach, since a country can have two or more cities with the same name, 100% accuracy is not guaranteed.

This block of code takes ~20 mins to run

In [18]:
# Match every misspelt city to the correctly spelt city of the same country
# with the smallest edit distance.
#
# Produces two lists aligned with the rows of `incorrect_df`:
#   correct_city_list - best-matching city name per row
#   correct_city_id   - id of that city per row
# Rows whose country has no entry in `correct_df` get None in both lists
# rather than silently inheriting the previous row's match.

# Group the candidate (name, id) pairs by country once, instead of re-filtering
# the whole reference frame for every misspelt row. Row order inside each
# group follows `correct_df`, so ties are still won by the first candidate.
candidates_by_country = {
    country: list(zip(group['name'], group['id']))
    for country, group in correct_df.groupby('country', sort=False)
}

correct_city_list = []
correct_city_id = []
for incorrect_name, incorrect_country in tqdm(
    zip(incorrect_df['misspelt_name'], incorrect_df['country']),
    total=incorrect_df.shape[0],
):
    # Reset the best match for each row so nothing leaks between iterations.
    min_dist = None
    correct_city = None
    city_id = None
    for city, candidate_id in candidates_by_country.get(incorrect_country, ()):
        dist = edit_distance(city, incorrect_name)
        # Strict comparison: on equal distance the earlier candidate is kept.
        if min_dist is None or dist < min_dist:
            min_dist = dist
            correct_city = city
            city_id = candidate_id

    correct_city_list.append(correct_city)
    correct_city_id.append(city_id)

HBox(children=(IntProgress(value=0, max=23018), HTML(value='')))




In [19]:
# Sanity check: number of ids collected by the matching loop.
len(correct_city_id)

23018

In [20]:
# Attach the matched city ids as a new 'id' column.
incorrect_df = incorrect_df.assign(id=correct_city_id)

In [21]:
# Attach the matched city names as a new 'correct_name' column.
incorrect_df = incorrect_df.assign(correct_name=correct_city_list)

In [22]:
# Preview the first five matched rows.
incorrect_df.iloc[:5]

Unnamed: 0,misspelt_name,country,id,correct_name
13609,Shīnḏfnḏ,Afghanistan,1125155,Shīnḏanḏ
10009,Maymanp,Afghanistan,1133453,Maymana
17068,Seahrak,Afghanistan,1125896,Shahrak
2932,Pcghmān,Afghanistan,1131316,Paghmān
9862,Sor-e Pul,Afghanistan,1127110,Sar-e Pul


In [24]:
# Persist the matched table without writing the row index.
output_path = './naive_solved.csv'
incorrect_df.to_csv(output_path, index=False)