Dataset was imported with:
mongoimport --type csv -d texas_missing -c cases --drop --headerline Missing.csv

In [22]:
# Import Dependencies
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
from geopy.geocoders import Nominatim

In [23]:
# Connect to the MongoDB server (27017 is MongoDB's default port)
mongo = MongoClient(port=27017)

# Geocoder client used further down to look up coordinates
geolocator = Nominatim(user_agent="testing")

In [24]:
# List every database on the server and point an arrow at the imported one
for name in mongo.list_database_names():
    marker = "<----------------" if name == "texas_missing" else " "
    print(name, marker)

admin  
classDB  
config  
epa  
fruits_db  
local  
met  
texas_missing <----------------
travel_db  
uk_food  


In [25]:
# Grab a handle on the imported database
db = mongo.texas_missing

# Show which collections it contains
collection_names = db.list_collection_names()
print(collection_names)

['cases']


In [26]:
# Handle on the `cases` collection, used by every query below
cases = db.cases

In [27]:
# Pretty-print a single document to inspect the field names and value formats
sample_case = cases.find_one()
pprint(sample_case)

{'': 3,
 'Age': '22 years old',
 'Classification': 'Missing',
 'Date of Birth': '11/14/1967 (56)',
 'Height and Weight': " 5'5, 110 pounds",
 'Missing From': ' Abilene, Texas',
 'Missing Since': '03/01/1989',
 'Person Name': 'Debra Marie Adams',
 'Race': 'White',
 'Sex': 'Female',
 '_id': ObjectId('658cf21eef708428cb9f0290')}


In [28]:
# Load every document of the collection into a DataFrame for a first look
case_documents = list(cases.find())
cases_df = pd.DataFrame(case_documents)
print("Rows for all cases:", cases_df.shape[0])
cases_df.head()

Rows for all cases: 966


Unnamed: 0,_id,Unnamed: 2,Person Name,Missing Since,Missing From,Classification,Sex,Race,Date of Birth,Age,Height and Weight
0,658cf21eef708428cb9f0290,3,Debra Marie Adams,03/01/1989,"Abilene, Texas",Missing,Female,White,11/14/1967 (56),22 years old,"5'5, 110 pounds"
1,658cf21eef708428cb9f0291,4,Michael Jefferson Adams,06/08/1987,"Abilene, Texas",Endangered Missing,Male,White,03/28/1969 (54),18 years old,"6'0 - 6'1, 145 - 150 pounds"
2,658cf21eef708428cb9f0292,0,Angela Abbrederis,04/02/1996,"Dallas, Texas",Endangered Missing,Female,"Asian, Biracial, White",10/25/1968 (55),27 years old,"5'7 - 5'9, 115 - 130 pounds"
3,658cf21eef708428cb9f0293,5,Louella Renee Addison,09/13/2019,"Austin, Texas",Endangered Missing,Female,Black,08/09/1966 (57),53 years old,"5'0, 120 - 150 pounds"
4,658cf21eef708428cb9f0294,6,Adolfo Santo Agramonte,06/20/2005,"McAllen, Texas",Migrant,Male,"Biracial, Black, Hispanic",,31 years old,"5'10 - 6'2, 180 pounds"


In [34]:
# Find all documents missing location.
# The regex is unanchored, so it matches any 'Missing From' value that
# contains the text "N/A".
query = {'Missing From': {'$regex': "N/A"}}

# Only the count is needed here, so no cursor is created.
print("Number of cases missing location:", cases.count_documents(query))

Number of cases missing location: 18


In [37]:
# Delete all documents missing location.
# The filter is spelled out inside this cell instead of being read from the
# shared `query` variable, so that re-running this cell never depends on
# whatever `query` currently holds in the kernel (an empty filter passed to
# delete_many would remove every document in the collection).
missing_location_filter = {'Missing From': {'$regex': "N/A"}}
cases.delete_many(missing_location_filter)

# Re-count with the same filter to confirm nothing matching is left.
print("Number of cases missing location:", cases.count_documents(missing_location_filter))

Number of cases missing location: 0


In [52]:
# Smaller dataframe containing id's and locations.
# Dedicated names are used instead of rebinding `query`, so the
# missing-location filter defined earlier is not silently replaced by an
# empty (match-everything) filter in the kernel.
all_cases_filter = {}
location_fields = {'_id': 1, 'Missing From': 1}

# Materialise the cursor before building the frame.
location_documents = list(cases.find(all_cases_filter, location_fields))
coord_df = pd.DataFrame(location_documents)
print("length of clean dataframe:", len(coord_df))
coord_df.head()

length of clean dataframe: 948


Unnamed: 0,_id,Missing From
0,658cf21eef708428cb9f0290,"Abilene, Texas"
1,658cf21eef708428cb9f0291,"Abilene, Texas"
2,658cf21eef708428cb9f0292,"Dallas, Texas"
3,658cf21eef708428cb9f0293,"Austin, Texas"
4,658cf21eef708428cb9f0294,"McAllen, Texas"


In [54]:
# Find lat and lon for each row in "Missing From" using Geocode.
#
# Each distinct location string is geocoded only once and the result is reused
# for every row that shares it, which avoids repeating identical network
# requests for the same city.
# NOTE(review): Nominatim is a rate-limited public service; consider wrapping
# the call with geopy's RateLimiter if lookups start failing - confirm against
# the geopy documentation.
coords_by_location = {}   # location string -> (latitude, longitude)
failed_locations = []     # locations that could not be resolved

for location in coord_df['Missing From'].unique():
    try:
        location_info = geolocator.geocode(location)
    except Exception:
        # Best-effort lookup: a failed request leaves the coordinates empty.
        # `Exception` rather than a bare `except` so that interrupting the
        # kernel (KeyboardInterrupt) still stops this long-running loop.
        location_info = None

    if location_info is None:
        # geocode() returns None when the service finds no match.
        coords_by_location[location] = (None, None)
        failed_locations.append(location)
    else:
        coords_by_location[location] = (location_info.latitude, location_info.longitude)

# Expand the per-location results back to one entry per row, in row order.
lat = []
lon = []
for location in coord_df['Missing From']:
    latitude, longitude = coords_by_location.get(location, (None, None))
    lat.append(latitude)
    lon.append(longitude)

coord_df['latitude'] = lat
coord_df['longitude'] = lon

# Surface failures instead of hiding them behind empty coordinates.
if failed_locations:
    print("Locations that could not be geocoded:", len(failed_locations))

In [61]:
# Apply lat and lon to clean DF.
# The coordinates are joined on `_id` rather than attached by list position:
# this frame comes from a separate query, so matching by key keeps every case
# paired with its own coordinates even if the row order or row count differs
# from the frame the coordinates were computed for.
case_coordinates = coord_df[['_id', 'latitude', 'longitude']].rename(
    columns={'latitude': 'Latitude', 'longitude': 'Longitude'}
)
coordinates_df = pd.DataFrame(list(cases.find())).merge(
    case_coordinates,
    on='_id',
    how='left',              # keep every case, even one without coordinates
    validate='one_to_one',   # fail loudly if an `_id` is duplicated on either side
)
print("Rows for all cases:", len(coordinates_df))
coordinates_df.head()

Rows for all cases: 948


Unnamed: 0,_id,Unnamed: 2,Person Name,Missing Since,Missing From,Classification,Sex,Race,Date of Birth,Age,Height and Weight,Latitude,Longitude
0,658cf21eef708428cb9f0290,3,Debra Marie Adams,03/01/1989,"Abilene, Texas",Missing,Female,White,11/14/1967 (56),22 years old,"5'5, 110 pounds",32.44645,-99.747591
1,658cf21eef708428cb9f0291,4,Michael Jefferson Adams,06/08/1987,"Abilene, Texas",Endangered Missing,Male,White,03/28/1969 (54),18 years old,"6'0 - 6'1, 145 - 150 pounds",32.44645,-99.747591
2,658cf21eef708428cb9f0292,0,Angela Abbrederis,04/02/1996,"Dallas, Texas",Endangered Missing,Female,"Asian, Biracial, White",10/25/1968 (55),27 years old,"5'7 - 5'9, 115 - 130 pounds",32.776272,-96.796856
3,658cf21eef708428cb9f0293,5,Louella Renee Addison,09/13/2019,"Austin, Texas",Endangered Missing,Female,Black,08/09/1966 (57),53 years old,"5'0, 120 - 150 pounds",30.271129,-97.7437
4,658cf21eef708428cb9f0294,6,Adolfo Santo Agramonte,06/20/2005,"McAllen, Texas",Migrant,Male,"Biracial, Black, Hispanic",,31 years old,"5'10 - 6'2, 180 pounds",26.204114,-98.23006


In [64]:
# Write the cases with their coordinates to disk, leaving out the row index
output_path = "coordinates.csv"
coordinates_df.to_csv(output_path, index=False)