# DS 3010 Final Project: Data Gathering & Cleaning

This notebook covers the data gathering and preprocessing stage of the data science lifecycle for the project. The team is using an open dataset about businesses from Yelp, which Yelp provides as a JSON file. The decisions made during the preprocessing stage must support the project's goal of helping entrepreneurs and business owners find optimal locations to open a new restaurant.

In [None]:
!pip install pandas numpy



In [None]:
import pandas as pd
import numpy as np

## Data Gathering

In [None]:
import json

def gather_data(toNormalize=False, path='business.json'):
  '''
  Loads business data from a JSON Lines file into a Pandas dataframe

  Parameters:
    toNormalize: if True, each line is parsed with json.loads and nested objects are flattened with pd.json_normalize; if False (default), the file is read with pd.read_json.
    path: location of the JSON Lines file. Defaults to 'business.json'.

  Returns a Pandas dataframe with one row per JSON record in the file.
  '''
  if not toNormalize:
    return pd.read_json(path, lines=True, orient='records')

  # Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding
  with open(path, encoding='utf-8') as file:
    # Skip blank lines: json.loads raises on an empty string
    data = [json.loads(line) for line in file if line.strip()]
  return pd.json_normalize(data)

# Load the business records without flattening nested fields, then preview the first rows
df = gather_data()
df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [None]:
# Dimensions of the data set as (number of rows, number of columns)
df.shape

(150346, 14)

In [None]:
# Columns, their non-null counts, and their data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float64
 7   longitude     150346 non-null  float64
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int64  
 10  is_open       150346 non-null  int64  
 11  attributes    136602 non-null  object 
 12  categories    150243 non-null  object 
 13  hours         127123 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 16.1+ MB


## Data Cleaning

In [None]:
# Keep only the features related to location and performance.
# errors='ignore' lets this cell be re-run after the columns have already been dropped.
df = df.drop(columns=['business_id', 'attributes', 'hours'], errors='ignore')
df.head()

Unnamed: 0,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
0,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,"Doctors, Traditional Chinese Medicine, Naturop..."
1,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,"Shipping Centers, Local Services, Notaries, Ma..."
2,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"Department Stores, Shopping, Fashion, Home & G..."
3,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
4,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"Brewpubs, Breweries, Food"


In [None]:
# Count the null values (NaN) in each column
df.isnull().sum()

Unnamed: 0,0
name,0
address,0
city,0
state,0
postal_code,0
latitude,0
longitude,0
stars,0
review_count,0
is_open,0


In [None]:
# Drop records with missing categories; subset= limits the drop to that column only
df = df.dropna(subset=['categories'])
df.shape

(150243, 11)

In [None]:
# Count duplicated rows with the vectorized Series.sum (int() keeps a plain-integer display)
int(df.duplicated().sum())

0

In [None]:
# Filter for restaurants that are still open.
# na=False treats a missing categories value as "not a restaurant" instead of leaving NaN in the mask.
# NOTE(review): the substring match keeps every record whose categories contain 'Restaurant' anywhere,
# not only an exact 'Restaurants' category — confirm this is intended.
is_restaurant = df['categories'].str.contains('Restaurant', regex=False, na=False)
is_open = df['is_open'] == 1  # is_open is an integer flag, so compare against 1
df = df[is_restaurant & is_open]
df.head()

Unnamed: 0,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
3,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
5,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"Burgers, Fast Food, Sandwiches, Food, Ice Crea..."
9,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,1,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,..."
11,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"Vietnamese, Food, Restaurants, Food Trucks"
12,Denny's,8901 US 31 S,Indianapolis,IN,46227,39.637133,-86.127217,2.5,28,1,"American (Traditional), Restaurants, Diners, B..."


## Data Storage

In [None]:
!pip install pymongo dnspython



In [None]:
from urllib.parse import quote_plus

from pymongo import MongoClient
import config

# Get MongoDB credentials.
# Percent-escape them so reserved characters such as '@', ':' or '/' in the username or
# password cannot break the connection URI.
# NOTE(review): assumes config stores the raw (not already escaped) credentials — confirm.
mongo_username = quote_plus(config.mongo_username)
mongo_password = quote_plus(config.mongo_password)
uri = f'mongodb+srv://{mongo_username}:{mongo_password}@ds3010.zpoyweh.mongodb.net/?retryWrites=true&w=majority&appName=ds3010'

# Connect to MongoDB
client = MongoClient(uri)

# Ping MongoDB instance to ensure a connection
try:
  client.admin.command('ping')
  print('Pinged your deployment. You successfully connected to MongoDB!')
except Exception as e:
  # Best-effort check: report the failure without stopping the notebook
  print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [None]:
def upload_data(restaurants, mongo_client=None):
  '''
  Uploads data about businesses to MongoDB

  Inserts the records into the 'Restaurants' collection of the 'Businesses' database,
  unless that collection already exists, in which case nothing is written.

  Parameters:
    restaurants: iterable of dict records to insert.
    mongo_client: MongoDB client to use. Defaults to the notebook-level `client`.
  '''
  if mongo_client is None:
    mongo_client = client

  # Bail out on empty input before creating the collection: otherwise an empty
  # 'Restaurants' collection would be left behind and later runs would skip the upload
  restaurants = list(restaurants)
  if not restaurants:
    print("No restaurant data to upload")
    return

  # Access database
  db = mongo_client['Businesses']

  # Check if restaurants collection already exists
  if "Restaurants" in db.list_collection_names():
    print("Restaurant data are already stored in MongoDB")
    return

  # Upload data
  restaurants_collection = db.create_collection('Restaurants')
  result = restaurants_collection.insert_many(restaurants)
  print(f"Uploaded {len(result.inserted_ids)} restaurants to MongoDB")

# Turn each dataframe row into a dict record, then hand the records to the uploader
restaurant_records = df.to_dict(orient='records')
upload_data(restaurant_records)

Uploaded 35004 restaurants to MongoDB
