# Importing and preparing supermarkets data

## Libraries and settings

In [29]:
# Libraries
import os
import fnmatch
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

## Importing data

In [30]:
# Get current working directory
print('Current working directory:', os.getcwd())

# Show files in the directory
flist = fnmatch.filter(os.listdir('.'), '*.json')
for i in flist:
    print(i)

# Read the data to a pandas data frame
df1 = pd.read_json('supermarkets_data.json', encoding='utf-8')
df1.head(5)

Current working directory: C:\Workspacezhaw\data analytics\Woche 2
supermarkets_data.json


Unnamed: 0,type,id,lat,lon,tags
0,node,33126515,47.155616,9.037915,"{'brand': 'Spar', 'brand:wikidata': 'Q610492',..."
1,node,36726161,47.226191,8.980329,"{'addr:city': 'Uznach', 'addr:housenumber': '2..."
2,node,39768209,47.225069,8.969981,"{'addr:city': 'Uznach', 'addr:postcode': '8730..."
3,node,39947904,47.376732,8.542161,"{'addr:city': 'Zürich', 'addr:country': 'CH', ..."
4,node,48932835,47.37502,8.522895,"{'addr:city': 'Zürich', 'addr:housenumber': '7..."


## Count number of rows and columns in the data frame

In [31]:
# Dimension (rows, columns)
print('Dimension:', df1.shape)

# Number of rows
print('Number of rows:', df1.shape[0])

# Number of columns
print('Number of columns:', df1.shape[1])

Dimension: (3260, 5)
Number of rows: 3260
Number of columns: 5


## Column 'tags' is a pandas Series with dictionaries -> change to data frame

In [32]:
# Type of the first item of column tags
print(type(df1.tags))
print(type(df1.tags[0]))

# Content of the first item of column tags
print(df1.tags[0].keys())

# Change to data frame
df2 = pd.DataFrame.from_records(df1.tags)
df2[['brand', 'shop', 'addr:city', 'addr:housenumber', 'addr:postcode']]

<class 'pandas.core.series.Series'>
<class 'dict'>
dict_keys(['brand', 'brand:wikidata', 'brand:wikipedia', 'name', 'opening_hours', 'shop'])


Unnamed: 0,brand,shop,addr:city,addr:housenumber,addr:postcode
0,Spar,supermarket,,,
1,Migros,supermarket,Uznach,25,8730
2,Coop,supermarket,Uznach,,8730
3,Coop,supermarket,Zürich,1,8001
4,Migros,supermarket,Zürich,7,8004
...,...,...,...,...,...
3255,Volg,supermarket,,,
3256,Landi,supermarket,Rickenbach Sulz,1,8545
3257,VOI,supermarket,,,
3258,,supermarket,,,


## Join df1 and df2

In [54]:
# Merge df and df2
# Include opening hours as additional variable in the data frame.
df = pd.merge(df1[['type', 'id', 'lat', 'lon']], 
              df2[['brand', 'shop', 'addr:city', 'addr:housenumber', 'addr:postcode', 'opening_hours']], 
              left_index=True, 
              right_index=True)
df.head(5)

Unnamed: 0,type,id,lat,lon,brand,shop,addr:city,addr:housenumber,addr:postcode,opening_hours
0,node,33126515,47.155616,9.037915,Spar,supermarket,,,,Mo-Th 08:00-19:00; Fr 08:00-20:00; Sa 08:00-17:00
1,node,36726161,47.226191,8.980329,Migros,supermarket,Uznach,25.0,8730.0,"Mo-Th 08:00-19:00, Fr 08:00-20:00, Sa 07:30-17..."
2,node,39768209,47.225069,8.969981,Coop,supermarket,Uznach,,8730.0,
3,node,39947904,47.376732,8.542161,Coop,supermarket,Zürich,1.0,8001.0,Mo-Sa 06:00-22:00
4,node,48932835,47.37502,8.522895,Migros,supermarket,Zürich,7.0,8004.0,Mo-Sa 08:00-21:00; PH off


## Count and identify the number of missing values (if any)

In [34]:
# Count missing values
print(pd.isna(df).sum())

# Identify rows with missing values, e.g.:
df.loc[pd.isna(df['addr:city'])]

type                   0
id                     0
lat                    0
lon                    0
brand               1246
shop                   0
addr:city           1808
addr:housenumber    1702
addr:postcode       1733
dtype: int64


Unnamed: 0,type,id,lat,lon,brand,shop,addr:city,addr:housenumber,addr:postcode
0,node,33126515,47.155616,9.037915,Spar,supermarket,,,
5,node,60271452,47.406671,9.305450,,supermarket,,,
6,node,70656485,47.491253,8.733981,,supermarket,,,
10,node,81321513,47.532917,9.066408,Landi,supermarket,,,
13,node,95582038,47.050385,9.059214,,supermarket,,,
...,...,...,...,...,...,...,...,...,...
3253,node,9947587529,47.000852,8.611420,,supermarket,,,
3255,node,9963973121,47.321761,9.426943,Volg,supermarket,,,
3257,node,9975876019,46.011774,8.965955,VOI,supermarket,,,
3258,node,9978766657,47.424524,7.126737,,supermarket,,,


## Count and identify duplicated values (if any)

In [35]:
# Count duplicated values
print(df.duplicated().sum())

# Identify rows with duplicated values, e.g.:
df[df[['id']].duplicated()]

0


Unnamed: 0,type,id,lat,lon,brand,shop,addr:city,addr:housenumber,addr:postcode


## Get data types of all variables

In [36]:
# Get data types (note that in pandas, a string is referred to as 'object')
df.dtypes

type                 object
id                    int64
lat                 float64
lon                 float64
brand                object
shop                 object
addr:city            object
addr:housenumber     object
addr:postcode        object
dtype: object

### Save data to file

In [37]:
df.to_csv('supermarkets_data_prepared.csv', 
          sep=",", 
          encoding='utf-8',
          index=False)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [38]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
NT
Windows | 10
Datetime: 2022-10-03 12:04:18
Python Version: 3.9.7
-----------------------------------


### Additional filters on supermarkets

In [39]:
df_filtered = df.loc[(df['brand'] == 'Coop') & (df['addr:city'] == 'Zürich')]
print(df_filtered)

      type          id        lat       lon brand         shop addr:city  \
3     node    39947904  47.376732  8.542161  Coop  supermarket    Zürich   
9     node    79977755  47.340070  8.530546  Coop  supermarket    Zürich   
59    node   265776668  47.376417  8.559594  Coop  supermarket    Zürich   
75    node   268603429  47.367360  8.546174  Coop  supermarket    Zürich   
81    node   270692983  47.357940  8.554646  Coop  supermarket    Zürich   
84    node   271028686  47.366770  8.548146  Coop  supermarket    Zürich   
96    node   276363821  47.418888  8.505699  Coop  supermarket    Zürich   
122   node   283103824  47.393648  8.529543  Coop  supermarket    Zürich   
123   node   283126967  47.392524  8.524519  Coop  supermarket    Zürich   
226   node   321361788  47.395725  8.540471  Coop  supermarket    Zürich   
447   node   470217716  47.384789  8.527125  Coop  supermarket    Zürich   
625   node   651273252  47.373288  8.530562  Coop  supermarket    Zürich   
632   node  

In [40]:
df_filtered = df.loc[(df['brand'] == 'Migros') & (df['addr:city'] == 'Zürich')]
print(df_filtered)

      type          id        lat       lon   brand         shop addr:city  \
4     node    48932835  47.375020  8.522895  Migros  supermarket    Zürich   
11    node    83330862  47.344749  8.529981  Migros  supermarket    Zürich   
16    node   119249170  47.375255  8.536107  Migros  supermarket    Zürich   
50    node   262400822  47.364072  8.530945  Migros  supermarket    Zürich   
71    node   267346993  47.385598  8.531471  Migros  supermarket    Zürich   
82    node   270958272  47.358367  8.554074  Migros  supermarket    Zürich   
83    node   271028298  47.365678  8.548041  Migros  supermarket    Zürich   
85    node   271029206  47.364596  8.553846  Migros  supermarket    Zürich   
89    node   273942728  47.357610  8.571369  Migros  supermarket    Zürich   
193   node   310133197  47.419522  8.548286  Migros  supermarket    Zürich   
209   node   312838980  47.379200  8.508799  Migros  supermarket    Zürich   
225   node   321361643  47.392553  8.538428  Migros  supermarket

In [57]:
# Filter supermarkets with available brand, city, house number and postcode
#Filter and count all Coop supermarkets in the cities of Zürich, Basel & Bern.
df_filtered = df.loc[(df['brand'] == 'Coop') & (df['addr:city'] == 'Zürich' ) | (df['addr:city'] == 'Bern' ) | (df['addr:city'] == 'Basel' ) & (df['addr:housenumber'] == '25') & (df['addr:housenumber'] == '25') & (df['addr:postcode'] == '8730') ]

print(df_filtered)

      type          id        lat       lon               brand         shop  \
3     node    39947904  47.376732  8.542161                Coop  supermarket   
9     node    79977755  47.340070  8.530546                Coop  supermarket   
59    node   265776668  47.376417  8.559594                Coop  supermarket   
75    node   268603429  47.367360  8.546174                Coop  supermarket   
81    node   270692983  47.357940  8.554646                Coop  supermarket   
...    ...         ...        ...       ...                 ...          ...   
2433  node  5243199320  46.945196  7.390277  VOI Migros Partner  supermarket   
2560  node  5915584594  46.934169  7.387288                 NaN  supermarket   
2575  node  6011935004  47.412872  8.547012                Coop  supermarket   
2684  node  6541934447  46.958155  7.453782              Migros  supermarket   
3084  node  8959998269  46.946807  7.395060              Migros  supermarket   

     addr:city addr:housenumber addr:po