In [37]:
import pandas as pd

# Load the dataset
df = pd.read_csv("zurich_canton_supermarkets.csv")

# Display the first few rows
df.head()

Unnamed: 0,name,address,lat,lng,type
0,Migros-Supermarkt - Affoltern a. A.,"Oberdorfstrasse 4, Affoltern am Albis",47.278438,8.453841,Migros
1,Migros-Supermarkt - Muri AG,"Kirchenfeldstrasse 8, Muri",47.272012,8.341844,Migros
2,Migros-Supermarkt - Zug - Metalli,"Baarerstrasse 22, Zug",47.173133,8.518225,Migros
3,Migros-Supermarkt - Baar,"Bahnhofstrasse 5, Baar",47.195556,8.525,Migros
4,Migros-Supermarkt - Zug Herti,"Hertizentrum 10, Zug",47.178032,8.505905,Migros


In [38]:
# Check for missing values in each column
df.isnull().sum()

name       0
address    0
lat        0
lng        0
type       0
dtype: int64

In [39]:
# Basic information about the dataframe
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1772 entries, 0 to 1771
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     1772 non-null   object 
 1   address  1772 non-null   object 
 2   lat      1772 non-null   float64
 3   lng      1772 non-null   float64
 4   type     1772 non-null   object 
dtypes: float64(2), object(3)
memory usage: 69.3+ KB


In [40]:
# Drop the 'type' column
df = df.drop(columns=['type'])

# Verify the column is removed
df.head()

Unnamed: 0,name,address,lat,lng
0,Migros-Supermarkt - Affoltern a. A.,"Oberdorfstrasse 4, Affoltern am Albis",47.278438,8.453841
1,Migros-Supermarkt - Muri AG,"Kirchenfeldstrasse 8, Muri",47.272012,8.341844
2,Migros-Supermarkt - Zug - Metalli,"Baarerstrasse 22, Zug",47.173133,8.518225
3,Migros-Supermarkt - Baar,"Bahnhofstrasse 5, Baar",47.195556,8.525
4,Migros-Supermarkt - Zug Herti,"Hertizentrum 10, Zug",47.178032,8.505905


In [41]:
# Define the list of allowed supermarket names
valid_names = ['Migros', 'Coop', 'Lidl', 'Denner', 'Aldi']

# Check which rows in the 'name' column contain any of the valid names
mask = df['name'].str.contains('|'.join(valid_names), case=False, na=False)

# Filter out invalid rows
invalid_names = df[~mask]

# Show entries with names that do NOT match the allowed list
invalid_names

Unnamed: 0,name,address,lat,lng
15,Seetal-Center Hochdorf,"Hauptstrasse 5, Hochdorf",47.165833,8.291667
52,Volg Rifferswil,"Dorfpl. 1, Rifferswil",47.242633,8.496058
69,Seetal-Center Hochdorf,"Hauptstrasse 5, Hochdorf",47.165833,8.291667
77,Hochdorf EKZ,"Hauptstrasse 5, Hochdorf",47.166113,8.291484
79,Muris Market,"Luzernerstrasse 32, Muri",47.270712,8.343016
...,...,...,...,...
1763,LANDI Mittelthurgau,"Tägerwilerstrasse 6, Kreuzlingen",47.655638,9.163059
1765,EDEKA Überlingen,"Lippertsreuter Str. 1, Überlingen",47.769606,9.170363
1766,NORMA,"Lippertsreuter Str. 35, Überlingen",47.771942,9.174552
1767,Kaufland,"Nußdorfer Str. 101, Überlingen",47.754009,9.190746


In [42]:
# Keep only rows where 'name' contains one of the valid keywords
df = df[df['name'].str.contains('|'.join(valid_names), case=False, na=False)]

# Reset index after filtering (optional)
df = df.reset_index(drop=True)

# Preview the cleaned DataFrame
df.head()

Unnamed: 0,name,address,lat,lng
0,Migros-Supermarkt - Affoltern a. A.,"Oberdorfstrasse 4, Affoltern am Albis",47.278438,8.453841
1,Migros-Supermarkt - Muri AG,"Kirchenfeldstrasse 8, Muri",47.272012,8.341844
2,Migros-Supermarkt - Zug - Metalli,"Baarerstrasse 22, Zug",47.173133,8.518225
3,Migros-Supermarkt - Baar,"Bahnhofstrasse 5, Baar",47.195556,8.525
4,Migros-Supermarkt - Zug Herti,"Hertizentrum 10, Zug",47.178032,8.505905


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1535 entries, 0 to 1534
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     1535 non-null   object 
 1   address  1535 non-null   object 
 2   lat      1535 non-null   float64
 3   lng      1535 non-null   float64
dtypes: float64(2), object(2)
memory usage: 48.1+ KB


In [44]:
# Split 'address' into street and city using the comma as separator
df[['street', 'city']] = df['address'].str.split(',', n=1, expand=True)

# Remove leading/trailing spaces
df['street'] = df['street'].str.strip()
df['city'] = df['city'].str.strip()

# Drop the original 'address' column
df = df.drop(columns=['address'])

# Preview the result
df.head()

Unnamed: 0,name,lat,lng,street,city
0,Migros-Supermarkt - Affoltern a. A.,47.278438,8.453841,Oberdorfstrasse 4,Affoltern am Albis
1,Migros-Supermarkt - Muri AG,47.272012,8.341844,Kirchenfeldstrasse 8,Muri
2,Migros-Supermarkt - Zug - Metalli,47.173133,8.518225,Baarerstrasse 22,Zug
3,Migros-Supermarkt - Baar,47.195556,8.525,Bahnhofstrasse 5,Baar
4,Migros-Supermarkt - Zug Herti,47.178032,8.505905,Hertizentrum 10,Zug


In [45]:
# Define the list of valid supermarket types
valid_names = ['Migros', 'Coop', 'Lidl', 'Denner', 'Aldi']

# Function to extract type from name
def extract_type(name):
    for store in valid_names:
        if store.lower() in name.lower():
            return store
    return 'Other'

# Apply the function to create the 'type' column
df['type'] = df['name'].apply(extract_type)

# Preview the result
df.head()


Unnamed: 0,name,lat,lng,street,city,type
0,Migros-Supermarkt - Affoltern a. A.,47.278438,8.453841,Oberdorfstrasse 4,Affoltern am Albis,Migros
1,Migros-Supermarkt - Muri AG,47.272012,8.341844,Kirchenfeldstrasse 8,Muri,Migros
2,Migros-Supermarkt - Zug - Metalli,47.173133,8.518225,Baarerstrasse 22,Zug,Migros
3,Migros-Supermarkt - Baar,47.195556,8.525,Bahnhofstrasse 5,Baar,Migros
4,Migros-Supermarkt - Zug Herti,47.178032,8.505905,Hertizentrum 10,Zug,Migros


In [46]:
# Save the DataFrame to a new CSV file
df.to_csv("cleaned_supermarkets_with_type.csv", index=False)

In [47]:
# Cleaned and corrected list of official cities in Canton Zurich
valid_cities = [
    "Aeugst am Albis", "Affoltern am Albis", "Bonstetten", "Hausen am Albis", "Hedingen",
    "Kappel am Albis", "Knonau", "Maschwanden", "Mettmenstetten", "Obfelden", "Ottenbach",
    "Rifferswil", "Stallikon", "Wettswil am Albis", "Andelfingen", "Benken", "Berg am Irchel",
    "Buch am Irchel", "Dachsen", "Dorf", "Feuerthalen", "Flaach", "Flurlingen", "Henggart",
    "Kleinandelfingen", "Laufen-Uhwiesen", "Marthalen", "Ossingen", "Rheinau", "Stammheim",
    "Thalheim an der Thur", "Trüllikon", "Truttikon", "Volken", "Bachenbülach", "Bassersdorf",
    "Bülach", "Dietlikon", "Eglisau", "Embrach", "Freienstein-Teufen", "Glattfelden", "Hochfelden",
    "Höri", "Hüntwangen", "Kloten", "Lufingen", "Nürensdorf", "Oberembrach", "Opfikon", "Rafz",
    "Rorbas", "Wallisellen", "Wasterkingen", "Wil", "Winkel", "Bachs", "Boppelsen", "Buchs",
    "Dällikon", "Dänikon", "Dielsdorf", "Hüttikon", "Neerach", "Niederglatt", "Niederhasli",
    "Niederweningen", "Oberglatt", "Oberweningen", "Otelfingen", "Regensberg", "Regensdorf",
    "Rümlang", "Schleinikon", "Schöfflisdorf", "Stadel", "Steinmaur", "Weiach", "Aesch",
    "Birmensdorf", "Dietikon", "Geroldswil", "Oberengstringen", "Oetwil an der Limmat", "Schlieren",
    "Uitikon", "Unterengstringen", "Urdorf", "Weiningen", "Bäretswil", "Bubikon", "Dürnten",
    "Fischenthal", "Gossau", "Grüningen", "Hinwil", "Rüti", "Seegräben", "Wald", "Wetzikon",
    "Adliswil", "Horgen", "Kilchberg", "Langnau am Albis", "Oberrieden", "Richterswil",
    "Rüschlikon", "Thalwil", "Wädenswil", "Erlenbach", "Herrliberg", "Hombrechtikon", "Küsnacht",
    "Männedorf", "Meilen", "Oetwil am See", "Stäfa", "Uetikon am See", "Zollikon", "Zumikon",
    "Bauma", "Fehraltorf", "Hittnau", "Illnau-Effretikon", "Lindau", "Pfäffikon", "Russikon",
    "Weisslingen", "Wila", "Wildberg", "Dübendorf", "Egg", "Fällanden", "Greifensee", "Maur",
    "Mönchaltorf", "Schwerzenbach", "Uster", "Volketswil", "Wangen-Brüttisellen", "Altikon",
    "Brütten", "Dägerlen", "Dättlikon", "Dinhard", "Elgg", "Ellikon an der Thur", "Elsau",
    "Hagenbuch", "Hettlingen", "Neftenbach", "Pfungen", "Rickenbach", "Schlatt", "Seuzach",
    "Turbenthal", "Wiesendangen", "Winterthur", "Zell", "Zürich"
]

# Standardize city names in your DataFrame
df['city'] = df['city'].str.strip()

# Filter rows where city is in the official list
df = df[df['city'].isin(valid_cities)].reset_index(drop=True)

# Save the filtered DataFrame
df.to_csv("final_supermarkets_zurich.csv", index=False)


In [48]:
# Dictionary: city → district
city_to_district = {
    "Altikon": "Winterthur", "Adliswil": "Horgen", "Aesch": "Dietikon", "Aeugst am Albis": "Affoltern",
    "Affoltern am Albis": "Affoltern", "Andelfingen": "Andelfingen", "Bachenbülach": "Bülach",
    "Bachs": "Dielsdorf", "Bäretswil": "Hinwil", "Bassersdorf": "Bülach", "Bauma": "Pfäffikon",
    "Benken": "Andelfingen", "Berg am Irchel": "Andelfingen", "Birmensdorf": "Dietikon",
    "Bonstetten": "Affoltern", "Boppelsen": "Dielsdorf", "Brütten": "Winterthur", "Bubikon": "Hinwil",
    "Buch am Irchel": "Andelfingen", "Buchs": "Dielsdorf", "Bülach": "Bülach", "Dachsen": "Andelfingen",
    "Dägerlen": "Winterthur", "Dällikon": "Dielsdorf", "Dänikon": "Dielsdorf", "Dättlikon": "Winterthur",
    "Dielsdorf": "Dielsdorf", "Dietikon": "Dietikon", "Dietlikon": "Bülach", "Dinhard": "Winterthur",
    "Dorf": "Andelfingen", "Dübendorf": "Uster", "Dürnten": "Hinwil", "Egg": "Uster", "Eglisau": "Bülach",
    "Elgg": "Winterthur", "Ellikon an der Thur": "Winterthur", "Elsau": "Winterthur", "Embrach": "Bülach",
    "Erlenbach": "Meilen", "Fällanden": "Uster", "Fehraltorf": "Pfäffikon", "Feuerthalen": "Andelfingen",
    "Fischenthal": "Hinwil", "Flaach": "Andelfingen", "Flurlingen": "Andelfingen",
    "Freienstein-Teufen": "Bülach", "Geroldswil": "Dietikon", "Glattfelden": "Bülach",
    "Gossau": "Hinwil", "Greifensee": "Uster", "Grüningen": "Hinwil", "Hagenbuch": "Winterthur",
    "Hausen am Albis": "Affoltern", "Hedingen": "Affoltern", "Henggart": "Andelfingen",
    "Herrliberg": "Meilen", "Hettlingen": "Winterthur", "Hinwil": "Hinwil", "Hittnau": "Pfäffikon",
    "Hochfelden": "Bülach", "Hombrechtikon": "Meilen", "Horgen": "Horgen", "Höri": "Bülach",
    "Hüntwangen": "Bülach", "Hüttikon": "Dielsdorf", "Illnau-Effretikon": "Pfäffikon",
    "Kappel am Albis": "Affoltern", "Kilchberg": "Horgen", "Kleinandelfingen": "Andelfingen",
    "Kloten": "Bülach", "Knonau": "Affoltern", "Küsnacht": "Meilen", "Langnau am Albis": "Horgen",
    "Laufen-Uhwiesen": "Andelfingen", "Lindau": "Pfäffikon", "Lufingen": "Bülach", "Männedorf": "Meilen",
    "Marthalen": "Andelfingen", "Maschwanden": "Affoltern", "Maur": "Uster", "Meilen": "Meilen",
    "Mettmenstetten": "Affoltern", "Mönchaltorf": "Uster", "Neerach": "Dielsdorf",
    "Neftenbach": "Winterthur", "Niederglatt": "Dielsdorf", "Niederhasli": "Dielsdorf",
    "Niederweningen": "Dielsdorf", "Nürensdorf": "Bülach", "Oberembrach": "Bülach",
    "Oberengstringen": "Dietikon", "Oberglatt": "Dielsdorf", "Oberrieden": "Horgen",
    "Oberweningen": "Dielsdorf", "Obfelden": "Affoltern", "Oetwil am See": "Meilen",
    "Oetwil an der Limmat": "Dietikon", "Opfikon": "Bülach", "Ossingen": "Andelfingen",
    "Otelfingen": "Dielsdorf", "Ottenbach": "Affoltern", "Pfäffikon": "Pfäffikon",
    "Pfungen": "Winterthur", "Rafz": "Bülach", "Regensberg": "Dielsdorf", "Regensdorf": "Dielsdorf",
    "Rheinau": "Andelfingen", "Richterswil": "Horgen", "Rickenbach": "Winterthur",
    "Rifferswil": "Affoltern", "Rorbas": "Bülach", "Rümlang": "Dielsdorf", "Rüschlikon": "Horgen",
    "Russikon": "Pfäffikon", "Rüti": "Hinwil", "Schlatt": "Winterthur", "Schleinikon": "Dielsdorf",
    "Schlieren": "Dietikon", "Schöfflisdorf": "Dielsdorf", "Schwerzenbach": "Uster",
    "Seegräben": "Hinwil", "Seuzach": "Winterthur", "Stadel": "Dielsdorf", "Stäfa": "Meilen",
    "Stallikon": "Affoltern", "Stammheim": "Andelfingen", "Steinmaur": "Dielsdorf",
    "Thalheim an der Thur": "Andelfingen", "Thalwil": "Horgen", "Trüllikon": "Andelfingen",
    "Truttikon": "Andelfingen", "Turbenthal": "Winterthur", "Uetikon am See": "Meilen",
    "Uitikon": "Dietikon", "Unterengstringen": "Dietikon", "Urdorf": "Dietikon", "Uster": "Uster",
    "Volken": "Andelfingen", "Volketswil": "Uster", "Wädenswil": "Horgen", "Wald": "Hinwil",
    "Wallisellen": "Bülach", "Wangen-Brüttisellen": "Uster", "Wasterkingen": "Bülach",
    "Weiach": "Dielsdorf", "Weiningen": "Dietikon", "Weisslingen": "Pfäffikon",
    "Wettswil am Albis": "Affoltern", "Wetzikon": "Hinwil", "Wiesendangen": "Winterthur",
    "Wil": "Bülach", "Wila": "Pfäffikon", "Wildberg": "Pfäffikon", "Winkel": "Bülach",
    "Winterthur": "Winterthur", "Zell": "Winterthur", "Zollikon": "Meilen", "Zumikon": "Meilen",
    "Zürich": "Zürich"
}


In [49]:
# Ensure city names are standardized
df['city'] = df['city'].str.strip()

# Add the 'district' column by mapping from city
df['district'] = df['city'].map(city_to_district)

# Check for any cities not matched (optional)
unmatched = df[df['district'].isnull()]

# Save updated DataFrame
df.to_csv("supermarkets_with_district.csv", index=False)


In [50]:
# Drop exact duplicates based on name, street, and city
df_unique = df.drop_duplicates(subset=["name", "street", "city"]).reset_index(drop=True)


In [51]:
# Save updated DataFrame
df.to_csv("supermarkets_with_district.csv", index=False)

In [52]:
import pandas as pd

df = pd.read_csv("supermarkets_with_district.csv")

# Check for exact duplicates
duplicates = df[df.duplicated()]
print(f"Exact duplicate rows: {len(duplicates)}")
duplicates


Exact duplicate rows: 308


Unnamed: 0,name,lat,lng,street,city,type,district
3,Migros-Supermarkt - Affoltern a. A.,47.278438,8.453841,Oberdorfstrasse 4,Affoltern am Albis,Migros,Affoltern
7,Migros-Supermarkt - Affoltern a. A.,47.278438,8.453841,Oberdorfstrasse 4,Affoltern am Albis,Migros,Affoltern
11,Denner Partner,47.229772,8.527230,Albisstrasse 8,Kappel am Albis,Denner,Affoltern
12,ALDI SUISSE,47.272743,8.447018,Industriestrasse 5,Affoltern am Albis,Aldi,Affoltern
13,Lidl Schweiz,47.278007,8.453030,Centralweg 3,Affoltern am Albis,Lidl,Affoltern
...,...,...,...,...,...,...,...
833,Coop Pronto Shop mit Tankstelle Embrach,47.517294,8.594219,Industriestrasse 2,Embrach,Coop,Bülach
840,Coop Supermarkt Feuerthalen Rhymarkt,47.690987,8.646651,Schützenstrasse 30,Feuerthalen,Coop,Andelfingen
841,Migros-Supermarkt - Andelfingen,47.601964,8.688717,Weinlandstrasse 4,Kleinandelfingen,Migros,Andelfingen
843,Migros-Supermarkt - Seuzach,47.535579,8.730276,Winterthurerstrasse 5,Seuzach,Migros,Winterthur


In [53]:
# Drop exact duplicates and reset index
df_cleaned = df.drop_duplicates().reset_index(drop=True)

# Save cleaned dataset
df_cleaned.to_csv("supermarkets_with_district.csv", index=False)
