In [5]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Load the avocado dataset
df = pd.read_csv('avocado.csv')
df.info

# Step 1: Get the unique regions
unique_regions = df['region'].unique()
print(f"Unique regions: {len(unique_regions)}")

# Initialize geolocator with Nominatim service
geolocator = Nominatim(user_agent="avocado_region_classifier")

# Use RateLimiter to ensure we don't exceed the query limit (adjust delay if needed)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Step 2: Function to classify the region based on geopy results
def classify_region_geopy(region):
    try:
        location = geocode(region)
        if location:
            address = location.raw.get('address', {})
            if 'city' in address:
                return 'City'
            elif 'state' in address:
                return 'State'
            elif 'region' in address:
                return 'Region'
            else:
                return 'Unknown'
        else:
            return 'Unknown'
    except:
        return 'Error'

# Step 3: Apply geocoding function to unique regions
unique_region_types = {region: classify_region_geopy(region) for region in unique_regions}

# Step 4: Map the geocoded results back to the original dataframe
df['region_type'] = df['region'].map(unique_region_types)

# Step 5: Show classification counts and save the updated dataframe
print(df['region_type'].value_counts())
df.to_csv('avocado_with_region_types.csv', index=False)


Unique regions: 54
region_type
Unknown    18249
Name: count, dtype: int64


In [7]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Load the avocado dataset
df = pd.read_csv('avocado.csv')

# Step 1: Get the unique regions
unique_regions = df['region'].unique()
print(f"Unique regions: {len(unique_regions)}")

# Initialize geolocator with Nominatim service
geolocator = Nominatim(user_agent="avocado_region_classifier")

# Use RateLimiter to ensure we don't exceed the query limit (adjust delay if needed)
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Step 2: Function to classify the region based on geopy results
def classify_region_geopy(region):
    try:
        location = geocode(region)
        if location:
            address = location.raw.get('address', {})
            print(f"Geocoding {region}: {address}")  # Print out the address for debugging
            # Check if we have information about city, state, or region
            if 'city' in address:
                return 'City'
            elif 'state' in address:
                return 'State'
            elif 'country' in address or 'region' in address:
                return 'Region'
            else:
                return 'Unknown'
        else:
            return 'Unknown'
    except Exception as e:
        print(f"Error geocoding {region}: {e}")
        return 'Error'

# Step 3: Apply geocoding function to unique regions
unique_region_types = {region: classify_region_geopy(region) for region in unique_regions}

# Step 4: Map the geocoded results back to the original dataframe
df['region_type'] = df['region'].map(unique_region_types)

# Step 5: Show classification counts and save the updated dataframe
print(df['region_type'].value_counts())
df.to_csv('avocado_with_region_types.csv', index=False)


Unique regions: 54
Geocoding Albany: {}
Geocoding Atlanta: {}
Geocoding Boise: {}
Geocoding Boston: {}
Geocoding California: {}
Geocoding Charlotte: {}
Geocoding Chicago: {}
Geocoding Columbus: {}
Geocoding Denver: {}
Geocoding Detroit: {}
Geocoding GrandRapids: {}
Geocoding Houston: {}
Geocoding Indianapolis: {}
Geocoding Jacksonville: {}
Geocoding LasVegas: {}
Geocoding LosAngeles: {}
Geocoding Louisville: {}
Geocoding Midsouth: {}
Geocoding Nashville: {}
Geocoding NewYork: {}
Geocoding Northeast: {}
Geocoding Orlando: {}
Geocoding Philadelphia: {}
Geocoding Pittsburgh: {}
Geocoding Plains: {}
Geocoding Portland: {}
Geocoding Roanoke: {}
Geocoding Sacramento: {}
Geocoding SanDiego: {}
Geocoding SanFrancisco: {}
Geocoding Seattle: {}
Geocoding SouthCentral: {}
Geocoding Southeast: {}
Geocoding Spokane: {}
Geocoding StLouis: {}
Geocoding Syracuse: {}
Geocoding Tampa: {}
Geocoding West: {}
region_type
Unknown    18249
Name: count, dtype: int64


In [9]:
import pandas as pd

# Load the avocado dataset
df = pd.read_csv('avocado.csv')

# Step 1: Define the unique regions and manually classify them
region_classification = {
    'Albany': 'City',
    'Atlanta': 'City',
    'BaltimoreWashington': 'Region',
    'Boise': 'City',
    'Boston': 'City',
    'BuffaloRochester': 'Region',
    'California': 'State',
    'Charlotte': 'City',
    'Chicago': 'City',
    'CincinnatiDayton': 'Region',
    'Columbus': 'City',
    'DallasFtWorth': 'Region',
    'Denver': 'City',
    'Detroit': 'City',
    'GrandRapids': 'City',
    'GreatLakes': 'Region',
    'HarrisburgScranton': 'Region',
    'HartfordSpringfield': 'Region',
    'Houston': 'City',
    'Indianapolis': 'City',
    'Jacksonville': 'City',
    'LasVegas': 'City',
    'LosAngeles': 'City',
    'Louisville': 'City',
    'MiamiFtLauderdale': 'Region',
    'Midsouth': 'Region',
    'Nashville': 'City',
    'NewOrleansMobile': 'Region',
    'NewYork': 'City',
    'Northeast': 'Region',
    'NorthernNewEngland': 'Region',
    'Orlando': 'City',
    'Philadelphia': 'City',
    'PhoenixTucson': 'Region',
    'Pittsburgh': 'City',
    'Plains': 'Region',
    'Portland': 'City',
    'RaleighGreensboro': 'Region',
    'RichmondNorfolk': 'Region',
    'Roanoke': 'City',
    'Sacramento': 'City',
    'SanDiego': 'City',
    'SanFrancisco': 'City',
    'Seattle': 'City',
    'SouthCarolina': 'State',
    'SouthCentral': 'Region',
    'Southeast': 'Region',
    'Spokane': 'City',
    'StLouis': 'City',
    'Syracuse': 'City',
    'Tampa': 'City',
    'West': 'Region',
    'WestTexNewMexico': 'Region'
}

# Step 2: Map the classification back to the DataFrame
df['region_type'] = df['region'].map(region_classification)

# Step 3: Show classification counts and save the updated dataframe
print(df['region_type'].value_counts())
df.to_csv('avocado_with_region_types.csv', index=False)


region_type
City      10478
Region     6757
State       676
Name: count, dtype: int64


In [12]:
import pandas as pd

# Load the dataset
df = pd.read_csv('avocado_with_region_types.csv')

# Step 1: Filter only regions and cities
df_filtered = df[df['region_type'].isin(['City', 'Region'])]

# Step 2: Group by region and city to check for duplicates
# We'll count how often each city appears in regions and flag duplicates
city_region_counts = df_filtered.groupby(['region_type', 'region']).size().reset_index(name='count')

# Step 3: Filter for repeated cities
repeated_cities_in_region = city_region_counts[city_region_counts['count'] > 1]

# Display the repeated cities
print(repeated_cities_in_region)


   region_type               region  count
0         City               Albany    338
1         City              Atlanta    338
2         City                Boise    338
3         City               Boston    338
4         City            Charlotte    338
5         City              Chicago    338
6         City             Columbus    338
7         City               Denver    338
8         City              Detroit    338
9         City          GrandRapids    338
10        City              Houston    338
11        City         Indianapolis    338
12        City         Jacksonville    338
13        City             LasVegas    338
14        City           LosAngeles    338
15        City           Louisville    338
16        City            Nashville    338
17        City              NewYork    338
18        City              Orlando    338
19        City         Philadelphia    338
20        City           Pittsburgh    338
21        City             Portland    338
22        C