In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Load the raw geolocation data.
# The path is a raw string: it contains a backslash, and '\o' is an invalid
# escape sequence that Python warns about in a normal string literal.
df = pd.read_csv(r'D:\olist_geolocation_dataset.csv')

# Compare distinct, non-null city names only (far fewer than the row count)
unique_cities = df['geolocation_city'].dropna().unique()

# Lowercase every name once instead of on every comparison in the inner loop
lowered_cities = [city.lower() for city in unique_cities]

# Collected (City1, City2, score) tuples for possible mistyped pairs
similar_pairs = []

# Compare each city with every later city so each unordered pair is scored once
for i, city in enumerate(unique_cities):
    for j in range(i + 1, len(unique_cities)):
        score = fuzz.ratio(lowered_cities[i], lowered_cities[j])
        if 85 < score < 100:  # not identical but very close
            similar_pairs.append((city, unique_cities[j], score))

# Convert to DataFrame
similar_df = pd.DataFrame(similar_pairs, columns=['City1', 'City2', 'Similarity'])

# View suspicious pairs, most similar first
print(similar_df.sort_values(by='Similarity', ascending=False))


                                City1                             City2  \
3399        presidente castelo branco        presidente castello branco   
627            sao joao do pau d'alho             sao joao do pau dalho   
3135            figueirópolis d'oeste              figueirópolis doeste   
3032          nova brasilandia doeste          nova brasilandia d oeste   
3137  vila bela da santssima trindade  vila bela da santíssima trindade   
...                               ...                               ...   
1783           sao francisco do conde            sao francisco do oeste   
1781           são sebastião do passé              são sebastião do caí   
1780           são sebastião do passe              são sebastião do caí   
1778           sao sebastiao do passe              sao sebastiao do cai   
3664                          ciriaco                           ciríaco   

      Similarity  
3399          98  
627           98  
3135          98  
3032          98  
3137

In [11]:
import pandas as pd
from fuzzywuzzy import fuzz
from sqlalchemy import create_engine

import os
from getpass import getpass
from itertools import combinations

from sqlalchemy.engine import URL
from sqlalchemy.sql import text

# Step 1: Connect to PostgreSQL
# The password is read from the PGPASSWORD environment variable (with an
# interactive prompt as a fallback) so it is never stored in the notebook.
username = 'postgres'
password = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: ')
host = 'localhost'
port = '5432'
database = 'e-commerce project'

# URL.create escapes each component, so the space in the database name and any
# special characters in the password cannot corrupt the connection string.
engine = create_engine(
    URL.create(
        'postgresql',
        username=username,
        password=password,
        host=host,
        port=int(port),
        database=database,
    )
)

# Step 2: Load geolocation data
# The connection is used as a context manager so it is returned to the pool
# as soon as the read finishes.
with engine.connect() as conn:
    df = pd.read_sql("SELECT geo_id, geolocation_city FROM olist_geolocation", conn)
df['geolocation_city_clean'] = df['geolocation_city'].str.lower().str.strip()

# Step 3: Get frequency of each city
city_freq = df['geolocation_city_clean'].value_counts().reset_index()
city_freq.columns = ['city', 'count']

# Step 4: Identify similar cities
similar_map = {}

city_list = city_freq['city'].tolist()
# Plain dict for count lookups; avoids filtering the DataFrame twice per match
city_counts = dict(zip(city_freq['city'], city_freq['count']))

for city1, city2 in combinations(city_list, 2):
    score = fuzz.ratio(city1, city2)
    if 95 < score < 100:
        # Choose the more frequent one as the correct name
        if city_counts[city1] >= city_counts[city2]:
            correct, wrong = city1, city2
        else:
            correct, wrong = city2, city1
        similar_map[wrong] = correct

# Step 5: Replace wrong values with correct ones
df['geolocation_city_corrected'] = df['geolocation_city_clean'].replace(similar_map)

# Step 6: Update PostgreSQL using geo_id
# Only rows whose stored value actually changes are written, and rows with a
# missing city are skipped so a NaN is never sent as the new value.
needs_update = (
    df['geolocation_city_corrected'].notna()
    & (df['geolocation_city_corrected'] != df['geolocation_city'])
)
update_params = [
    {'new_val': new_val, 'geo_id': geo_id}
    for geo_id, new_val in zip(
        df.loc[needs_update, 'geo_id'].tolist(),
        df.loc[needs_update, 'geolocation_city_corrected'].tolist(),
    )
]

if update_params:
    # Passing a list of parameter dicts runs the statement in executemany
    # mode inside one transaction instead of one execute() call per row.
    with engine.begin() as connection:
        connection.execute(
            text("""
                UPDATE olist_geolocation
                SET geolocation_city = :new_val
                WHERE geo_id = :geo_id
            """),
            update_params,
        )

print("✅ Successfully corrected similar city names in geolocation_city.")


Exception during reset or similar
Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python311\Lib\site-packages\sqlalchemy\pool\base.py", line 987, in _finalize_fairy
    fairy._reset(
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python311\Lib\site-packages\sqlalchemy\pool\base.py", line 1433, in _reset
    pool._dialect.do_rollback(self)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python311\Lib\site-packages\sqlalchemy\engine\default.py", line 703, in do_rollback
    dbapi_connection.rollback()
psycopg2.OperationalError: server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.

Exception during reset or similar
Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python311\Lib\site-packages\sqlalchemy\pool\base.py", line 987, in _finalize_fairy
    fairy._reset(
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python31

✅ Successfully corrected similar city names in geolocation_city.


In [12]:
import pandas as pd
from fuzzywuzzy import fuzz
from sqlalchemy import create_engine

import os
from getpass import getpass
from itertools import combinations

from sqlalchemy.engine import URL

# -------------------------------
# 1. CONNECT TO POSTGRESQL
# -------------------------------
# The password is read from the PGPASSWORD environment variable (with an
# interactive prompt as a fallback) so it is never stored in the notebook.
username = 'postgres'
password = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: ')
host = 'localhost'
port = '5432'
database = 'e-commerce project'

# URL.create escapes each component, so the space in the database name and any
# special characters in the password cannot corrupt the connection string.
engine = create_engine(
    URL.create(
        'postgresql',
        username=username,
        password=password,
        host=host,
        port=int(port),
        database=database,
    )
)

# -------------------------------
# 2. LOAD DISTINCT CITY NAMES
# -------------------------------
# Context manager releases the connection once the query has been read.
with engine.connect() as conn:
    df = pd.read_sql("SELECT DISTINCT geolocation_city FROM olist_geolocation", conn)

# -------------------------------
# 3. PREPARE FOR FUZZY MATCHING
# -------------------------------
df['cleaned'] = df['geolocation_city'].str.strip().str.lower()
unique_cities = df['cleaned'].dropna().unique()

# -------------------------------
# 4. FIND SIMILARITY > 90
# -------------------------------
similar_pairs = []

for city1, city2 in combinations(unique_cities, 2):
    score = fuzz.ratio(city1, city2)
    if 90 < score < 100:  # Similar but not same
        similar_pairs.append((city1, city2, score))

# -------------------------------
# 5. SHOW RESULTS
# -------------------------------
similar_df = pd.DataFrame(similar_pairs, columns=['City1', 'City2', 'Similarity'])
similar_df = similar_df.sort_values(by='Similarity', ascending=False)

print(f"🔍 Total possible typo/conflict pairs remaining: {len(similar_df)}")
print(similar_df.head(20))  # View top suspicious ones


🔍 Total possible typo/conflict pairs remaining: 358
           City1        City2  Similarity
179   itambaraca    itamaraca          95
138   catanduvas    catanduva          95
185  petrolandia   perolandia          95
184   fronteiras    fronteira          95
164   montenegro  monte negro          95
162    alexandra   alexandria          95
159   charqueada  charqueadas          95
149    pau darco   pau d arco          95
145  itainopolis   itaiopolis          95
134   cearamirim  ceara mirim          95
38    crisopolis  cristopolis          95
126  materlandia   matelandia          95
125   palmeirais    palmeiras          95
100   gameleiras    gameleira          95
97   jaboticabal   jaboticaba          95
94   rio bracnco   rio branco          95
84    lagoa nova  alagoa nova          95
78     alagoinha   alagoinhas          95
193   mogi guacu    mogiguacu          95
205  paragominas   aragominas          95


In [16]:
import pandas as pd
import re
from sqlalchemy import create_engine

import os
from getpass import getpass

from sqlalchemy.engine import URL

# Database connection info
# The password is read from the PGPASSWORD environment variable (with an
# interactive prompt as a fallback) so it is never stored in the notebook.
db_name = "e-commerce project"
username = "postgres"
password = os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: ")
host = "localhost"
port = "5432"

# Connect to PostgreSQL
# URL.create escapes each component, so the space in the database name and any
# special characters in the password cannot corrupt the connection string.
engine = create_engine(
    URL.create(
        "postgresql+psycopg2",
        username=username,
        password=password,
        host=host,
        port=int(port),
        database=db_name,
    )
)

# Read geo_id and geolocation_city from the table; the context manager
# releases the connection once the query has been read.
with engine.connect() as conn:
    df = pd.read_sql("SELECT geo_id, geolocation_city FROM olist_geolocation", conn)

# Function to detect repeated word sequences
def has_repeated_phrase(city):
    """Return True if a two-word phrase occurs more than once in ``city``.

    The name is lowercased and split on whitespace, then consecutive word
    pairs are compared as whole words. Matching on words (rather than on raw
    substrings) means "santa rosa santa rosalia" is not flagged, and irregular
    spacing between words does not hide a genuine repeat.

    Parameters
    ----------
    city : str or missing value
        City name to inspect. Missing values (None / NaN) are never flagged.

    Returns
    -------
    bool
        True when some pair of adjacent words appears again later in the name
        without overlapping its first occurrence; False otherwise.
    """
    if pd.isnull(city):
        return False
    words = city.lower().split()
    first_seen = {}  # word pair -> index of its first occurrence
    for i in range(len(words) - 1):
        pair = (words[i], words[i + 1])
        start = first_seen.setdefault(pair, i)
        # Require a gap of at least two words so overlapping occurrences
        # (e.g. "a a a") do not count as a repeated phrase.
        if i - start >= 2:
            return True
    return False

# Flag every row whose city name contains a repeated phrase, then keep only those rows
df['has_repeat'] = df['geolocation_city'].map(has_repeated_phrase)
repeat_mask = df['has_repeat']
repeated_df = df.loc[repeat_mask]

# Print the flagged rows (id and city only) without the index column
report_columns = ['geo_id', 'geolocation_city']
print("🔍 Suspicious rows with repeated phrases:\n")
print(repeated_df[report_columns].to_string(index=False))

# Optional: export the flagged rows to CSV for manual review
# repeated_df.to_csv("repeated_cities_check.csv", index=False)


🔍 Suspicious rows with repeated phrases:

 geo_id                     geolocation_city
 998244 rio de janeiro rio de janeiro brasil
