In [42]:
import pandas as pd
df1 = pd.read_csv(r'C:\courses\DataManagement and Vis\project\Municipal_Court_Caseload_Information_FY_2023.csv')
print(df1.head())

  Offense Case Type Offense Date Offense Time Violation Charge Code  \
0                NT    10/1/2023      0:26:00                 60110   
1                TR    10/1/2023     13:40:00                   710   
2                TR    10/1/2023     16:39:00                 32210   
3                TR    10/1/2023     17:04:00                   610   
4                OR    10/1/2023     10:26:00                 64111   

                Offense Charge Description       Offense Street Name  \
0                      Public Intoxication            4905 TERI ROAD   
1                            Ran Red Light       4010 SOUTHWEST PKWY   
2  Crossing Property To Turn Right Or Left         2414 S LAMAR BLVD   
3                            Ran Stop Sign          BLUE CREST DRIVE   
4                        Animal - At Large  6600 BLOCK ASHLAND DRIVE   

  Offense Cross Street  School Zone  Construction Zone Case Closed   Race  \
0                  NaN        False              False        T

Counting Total Rows and Unique Street names in Main dataset Municipal_Court

In [45]:
df2 = pd.read_csv(r'C:\courses\DataManagement and Vis\project\austin_street_to_zip_mapping.csv')
print(df2.head())

             STREET  POSTCODE    CITY
0  DOYLE OVERTON RD     78719  Austin
1           MAHA RD     78719  Austin
2       POCMONT TRL     78719  Austin
3         EVELYN RD     78747  Austin
4    S SH  45 E  WB     78747  Austin


##Cleaning Street columns in main dataset

In [125]:
import re

replacements =  {
    "ROAD": "RD", "RD": "RD",
    "STREET": "ST", "ST": "ST",
    "AVENUE": "AVE", "AVE": "AVE",
    "DRIVE": "DR", "DR": "DR",
    "COURT": "CT", "CT": "CT",
    "LANE": "LN", "LN": "LN",
    "CIRCLE": "CIR", "CIR": "CIR",
    "TRAIL": "TRL", "TRL": "TRL",
    "PARKWAY": "PKWY", "PKWY": "PKWY",
    "HIGHWAY": "HWY", "HWY": "HWY",
    "PLACE": "PL", "PL": "PL",
    "PATH": "PATH",
    "WAY": "WAY",
    "LOOP": "LOOP",
    "COVE": "CV", "CV": "CV",

    # directions (important!)
    "NORTH": "N", "N": "N",
    "SOUTH": "S", "S": "S",
    "EAST": "E", "E": "E",
    "WEST": "W", "W": "W",
    "NORTHEAST": "NE", "NE": "NE",
    "NORTHWEST": "NW", "NW": "NW",
    "SOUTHEAST": "SE", "SE": "SE",
    "SOUTHWEST": "SW", "SW": "SW",

    # lane directions
    "NB": "NB", "SB": "SB", "EB": "EB", "WB": "WB"
}

direction_garbage = {
   "BLOCK", "NB", "SB", "EB", "WB", "NORTHBOUND", "SOUTHBOUND", "EASTBOUND", "WESTBOUND"
}
ordinals = {
    "FIRST": "1ST", "SECOND": "2ND", "THIRD": "3RD", "FOURTH": "4TH",
    "FIFTH": "5TH", "SIXTH": "6TH", "SEVENTH": "7TH", "EIGHTH": "8TH",
    "NINTH": "9TH", "TENTH": "10TH"
}

def clean_and_normalize(s):
    if pd.isna(s) or s == 'nan':
        return s

    s = s.upper()

    # deleting numbers in first position with a space 
    s = re.sub(r"^\d+\s+", "", s)

    # deleting punctuations
    s = re.sub(r"[^\w\s]", "", s)

    tokens = s.split()
    clean_tokens = []

    for i, t in enumerate(tokens):
        # deleting noises (NB, SB, etc)
        if t in direction_garbage:
            continue
        #FIRST -> 1ST
        t = ordinals.get(t, t)

        # correctint the abbrevations (ROAD -> RD)
        t = replacements.get(t, t)

        clean_tokens.append(t)

    return " ".join(clean_tokens).strip()

##Clean and normalize main dataset's street name Municipal_Court

In [126]:
df1["city_clean"] = df1["Offense Street Name"].apply(clean_and_normalize)

count_before1 = len(df1)
#removing duplicated streets
df1_unique = df1.drop_duplicates(subset=["city_clean"])
count_after1 = len(df1_unique)

print(f"Total number of rows in first dataset Municipal_Court: {count_before1}")
print(f"Number of rows after deleting duplicated names: {count_after1}")

Total number of rows in first dataset Municipal_Court: 225624
Number of rows after deleting duplicated names: 5449


In [127]:
df1_unique.to_csv("df1_unique_streets.csv", index=False)

In [128]:
print(df1["city_clean"].head(20))

0                TERI RD
1                SW PKWY
2           S LAMAR BLVD
3          BLUE CREST DR
4             ASHLAND DR
5            W PARMER LN
6          LEVANDER LOOP
7          LEVANDER LOOP
8                LAZY ON
9       WALSH TARLTON LN
10          S MOPAC EXPY
11    S IH 35 SERVICE RD
12    S IH 35 SERVICE RD
13    S IH 35 SERVICE RD
14      WALSH TARLTON LN
15      WALSH TARLTON LN
16            RIO GRANDE
17                W 22ND
18                W 22ND
19           SAN GABRIEL
Name: city_clean, dtype: object


##Clean and normalize second dataset's street name austin_street

In [129]:
# Replace NaN with an empty string, then clean
df2["city_clean"] = df2["STREET"].fillna("").astype(str).str.upper().str.strip()

# Your normalization function now receives "" instead of a float
df2["city_clean"] = df2["city_clean"].apply(clean_and_normalize)

In [130]:
count_before2 = len(df2)

#removing duplicated streets
df2_unique = df2.drop_duplicates(subset=["city_clean", "POSTCODE"]) # there are some street name with diffretnt zipcode, so they are not equal

count_after2 = len(df2_unique)

print(f"Total number of rows in second dataset austin_street: {count_before2}")
print(f"Number of rows after deleting duplicated names: {count_after2}")

Total number of rows in second dataset austin_street: 11840
Number of rows after deleting duplicated names: 11773


In [131]:
df2_unique.to_csv("df2_unique_streets.csv", index=False)

In [132]:
print(df2["city_clean"].head())

0    DOYLE OVERTON RD
1             MAHA RD
2         POCMONT TRL
3           EVELYN RD
4           S SH 45 E
Name: city_clean, dtype: object


##to find how many unique cities matches in 2 different datasets

In [133]:
#Integerating 2 datasets
common = set(df1_unique["city_clean"]) & set(df2_unique["city_clean"])
# Remove 'nan' or empty strings if they aren't real cities -> Data Quality
common.discard('NAN') 
common.discard('')
common.discard('nan')
print("Matches:", len(common))

Matches: 1986


In [134]:
common_list = list(common)

print(common_list[:10])

['POLAR DR', 'W 22ND ST', 'MC CURDY ST', 'ASHEN LN', 'PINE KNOLL DR', 'LAWRENCE ST', 'W WELLS BRANCH PKWY', 'CHIPPEWAY LN', 'MEADOW CREEK DR', 'DOYAL DR']


In [107]:
search_name = "S LAMAR BLVD" 

if search_name in common:
    print(f"✅ {search_name} exists in both files!")
else:
    print(f"❌ {search_name} was not found in the common list.")

✅ S LAMAR BLVD exists in both files!


In [119]:
missing_streets = set(df1["city_clean"]) - set(df2["city_clean"])
print(list(missing_streets)[:10])

['E 51ST CLARKSON AVE', 'BURTON RD', 'RED BUD TRL', 'E PARMER PN', 'FAIR FIELD DR', 'MILTON E', 'S 1 ST ST', 'S SH 130 SERVICE RD', 'MOPAC LOT', 'N MOPAC SVRD']


In [123]:
df_neigh = pd.read_csv(r'C:\courses\DataManagement and Vis\project\austin_zip_to_neighborhood_full.csv')
df_pop = pd.read_csv(r'C:\courses\DataManagement and Vis\project\austin_population_by_zip_scraped.csv')
valid_pop = df_pop[df_pop["Population"] > 0]

zip_neigh = set(df_neigh["ZIP_Code"])
zip_pop_valid = set(valid_pop["ZIP_Code"])
zip_street = set(df2["POSTCODE"])

common_zips = zip_neigh & zip_pop_valid & zip_street

print("Common ZIPs:", len(common_zips))
common_zips

Common ZIPs: 40


{78701,
 78702,
 78703,
 78704,
 78705,
 78721,
 78722,
 78723,
 78724,
 78725,
 78726,
 78727,
 78728,
 78729,
 78730,
 78731,
 78732,
 78733,
 78735,
 78736,
 78737,
 78738,
 78739,
 78741,
 78742,
 78744,
 78745,
 78746,
 78747,
 78748,
 78749,
 78750,
 78751,
 78752,
 78753,
 78754,
 78756,
 78757,
 78758,
 78759}

In [124]:
street_zip_common = df2[df2["STREET"].isin(common)
]

street_zip_common_valid = street_zip_common[
    street_zip_common["POSTCODE"].isin(common_zips)
]
streets_with_valid_zip = street_zip_common_valid["STREET"].unique()

print("Number of common streets with valid ZIP:", len(streets_with_valid_zip))
streets_with_valid_zip[:30]  

Number of common streets with valid ZIP: 1703


array(['OLD LOCKHART RD', 'THAXTON RD', 'BRADSHAW RD', 'PINEHURST DR',
       'OLD SAN ANTONIO RD', 'EDEN DR', 'MARY LEWIS DR', 'SUNDAY DR',
       'ONION CREEK PKWY', 'SOUTHERNER WAY', 'DIMITRIOS DR',
       'PRESTON TRAILS DR', 'BELL TOWER LN', 'DEER CHASE TRL',
       'CROWN COLONY DR', 'KENNEDY ST', 'ABBY ANN LN', 'BOCA RATON DR',
       'STEINBECK DR', 'FARRAH LN', 'PINNACLE CREST LOOP', 'MANCHACA RD',
       'BRODIE LN', 'INTERLACHEN LN', 'BUZZ SCHNEIDER LN', 'SUNSET DR',
       'PAVELICH PASS', 'HACIENDA DR', 'DAVE SILK DR', 'WINTER HAVEN RD'],
      dtype=object)