# First Integration

In [1]:
import pandas as pd

In [3]:
crimes = pd.read_csv(r"E:\Bahare (important)\uni\Master\Data Managenement\Datasets\Municipal_Court_Caseload_Information_FY_2023.csv")  
df1 = pd.read_csv(r"E:\Bahare (important)\uni\Master\Data Managenement\Cleaning Data\df1_unique_streets_only.csv")  
df2 = pd.read_csv(r"E:\Bahare (important)\uni\Master\Data Managenement\Cleaning Data\df2_unique_streets_zipcode.csv")  
neigh = pd.read_csv(r"E:\Bahare (important)\uni\Master\Data Managenement\Datasets\austin_zip_to_neighborhood_scraped.csv")  
pop = pd.read_csv(r"E:\Bahare (important)\uni\Master\Data Managenement\Datasets\austin_population_by_zip_scraped.csv")  

In [4]:
# تمیزکاری نهایی (upper case برای match بهتر)
crimes['street_clean'] = crimes['Offense Street Name'].str.upper().str.strip()
df2['street_clean2'] = df2['street_clean2'].str.upper().str.strip()
df1['street_clean1'] = df1['street_clean1'].str.upper().str.strip()

In [5]:
# مرحله 1: اضافه کردن ZIP به جرایم اصلی (با merge روی cleaned street)
crimes_with_zip = pd.merge(crimes, df2[['street_clean2', 'POSTCODE']], left_on='street_clean', right_on='street_clean2', how='left')
crimes_with_zip.rename(columns={'POSTCODE': 'ZIP_Code'}, inplace=True)

In [6]:
# error اندازه‌گیری: % جرایم که ZIP پیدا شد
match_rate = crimes_with_zip['ZIP_Code'].notna().mean() * 100
print(f"Match rate (ZIP پیدا شد): {match_rate:.2f}%")

Match rate (ZIP پیدا شد): 1.93%


In [7]:
# مرحله 2: اضافه کردن Neighborhood
crimes_with_neigh = pd.merge(crimes_with_zip, neigh, on='ZIP_Code', how='left')

In [8]:
# مرحله 3: اضافه کردن Population (فقط ZIP با جمعیت >0)
valid_pop = pop[pop['Population'] > 0][['ZIP_Code', 'Population']]
crimes_integrated = pd.merge(crimes_with_neigh, valid_pop, on='ZIP_Code', how='left')

In [9]:
# مرحله 4: گروه‌بندی برای تحلیل (تعداد جرم, نرخ جرم per Neighborhood)
crimes_by_neigh = crimes_integrated.groupby('Neighborhood').agg({
    'Offense Case Type': 'count',  # تعداد جرم
    'Population': 'mean',  # جمعیت متوسط (چون یکسان)
    'Race': lambda x: x.value_counts().to_dict() if not x.empty else {}  # توزیع نژاد
}).reset_index()
crimes_by_neigh.rename(columns={'Offense Case Type': 'Crime_Count'}, inplace=True)
crimes_by_neigh['Crime_Rate_per_1000'] = (crimes_by_neigh['Crime_Count'] / crimes_by_neigh['Population']) * 1000

In [10]:
# مرتب‌سازی بر اساس نرخ جرم
crimes_by_neigh = crimes_by_neigh.sort_values('Crime_Rate_per_1000', ascending=False)

In [11]:
# ذخیره نهایی
crimes_integrated.to_csv('final_integrated_crimes.csv', index=False)
crimes_by_neigh.to_csv('analysis_by_neighborhood.csv', index=False)

## First Match Rate

In [12]:
print("Integration تموم شد!")
print("Match rate:", match_rate, "%")
print("\nنمونه تحلیل (محله‌ها با بیشترین نرخ جرم):")
print(crimes_by_neigh.head(10))

Integration تموم شد!
Match rate: 1.9323013669747338 %

نمونه تحلیل (محله‌ها با بیشترین نرخ جرم):
              Neighborhood  Crime_Count    Population  \
6          Downtown Austin          345  11625.000000   
8              East Austin          387  23556.532300   
15            North Austin          475  45596.637895   
24            South Austin          259  35019.069498   
3               Cherrywood           48   6618.000000   
23                Rosedale           61   8426.000000   
0     Allandale, Crestview          158  23847.000000   
29  Tarrytown, Clarksville          134  22194.000000   
30            Wells Branch          131  25555.000000   
11               Hyde Park           83  17071.000000   

                                                 Race  Crime_Rate_per_1000  
6   {'White': 117, 'Black': 20, 'WHITE': 9, 'Middl...            29.677419  
8   {'White': 165, 'Black': 34, 'WHITE': 18, 'Asia...            16.428564  
15  {'White': 178, 'Black': 16, 'WHITE': 14,