In [17]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


# Read the violations CSV.
df = pd.read_csv('property_and_building_violations.csv')


# Keep only the non-null descriptions, as lower-cased strings.
descriptions = (
    df['description']
    .dropna()
    .astype(str)
    .str.lower()
)


# Build a document-term matrix whose columns are 3-word sequences.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(3, 3))
X = vectorizer.fit_transform(descriptions)


# Column totals give one overall count per trigram; pair each trigram with
# its total and order the pairs from most to least frequent.
sum_words = X.sum(axis=0)
words_freq = sorted(
    ((word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)


# Show the 50 most frequent trigrams.
freq_df = pd.DataFrame(words_freq[:50], columns=['trigram', 'count'])
print(freq_df)


                        trigram  count
0         failure obtain permit   4006
1          failed comply permit    444
2            comply permit term    444
3         failure secure permit    421
4          building use premise    285
5               use premise req    285
6       protection adj property    264
7           use premises permit     65
8        electrical work permit     62
9     mechanical execution work     60
10     maintenance means egress     42
11      emergency escape rescue     40
12          guarding live parts     38
13           failed comply prmt     38
14            comply prmt terms     38
15          mass state plumbing     30
16          state plumbing code     30
17            exits exit access     25
18            number exits exit     24
19     illegally occupied prior     23
20            acts 1956 amended     21
21              stop work order     20
22      accessible means egress     18
23  municipal bylaws ordinances     17
24          permits perfo

In [20]:
import pandas as pd


# Read the violations CSV into a fresh DataFrame for this cell.
df = pd.read_csv('property_and_building_violations.csv')


def categorize_violation(desc):
    """Map a free-text violation description to a coarse category label.

    Matching is a case-insensitive substring search. The rules are tried in
    the order they are listed and the first match wins, so a description that
    mentions both a permit and electrical work is labelled 'Permit Issues'.

    Parameters
    ----------
    desc : object
        The violation description. Missing values (``None``, ``NaN``,
        ``pd.NA``) map to 'Unknown'. Any other non-string value is converted
        with ``str()`` before matching rather than raising ``AttributeError``.

    Returns
    -------
    str
        'Unknown', 'Permit Issues', 'Property Maintenance Issue',
        'Unsafe Conditions', 'Electrical Violation', 'Premise Violation',
        'Certification Issues', 'Water Violation', or 'Other' when no
        keyword matches.
    """
    if pd.isna(desc):
        return 'Unknown'

    # Coerce first: a numeric (or other non-string) description has no
    # .upper() method and would otherwise crash the whole .apply() call.
    text = str(desc).upper()

    # (keywords, label) pairs in priority order; earlier rules shadow later ones.
    rules = (
        (('PERMIT', 'COMPLY'), 'Permit Issues'),
        # NOTE(review): 'MAINTAIN' is not a substring of 'MAINTENANCE', so
        # descriptions that only say "maintenance" fall through this rule.
        # Confirm whether that is intended before widening the keyword.
        (('MAINTAIN',), 'Property Maintenance Issue'),
        (('UNSAFE', 'DANGEROUS'), 'Unsafe Conditions'),
        (('ELECTRICAL', 'WIRING'), 'Electrical Violation'),
        (('PREMISE',), 'Premise Violation'),
        (('CERTIFICATION',), 'Certification Issues'),
        (('WATER', 'PLUMBING'), 'Water Violation'),
    )
    for keywords, label in rules:
        if any(keyword in text for keyword in keywords):
            return label
    return 'Other'


# Label every row with a category derived from its free-text description.
df['violation_category'] = df['description'].apply(categorize_violation)


# Keep only rows whose ward, rendered as stripped text, consists solely of
# digits; every other row is dropped from the analysis.
df['ward_str'] = df['ward'].astype(str).str.strip()
df = df[df['ward_str'].str.isdigit()]


# One row per (category, ward) pair; 'count' tallies the non-null case_no values.
agg = df.groupby(['violation_category', 'ward_str'], as_index=False).agg(count=('case_no', 'count'))


# Write the aggregated counts to disk without the positional index.
csv_path = 'violation_counts_by_category_ward.csv'
agg.to_csv(csv_path, index=False)


# Print the heading on its own line: passing the frame as a second print()
# argument put the heading on the same line as the column header and pushed
# the header out of alignment with the rows beneath it.
print("Preview: Counts by Category and Ward")
print(agg.head(20))

print(f"CSV file saved to: {csv_path}")


Preview: Counts by Category and Ward       violation_category ward_str  count
0   Certification Issues       01     98
1   Certification Issues       02      7
2   Certification Issues       03    320
3   Certification Issues       04     35
4   Certification Issues       05    174
5   Certification Issues       06     21
6   Certification Issues       07     20
7   Certification Issues       08     15
8   Certification Issues       09     38
9   Certification Issues       10      9
10  Certification Issues       11      5
11  Certification Issues       12      9
12  Certification Issues       13     13
13  Certification Issues       14     17
14  Certification Issues       15      4
15  Certification Issues       16      2
16  Certification Issues       17      1
17  Certification Issues       18     31
18  Certification Issues       19      2
19  Certification Issues       20      2
CSV file saved to: violation_counts_by_category_ward.csv


Check out the Flourish visualizations here: https://public.flourish.studio/visualisation/22902450/