In [18]:
import pandas as pd
from pathlib import Path
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
df = pd.read_csv(Path("../data/cincy311_cleaned.tsv"), delimiter = "|")

In [42]:
unique_codes = df.service_code.str[1:-1].unique().tolist()

In [43]:
unique_codes

['BLD-RES',
 'PLMB_DEF',
 'HNDCPPRK',
 'GRCRYCNC',
 'ROACHES',
 'BD_UNSAN',
 'LITR-PRV',
 'SRVCMPSC',
 'MOLD',
 'DMGTROD',
 'BLDRRFIF',
 'RATS',
 'BLD_VACR',
 'SVCCMPLT',
 'SIDWLKH',
 'STRRPR',
 'NW/MSSNG',
 'DUMP-PVS',
 'BLD_VACC',
 'MICE-BLD',
 'LGHT,NWS',
 'DOG_WSTE',
 'PRKNGYRD',
 'PAVMK',
 'ABAN-VPR',
 'DFLTPARK',
 'ADASDWKO',
 'SGNSHPNI',
 'CRNCAN-N',
 'SWGTSDOP',
 'ANIMBDOG',
 'GRFITI',
 'BLD-COM',
 'TLGR-PRV',
 'BLD_USER',
 'TRTRIMUF',
 'CLEAN-ST',
 'RTSNSWR',
 'RCNEWLIC',
 'USE_R',
 'SVCCMPLY',
 'NCRHMNT,',
 'HEAT',
 'SIGNREMV',
 'SRVCMPCS',
 'TLGR-PS',
 'GUARDR',
 'FOOD_RES',
 'TRFFCSLN',
 'FDBRNLLN',
 'ABAN-VNA',
 'TREEPR',
 'SEWAG_EX',
 'FENCLINR',
 'CBHBBE',
 'TTRMPVUF',
 'LTRSTPNH',
 'FACTPARK',
 'CPARKF-C',
 'RQSTNGHB',
 'LIGHTSIH',
 'PLANTERS',
 'TPLANT',
 'SGNL,TNW',
 'CHNGFLSH',
 'FRHYDRNT',
 'BLDFRESC',
 'HOMEOWNI',
 'STERMPRK',
 'SGNL,TRF',
 'RATSOUT',
 'RSTRNTRC',
 'BUSFMRES',
 'SDWLK,NW',
 'LTTR-BLD',
 'WDSBSTRC',
 'STRTPLTC',
 'CAFESRSP',
 'BLD_COLL',
 'RSTRNTIC'

In [44]:
tf_idf = TfidfVectorizer()



In [45]:
dt = tf_idf.fit_transform(unique_codes)
clf = AgglomerativeClustering(n_clusters=10, affinity=cosine_similarity, linkage="complete")
model = clf.fit_predict(dt.toarray())

In [46]:
model

array([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 3,
       5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 6, 3, 5, 6, 3, 5, 6, 3, 5, 6, 3, 5,
       6, 5, 3, 6, 5, 3, 6, 5, 3, 6, 5, 3, 6, 5, 3, 6, 5, 3, 6, 5, 3, 6,
       5, 3, 6, 3, 5, 6, 3, 6, 9, 5, 3, 6, 9, 5, 3, 6, 9, 5, 3, 6, 9, 5,
       3, 6, 9, 5, 3, 6, 9, 5, 3, 6, 9, 5, 3, 6, 9, 5, 3, 6, 9, 5, 3, 6,
       9, 5, 3, 6, 9, 5, 3, 6, 9, 5, 3, 7, 6, 9, 5, 3, 7, 6, 9, 5, 7, 3,
       6, 9, 5, 7, 3, 6, 9, 5, 7, 3, 6, 9, 5, 7, 3, 6, 9, 5, 7, 3, 6, 9,
       5, 7, 3, 6, 9, 5, 7, 3, 6, 9, 5, 7, 3, 6, 9, 5, 7, 3, 6, 9, 5, 7,
       3, 6, 9, 5, 7, 3, 6, 9, 5, 7, 3, 6, 9, 5, 7, 3, 6, 9, 5, 7, 3, 6,
       9, 5, 7, 3, 6, 9, 5, 7, 3, 6, 9, 5, 7, 3, 9, 6, 5, 7, 3, 9, 6, 5,
       7, 3, 9, 6, 5, 7, 3, 9, 6, 5, 7, 3, 9, 6, 5, 7, 3, 9, 6, 5, 9, 7,
       3, 6, 5, 9, 7, 3, 6, 5, 9, 7, 3, 6, 5, 9, 7, 3, 6, 5, 9, 7, 3, 6,
       5, 9, 7, 3, 2, 6, 5, 9, 7, 3, 4, 2, 6, 9, 5, 7, 3, 4, 2, 6, 9, 5,
       3, 7, 1, 4, 2, 6, 9, 5, 3, 7, 1, 4, 2, 6, 9,

In [47]:
model_results_df = pd.DataFrame([unique_codes, model]).T
model_results_df.columns = ["service_name", "cluster_class"]

In [48]:
model_results_df

Unnamed: 0,service_name,cluster_class
0,BLD-RES,5
1,PLMB_DEF,5
2,HNDCPPRK,5
3,GRCRYCNC,5
4,ROACHES,5
...,...,...
396,RCYCLNSR,5
397,TMPRRYRP,3
398,FDOOR_MD,4
399,PVMTPRDR,0


In [51]:
for i in model_results_df.cluster_class.unique():
    display(model_results_df[model_results_df.cluster_class == i])

Unnamed: 0,service_name,cluster_class
0,BLD-RES,5
1,PLMB_DEF,5
2,HNDCPPRK,5
3,GRCRYCNC,5
4,ROACHES,5
...,...,...
347,SCARPOOR,5
359,STERMDES,5
370,WASPPARK,5
383,PLCJUNKV,5


Unnamed: 0,service_name,cluster_class
19,MICE-BLD,3
21,DOG_WSTE,3
23,PAVMK,3
25,DFLTPARK,3
27,SGNSHPNI,3
...,...,...
348,UTILASPH,3
360,LIFESAFF,3
371,BCYCLABN,3
384,CLG-INLT,3


Unnamed: 0,service_name,cluster_class
32,BLD-COM,6
35,TRTRIMUF,6
38,RCNEWLIC,6
41,"NCRHMNT,",6
44,SRVCMPCS,6
...,...,...
351,BEAUPARK,6
362,AAADACCC,6
373,BLDUNSC,6
386,ELECNOPW,6


Unnamed: 0,service_name,cluster_class
74,LTTR-BLD,9
78,BLD_COLL,9
82,BUSSTRSP,9
86,ZONCCER,9
90,MLDCHLDR,9
94,VDRPRCW,9
98,FDBRNCMP,9
102,USE_C,9
106,ERC_PV,9
110,PARKINGC,9


Unnamed: 0,service_name,cluster_class
121,BLD-ELEH,7
126,BLDFRFAC,7
130,TLGR-PRB,7
135,LIGHREMV,7
140,DWAYPPST,7
145,HISTMARC,7
150,FRDRLCK,7
155,TRREPR,7
160,PTHOLE,7
165,YDWSTA-J,7


Unnamed: 0,service_name,cluster_class
268,TLGR-BLD,2
275,NFRMTNRQ,2
282,SCHLANNL,2
290,SGNL_TNW,2
298,LTTR-REC,2
306,YRD-WSTE,2
314,TRSBLCKS,2
323,RLRDSCOM,2
333,HANDRL,2
343,VHCLPRGS,2


Unnamed: 0,service_name,cluster_class
274,BLD-ELVP,4
281,RCNBTSRM,4
289,LTTR-PRK,4
297,TLGR-DTE,4
305,GYMSHSW,4
313,LOT_VACE,4
322,HIKEPARK,4
331,LANDSLDE,4
340,RECSRVCP,4
350,TREEPARK,4


Unnamed: 0,service_name,cluster_class
288,BLD-ELIC,1
296,VOID-REP,1
304,MSQTSPN,1
312,PDSTLLGH,1
321,MDDVSRY,1
330,HTS-STP,1
339,SWAP18,1
349,FREPREVI,1
355,LTTR-PFC,1
358,BLD-DISV,1


Unnamed: 0,service_name,cluster_class
319,BLD-DUNS,8
327,TLGR-REC,8
335,PLMB HAZ,8
345,LTTR-CDV,8
356,SCREGELM,8
367,POTHPARK,8
379,SIGNOPLE,8
391,HIGH,8


Unnamed: 0,service_name,cluster_class
342,BLD-DCOR,0
352,BCYCLSTN,0
363,NMLBTRP,0
374,BLD-RCO,0
375,SGN_TNW,0
381,TLGR-HLT,0
387,GRFITI-H,0
393,BARRIC-H,0
399,PVMTPRDR,0
