### The objective is to determine whether the offenders have related usernames, for instance whether they share specific keywords.

### Read usernames

In [6]:
import pandas as pd


# Load the filtered dataset; later cells read its 'username' column.
filtered_csv_path = r'data/filtered.csv'
df = pd.read_csv(filtered_csv_path)

In [12]:
# Save only the username column to its own CSV file.
username_column = df['username']
username_column.to_csv('data/usernames.csv')

In [10]:
# Number of distinct usernames; dropna=False counts a missing value as one
# distinct entry, matching len(Series.unique()).
df['username'].nunique(dropna=False)

400796

### Select the first n = 8,000 usernames

In [3]:
# Keep the usernames of the first 8000 rows as a plain Python list.
# NOTE: `df` is rebound from a DataFrame to a list here, so re-running this
# cell without reloading the data fails (a list has no .head()).
first_rows = df.head(8000)
df = first_rows['username'].tolist()

### Calculate Jaro-Winkler similarity

<a href="https://srinivas-kulkarni.medium.com/jaro-winkler-vs-levenshtein-distance-2eab21832fd6">Here is a detailed explanation of the calculation</a>

In [5]:
import time

import numpy as np
from sklearn.cluster import AffinityPropagation
import jellyfish

# Start the clock only after the imports so the reported time covers the
# similarity computation and not module loading. perf_counter() is the
# high-resolution clock intended for measuring durations.
st = time.perf_counter()

# Array form so that the usernames can later be indexed with arrays of positions.
words = np.asarray(df)

# Pairwise Jaro-Winkler similarity: entry [i][j] compares words[j] with words[i].
# NOTE: despite its name, `lev_similarity` holds Jaro-Winkler similarities, not
# Levenshtein distances; the name is kept because later cells refer to it.
lev_similarity = np.array(
    [[jellyfish.jaro_winkler_similarity(w1, w2) for w1 in words] for w2 in words]
)

# Stop the clock and report the elapsed time.
et = time.perf_counter()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')


Execution time: 35.043111085891724 seconds


### Cluster using AffinityPropagation algorithm

<a href="https://www.geeksforgeeks.org/affinity-propagation-in-ml-to-find-the-number-of-clusters/">Here is a detailed explanation of the calculation</a>

In [4]:
# Cluster the usernames from the precomputed similarity matrix.
# random_state is fixed so that repeated runs give the same clusters.
affprop = AffinityPropagation(affinity="precomputed", damping=0.9, random_state=0)
affprop.fit(lev_similarity)



### Top 10 clusters by number of usernames

In [5]:
# One row per username: its cluster label, the position of that cluster's
# exemplar in `words`, the exemplar username, and the username itself.
exemplar_positions = affprop.cluster_centers_indices_[affprop.labels_]
results = pd.DataFrame({
    'cluster': affprop.labels_,
    'centers': exemplar_positions,
    'centers_words': words[exemplar_positions],
    'words': words,
})

# Count the usernames in each cluster and list the largest clusters first.
df_grouped = (
    results
    .groupby(['cluster', 'centers', 'centers_words'])
    .size()
    .reset_index(name='counts')
    .sort_values(by=['counts'], ascending=False)
)
df_grouped.head(10)



Unnamed: 0,cluster,centers,centers_words,counts
6,6,45,boy,79
521,521,5949,alo,46
139,139,1489,boylover,39
138,138,1485,marco,38
213,213,2418,boyboy,34
423,423,4746,alala,33
87,87,765,sala,31
454,454,5110,saraa,31
339,339,3843,anon8,30
74,74,652,tera,29


### Show the usernames in these 10 groups

In [6]:
# For each of the 10 largest clusters, print its exemplar and all member usernames.
top_clusters = df_grouped.head(10)
for cluster_id in top_clusters['cluster']:
    members = results.loc[results['cluster'] == cluster_id, 'words']
    exemplar = df_grouped.loc[df_grouped['cluster'] == cluster_id, 'centers_words'].values
    printing = ', '.join(members)

    # Display result
    print('Cluster ', exemplar, '(id:', cluster_id, '):\n', printing, '\n')

Cluster  ['boy'] (id: 6 ):
 boy, boydaddyluvbb, boyfreakk, boylickr, boyyyish, boyhood16, cpboy1, aboyaddict, boys1998, boopygoop, boysrus222, Lilboyperv, boylust, boysafe, boqw, newboysfour, boyvidsv4040, boymania, boby, boypassion, badboy225222, boysFTW, boysngirls, boys1212, boyfuck, boys4all, boysweet, Lboy13, boysucker01, pboy82, boykeeee, boydip, boypussyeater, boysclown, boyseeker, boy100, slwtboy762, boyunder, sboygirl69, boyamour, kissboycam, boyfan2, 1boy, boyuser, boyvidscck, boy2girl, boyrimmer, lboyblue, boyvids13, boyforonion, Kboy, boysarebest, borissydney, boyzzz69, Gamerboylover, boyforce, boy919191, rboyvids4, bobbyv, luvboy840, boy0000, boyorgasm421, boy13601, broly69, TobyKaa, boyyluvv, boysflix, boy811, bonjourb0yscatl0ve, boy218416, boy999, vidboy995, bboy22, bodyglow, boy03, boysandgirls, boyshot, Bigboy77, Bigboy1212 

Cluster  ['alo'] (id: 521 ):
 Marlboro, alcott, analjohn, Carlos, Galo137, ValkoctanLover, siralop1, paulovando, Malmol, wallon, madlock, aly5, F

### Show the usernames in all the groups

In [7]:
# Print every cluster as "exemplar: sorted, de-duplicated member usernames".
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    in_cluster = affprop.labels_ == cluster_id
    members = np.unique(words[in_cluster])
    print(f" - *{exemplar}:* {', '.join(members)}")

 - *NewBee:* NameILee, Nele, Neoprene, NewBee, NewerBaby, Newmember, PNB, seksweex19
 - *BrC:* BHC, BrC
 - *Juni:* Brunit, JuanM, Juanchi99, Juni, Junyiores, Juri_6, Juzuis32, Yutani, eunil, until9000
 - *Jamie:* Camikase5050, Catamite, James Marsden, Jamie, Jamie_Boy, Jasminec, Ramires21, SammyVideosTopic1, abmis, hmmlike, j@mie, quagmire68
 - *abc54321:* Bt654321, ab33, abc1122, abc54321, abcd301, akg321, alek54321, ba543210, cccc4321, dcba4321dcba, mia54321, niar432, ta4212
 - *delta:* Kjelt17, Kul'tovajka, Relax, Zelltrack, d33pthwat, default123, delamyr, dellxp, delonay, delta, deltakirby, devilx, videotrader
 - *boy:* 1boy, Bigboy1212, Bigboy77, Gamerboylover, Kboy, Lboy13, Lilboyperv, TobyKaa, aboyaddict, badboy225222, bboy22, bobbyv, boby, bodyglow, bonjourb0yscatl0ve, boopygoop, boqw, borissydney, boy, boy0000, boy03, boy100, boy13601, boy218416, boy2girl, boy811, boy919191, boy999, boyamour, boydaddyluvbb, boydip, boyfan2, boyforce, boyforonion, boyfreakk, boyfuck, boyhood16,