# Account Takeover
This notebooks is using the Levenstein's distance to identify similar user login attempts. Attackers are using bots, sometimes spanning long time periods, that attempts different usernames with small changes. The Levenstein algorithm is able to measure the differences in lgins and cluster them to different groups that can be further analyzed.

In [1]:
import boto3
import base64
from botocore.exceptions import ClientError
import ast

def get_secret():

    secret_name = "GBRedshiftSecret"
    region_name = "us-east-2"

    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    # In this sample we only handle the specific exceptions for the 'GetSecretValue' API.
    # See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
    # We rethrow the exception by default.

    try:
        get_secret_value_response = client.get_secret_value(
            SecretId=secret_name
        )
    except ClientError as e:
        if e.response['Error']['Code'] == 'DecryptionFailureException':
            # Secrets Manager can't decrypt the protected secret text using the provided KMS key.
            # Deal with the exception here, and/or rethrow at your discretion.
            raise e
        elif e.response['Error']['Code'] == 'InternalServiceErrorException':
            # An error occurred on the server side.
            # Deal with the exception here, and/or rethrow at your discretion.
            raise e
        elif e.response['Error']['Code'] == 'InvalidParameterException':
            # You provided an invalid value for a parameter.
            # Deal with the exception here, and/or rethrow at your discretion.
            raise e
        elif e.response['Error']['Code'] == 'InvalidRequestException':
            # You provided a parameter value that is not valid for the current state of the resource.
            # Deal with the exception here, and/or rethrow at your discretion.
            raise e
        elif e.response['Error']['Code'] == 'ResourceNotFoundException':
            # We can't find the resource that you asked for.
            # Deal with the exception here, and/or rethrow at your discretion.
            raise e
    else:
          # Decrypts secret using the associated KMS CMK.
        # Depending on whether the secret is a string or binary, one of these fields will be populated.
        if 'SecretString' in get_secret_value_response:
            secret = get_secret_value_response['SecretString']
            return(secret)
        else:
            decoded_binary_secret = base64.b64decode(get_secret_value_response['SecretBinary'])
            return(decode_binary_secret)

secret = get_secret()
USER = ast.literal_eval(secret)['username']
PASSWORD = ast.literal_eval(secret)['password']
HOST = ast.literal_eval(secret)['host']

In [2]:
#Install prerequisites
!pip install distance

[33mYou are using pip version 10.0.1, however version 19.2.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## Data read

In [3]:
import pandas as pd
import warnings
import psycopg2
import time
import numpy as np
import sklearn.cluster
import distance
import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', -1)
warnings.filterwarnings('ignore')
url_string = f"dbname='prod' port=5439 user='{USER}' password='{PASSWORD}' host='{HOST}'"
con = psycopg2.connect(url_string)
print(time.ctime(), "Redshift connection Successful!")
cur = con.cursor()
#Collect data for account takeovers. Here we just sample 1000 records from all useractions data.
sqlmsg = "with ts as \
          (select max(session_ts) - 1 * 2 * 3600000 as session_ts \
          from page_loads_tf) \
          select session_ts, session_uuid, dom_value\
          from useractions_tf\
          where dom_element = 'INPUT#login-email' and client_action = 'change' and session_ts > (select * from ts)\
          limit 1000;"
cur.execute(sqlmsg)
data = pd.DataFrame(cur.fetchall(), columns=['session_ts', 'session_uuid', 'dom_value'])
data['username'] = data.dom_value.apply(lambda x: x.split('@')[0])
data['domain'] = data.dom_value.apply(lambda x: x.split('@')[1] if x.rfind('@')>0 else x)
data.head()

  """)


Tue Aug 13 14:30:45 2019 Redshift connection Successful!


Unnamed: 0,session_ts,session_uuid,dom_value,username,domain
0,1565697681261,10ad51d0-bdc2-11e9-82ee-0ebc11e33b1c,jolin@charter.net,jolin,charter.net
1,1565697745657,370f5e90-bdc2-11e9-8118-0ef017bbf018,seise81@gmail.com,seise81,gmail.com
2,1565697895142,9028f860-bdc2-11e9-8118-0ef017bbf018,spencerbarb@hotmail.com,spencerbarb,hotmail.com
3,1565697946731,aee8d3b0-bdc2-11e9-82ee-0ebc11e33b1c,llallen,llallen,llallen
4,1565697838787,6e91e130-bdc2-11e9-8118-0ef017bbf018,israelhsmith95@gmail.com,israelhsmith95,gmail.com


## Run clustering algorithm

In [4]:
#Run clustering algorithm per domain
output = np.empty((0, 3))
for dom in data.domain.unique():
    data2 = data.loc[data.domain == dom]
    words = np.asarray(data2.username.unique()) #So that indexing with a list will work
    if(len(words) < 10):
        continue
    #Zero for the same words, higher value for close words.
    lev_similarity = -1*np.array([[distance.levenshtein(w1,w2) for w1 in words] for w2 in words])

    affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
    affprop.fit(lev_similarity)
    print(f'Domain: {dom}:')
    if(len(affprop.cluster_centers_indices_) == 0):
        print('AffinityPropagaation did not converge. All usernames are different.')
    else:
        for cluster_id in np.unique(affprop.labels_):
            exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
            cluster = np.unique(words[np.nonzero(affprop.labels_==cluster_id)])
            if(len(cluster) > 5):
                cluster_str = ", ".join(cluster)
                #print(" - *%s:* %s" % (exemplar, cluster_str))
                output = np.append(output, [np.array((dom, exemplar, cluster_str))], axis=0)

output_df = pd.DataFrame(output, columns=['domain', 'exemplar', 'logins'])
output_df['size'] = output_df.logins.apply(lambda x: len(x))
output_df

Domain: gmail.com:
Domain: hotmail.com:
Domain: sbcglobal.net:
Domain: comcast.net:
Domain: yahoo.com:
Domain: msn.com:
Domain: aol.com:
Domain: verizon.net:
Domain: att.net:


Unnamed: 0,domain,exemplar,logins,size
0,gmail.com,johnst105,"Ngill1104, ehester58, golions1441, israelhsmith95, jimmyj555, joeestone96, johnschaefer56, johnst105, jonmtaft, joshuamk4999, rahrens1138, rhonda803",148
1,gmail.com,tompie76,"jojofixit666, koprench, liaobp1983, lmzieg79, maggiievw1963, pkneidel54, rettieM141, rmprice7377, thatzlife99, tom.dicuccio, tompie76",133
2,gmail.com,lareddy,"42carjock, dalejrbud8fan, dbpindy, faveagent, flairence, fred.boyscout, garyvecc, lagrn286, lareddy, lbebugxyz, linzcade, marimkuhl, mrbothy, olivepalima, r.reed.carey, staceyszyman, tan357mjc, waynejkm",202
3,gmail.com,telvaus,"Lilsassy882, dhove01, kellysnelson24, memeaac, petek7585, talon3000, teamv12.joe, telvaus, texasmaggs45, tkeeler10, tropicalmaisy, waltadams01",142
4,gmail.com,bjayst,"73rmsm, awa338, bekaj13, bjayst, cstasny33, djlaw741, dw9663, edyyytia, gluyas, hty922, jimsab, kinyt68, markakaster, mmcalsr, njbgt1",133
5,gmail.com,rathbungalen,"Heatherhale420, globalfountain, rachblack502, rachelrfuller7, rathbungalen, tntimbershelton",91
6,gmail.com,mc.colt12,"5acegolfer, bricksec2, cocobfannon82, gonzalo1267, kfcolvin, kimballk128, mc.colt12, mcossman615, mcovert31669, mongoose2, nishacoley90, skypilot18",147
7,gmail.com,dgaiche,"1946richo, catti.michele, chscde, coachjmb, dbynature, degrae4025, dgaiche, dghatten, dhuggins985, donaheyb, drankin932, evattnae, gabainwi, jdcawill, jmarble, rcbabicz, rrwin75, wdsmith71",188
8,gmail.com,msjhord,"bnszihlman, darahoxford, djkword, goinsrichard1, hsmithtmoran, mdschulz90, mrshuhhuh, msjhord, msjtmiles, rhschwarze",116
9,gmail.com,bodyno358,"bigloudog63, bodyno358, brythgo135, curly13681, ds9205940355, eldonfox31, jbowman343, peggy.goins53, todd.noyes",111


In [None]:
ax = output_df.plot.bar(x='exemplar', y='size', rot=45, figsize=(40,20), fontsize=30)
plt.show()

In [None]:
domain_df = output_df.groupby('domain').size()
domain_df.columns = ['num']
ax = domain_df.plot.pie(y='num', figsize=(20,10), fontsize=30)
plt.show()