In [None]:
pip install sparse_dot_topn



In [None]:
import pandas as pd 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct  # Leading Juice for us
import time
# Show full column contents. None means "no limit"; the old -1 sentinel was deprecated
# in pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# Helper that extracts the local part of an e-mail address
def email_split(x):
  """Return the text of ``x`` that precedes the first '@'.

  If ``x`` contains no '@', the whole string is returned unchanged.
  """
  local_part, _, _ = x.partition('@')
  return local_part


  


In [None]:
user = pd.read_csv('/content/users from Aug.csv')

# Date settings - based on optimised dates, only users registered in the last 14 days are compared.
created_ts = pd.to_datetime(user['created'], format='%Y-%m-%d %H:%M')

# Measure account age against the newest registration in the export instead of whatever
# happens to sit in row 0, so the window is correct even if the file is not sorted by date.
latest_registration = created_ts.max().date()
user['created'] = created_ts.dt.date
user['days_from_register'] = (latest_registration - user['created']).apply(lambda x: x.days)

# reset_index: later cells use matrix row numbers (0..n-1) to look users up, so the
# filtered frame must carry a contiguous index.
user = user[user['days_from_register'] < 14].reset_index(drop=True)

# Local part of every e-mail address (the text before '@').
# The column keeps its historical name 'RoomTypes' because later cells refer to it.
df = pd.DataFrame({'RoomTypes': user['email'].apply(email_split)})
df.head()

Unnamed: 0,RoomTypes
0,nepalmondal361
1,vishum333
2,notifyamam123
3,dhamgayeindu
4,abhishekyad969


In [None]:
# Last rows of the e-mail local-part frame
df.tail()

Unnamed: 0,RoomTypes
626,ssasirola83
627,sudupalkumirkhala1234
628,kamaleshmondal12345k
629,ajay.negi254441
630,shankarlalfoazdar


In [None]:
# This number is the amount of users that will be compared.
# On average the system will have to check 500 - 1000 users per user registration.
number = len(df)
print(number)

631


In [None]:
# First five rows of the user table
user.head(5)

Unnamed: 0.1,Unnamed: 0,_id,email,created,isBlocked,isKycComplete,isFullKycComplete,username,enterraUserId,blockReason,blockedBy,blockedOn,days_from_register
0,1,600c369da868280027776f6b,nepalmondal361@gmail.com,2021-01-23,False,False,False,Nepal123,155119923,,,,0
1,2,600c34a4a868280027776f69,vishum333@gmail.com,2021-01-23,False,False,False,C0mmander,155119587,,,,0
2,3,600c28daa868280027776f61,notifyamam123@gmail.com,2021-01-23,False,False,False,igotthenuts,155113168,,,,0
3,4,600c1cb5a868280027776f59,dhamgayeindu@gmail.com,2021-01-23,False,False,False,shetty5,155107318,,,,0
4,5,600bfb5ea868280027776f47,abhishekyad969@gmail.com,2021-01-23,False,False,False,Abhi969,155089556,,,,0


In [None]:
#  ngrams: n = 3 by default (character trigrams), as most room types only contain two or three words.
#  The string is first cleaned by removing some punctuation (commas, hyphens, dots, slashes, i.e. ",-./"),
#  then all n-grams of the cleaned string are generated and collected.

def ngrams(string, n=3):
    """Return every run of ``n`` consecutive characters of the cleaned ``string``.

    Cleaning removes the characters ',', '-', '.', '/' and any whitespace character
    that is directly followed by 'BD'. A cleaned string shorter than ``n`` gives an
    empty list.
    """
    cleaned = re.sub(r'[,-./]|\sBD', r'', string)
    # n copies of the text, each starting one character later; zipping them
    # lines up the characters of every n-gram.
    shifted = (cleaned[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted)))

# Quick check that ngrams produces the expected trigrams

print('All 3-grams in "Deluxroom":')
ngrams('Deluxroom')

All 3-grams in "Deluxroom":


['Del', 'elu', 'lux', 'uxr', 'xro', 'roo', 'oom']

In [None]:
# Series of strings to compare (the 'RoomTypes' column of df)
room_types = df['RoomTypes']
# TF-IDF vectoriser that uses ngrams() as its analyzer, i.e. the features are the
# strings returned by ngrams() for each input value
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(room_types)

# Show the stored TF-IDF entries of the first row
print(tf_idf_matrix[0])

  (0, 197)	0.33324462218204615
  (0, 1714)	0.31324399716000767
  (0, 655)	0.2880461896340714
  (0, 946)	0.26486203356352384
  (0, 1957)	0.2448614085414854
  (0, 2122)	0.27905270285074657
  (0, 1890)	0.25905207782870815
  (0, 1782)	0.31324399716000767
  (0, 670)	0.299053327872785
  (0, 2178)	0.2372575238114217
  (0, 1100)	0.27905270285074657
  (0, 1977)	0.33324462218204615


In [None]:
# To calculate the similarity between two vectors of TF-IDF values, the cosine similarity is usually used.
# The result matrix is very sparse, and scikit-learn deals with this nicely by returning a sparse CSR matrix.

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Run ``ct.sparse_dot_topn`` on sparse ``A`` and ``B`` and return the result as a CSR matrix.

    Parameters:
        A: sparse matrix with a ``tocsr()`` method; its row count M sizes the output.
        B: sparse matrix with a ``tocsr()`` method; its column count N sizes the output.
        ntop: passed to ``ct.sparse_dot_topn``; also sizes the output buffers (M * ntop entries).
        lower_bound: passed to ``ct.sparse_dot_topn`` unchanged.

    Returns:
        ``csr_matrix`` of shape (M, N) built from the buffers handed to ``ct.sparse_dot_topn``.

    NOTE(review): presumably the extension keeps, for every row of A·B, at most ``ntop``
    of the largest entries that exceed ``lower_bound`` — confirm against the
    sparse_dot_topn documentation.
    """
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    # index arrays are handed to the extension as 32-bit integers
    idx_dtype = np.int32
 
    # room for ntop stored entries per output row
    nnz_max = M*ntop
 
    # output buffers in CSR layout (row pointers, column indices, values)
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    # the return value is ignored; the three output buffers are expected to be filled in place
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [None]:
#  Top 10 with similarity above 0.71
#  perf_counter() is the clock meant for measuring short durations: it is monotonic and
#  has the highest available resolution, unlike the wall clock returned by time.time().
t1 = time.perf_counter()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.71)
t = time.perf_counter() - t1
print("SELFTIMED:", t)

SELFTIMED: 0.002210378646850586


In [None]:
# unpacks the resulting sparse matrix
def get_matches_df(sparse_matrix, name_vector, top=None):
    """Turn the stored non-zero entries of ``sparse_matrix`` into a table of name pairs.

    Parameters:
        sparse_matrix: SciPy sparse matrix; entry (i, j) is the similarity between
            item i and item j.
        name_vector: names in the same order as the matrix rows/columns (list, array
            or pandas Series). Names are looked up by position, so the index labels
            of a Series do not matter.
        top: maximum number of entries to unpack, taken in stored (row-major) order,
            not by similarity. ``None`` or 0 unpacks all of them; a value larger than
            the number of entries is clamped instead of raising.

    Returns:
        DataFrame with the columns 'left_side', 'right_side' and 'similairity'
        (the spelling is kept because other cells refer to that column name).
    """
    # COO form gives row, column and value of every stored entry in one aligned triple.
    coo = sparse_matrix.tocoo()
    # Drop explicitly stored zeros so the three arrays stay aligned with each other.
    stored = coo.data != 0
    rows, cols, values = coo.row[stored], coo.col[stored], coo.data[stored]

    nr_matches = min(top, rows.size) if top else rows.size

    # Plain object array -> purely positional lookups, for any kind of name container.
    names = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({'left_side': names[rows[:nr_matches]],
                         'right_side': names[cols[:nr_matches]],
                         'similairity': values[:nr_matches].astype(float)})

In [None]:
# Store the matches in a new dataframe called matches_df and print up to 10 samples.
# top=None unpacks every stored match. Passing the user count (`number`) here cut the list
# off after that many entries, which could silently drop matches belonging to the last rows.
matches_df = get_matches_df(matches, room_types, top=None)
# Remove all exact matches (similarity ~ 1.0), such as a name matched with itself.
# NOTE(review): this also removes pairs of different users whose names are identical — confirm that is intended.
matches_df = matches_df[matches_df['similairity'] < 0.99999]
# Never ask for more rows than exist, and fix the seed so the displayed sample is reproducible.
matches_df.sample(min(10, len(matches_df)), random_state=0)

Unnamed: 0,left_side,right_side,similairity
125,tech+testaccount564,tech+testaccount563,0.919886
122,tech+testaccount565,tech+testaccount564,0.919886
131,tech+testaccount563,tech+testaccount566,0.919886
121,tech+testaccount565,tech+testaccount563,0.919886
117,tech+testaccount566,tech+testaccount563,0.919886
126,tech+testaccount564,tech+testaccount565,0.919886
123,tech+testaccount565,tech+testaccount566,0.919886
119,tech+testaccount566,tech+testaccount565,0.919886
118,tech+testaccount566,tech+testaccount564,0.919886
129,tech+testaccount563,tech+testaccount564,0.919886


In [None]:
# Show the 10 matches with the highest similarity (sorted in descending order)
matches_df.sort_values(['similairity'], ascending=False).head(10)

Unnamed: 0,left_side,right_side,similairity
117,tech+testaccount566,tech+testaccount563,0.919886
118,tech+testaccount566,tech+testaccount564,0.919886
119,tech+testaccount566,tech+testaccount565,0.919886
121,tech+testaccount565,tech+testaccount563,0.919886
122,tech+testaccount565,tech+testaccount564,0.919886
123,tech+testaccount565,tech+testaccount566,0.919886
125,tech+testaccount564,tech+testaccount563,0.919886
126,tech+testaccount564,tech+testaccount565,0.919886
127,tech+testaccount564,tech+testaccount566,0.919886
129,tech+testaccount563,tech+testaccount564,0.919886


In [None]:
# (rows, columns) of the match table
matches_df.shape

(12, 3)

In [None]:
# (rows, columns) of the name frame, for comparison with the match table
df.shape

(631, 1)

In [None]:
# Initialising step: assume that all users are genuine; only the suspicious ones are flagged later.

# .copy() gives user1 its own data, so adding a column no longer raises SettingWithCopyWarning,
# and one column assignment replaces the row-by-row loop.
user1 = user.iloc[0:number].copy()
# The flag is stored as the string 'False' (not a boolean) because later cells compare against strings.
user1['Multiple_Accounts'] = 'False'

user2 = pd.merge(user1, df, left_on=user1.index, right_on=df.index)
user2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0.1,key_0,Unnamed: 0,_id,email,created,isBlocked,isKycComplete,isFullKycComplete,username,enterraUserId,blockReason,blockedBy,blockedOn,days_from_register,Multiple_Accounts,RoomTypes
0,0,1,600c369da868280027776f6b,nepalmondal361@gmail.com,2021-01-23,False,False,False,Nepal123,155119923,,,,0,False,nepalmondal361
1,1,2,600c34a4a868280027776f69,vishum333@gmail.com,2021-01-23,False,False,False,C0mmander,155119587,,,,0,False,vishum333
2,2,3,600c28daa868280027776f61,notifyamam123@gmail.com,2021-01-23,False,False,False,igotthenuts,155113168,,,,0,False,notifyamam123
3,3,4,600c1cb5a868280027776f59,dhamgayeindu@gmail.com,2021-01-23,False,False,False,shetty5,155107318,,,,0,False,dhamgayeindu
4,4,5,600bfb5ea868280027776f47,abhishekyad969@gmail.com,2021-01-23,False,False,False,Abhi969,155089556,,,,0,False,abhishekyad969


In [None]:
# Cross-check the similarity matches against the user table and flag the suspicious users.

# A user is flagged when their name appears on either side of any match.
# One vectorised membership test replaces the nested (users x matches) Python loop.
matched_names = pd.concat([matches_df['left_side'], matches_df['right_side']]).unique()
user2.loc[user2['RoomTypes'].isin(matched_names), 'Multiple_Accounts'] = 'True'

user2.head()

Unnamed: 0.1,key_0,Unnamed: 0,_id,email,created,isBlocked,isKycComplete,isFullKycComplete,username,enterraUserId,blockReason,blockedBy,blockedOn,days_from_register,Multiple_Accounts,RoomTypes
0,0,1,600c369da868280027776f6b,nepalmondal361@gmail.com,2021-01-23,False,False,False,Nepal123,155119923,,,,0,False,nepalmondal361
1,1,2,600c34a4a868280027776f69,vishum333@gmail.com,2021-01-23,False,False,False,C0mmander,155119587,,,,0,False,vishum333
2,2,3,600c28daa868280027776f61,notifyamam123@gmail.com,2021-01-23,False,False,False,igotthenuts,155113168,,,,0,False,notifyamam123
3,3,4,600c1cb5a868280027776f59,dhamgayeindu@gmail.com,2021-01-23,False,False,False,shetty5,155107318,,,,0,False,dhamgayeindu
4,4,5,600bfb5ea868280027776f47,abhishekyad969@gmail.com,2021-01-23,False,False,False,Abhi969,155089556,,,,0,False,abhishekyad969


In [None]:
# Number of users per flag value
user2.Multiple_Accounts.value_counts()

False    627
True     4  
Name: Multiple_Accounts, dtype: int64

In [None]:
# These are the users flagged as possible multiple accounts (Multiple_Accounts == 'True')

user2[user2['Multiple_Accounts']=='True']

Unnamed: 0.1,key_0,Unnamed: 0,_id,email,created,isBlocked,isKycComplete,isFullKycComplete,username,enterraUserId,blockReason,blockedBy,blockedOn,days_from_register,Multiple_Accounts,RoomTypes
116,116,117,60069aff27fcd60026cb53a6,tech+testaccount566@9stacks.com,2021-01-19,False,False,False,Testaccount566,154620830,testing,farhan@9stacks.com,,4,True,tech+testaccount566
117,117,118,60069aba27fcd60026cb53a4,tech+testaccount565@9stacks.com,2021-01-19,False,False,False,Testaccount565,154620242,,,,4,True,tech+testaccount565
118,118,119,60069a7427fcd60026cb53a2,tech+testaccount564@9stacks.com,2021-01-19,False,False,False,Testaccount564,154620181,,,,4,True,tech+testaccount564
119,119,120,60069a1a27fcd60026cb53a0,tech+testaccount563@9stacks.com,2021-01-19,False,False,False,Testaccount563,154620123,,,,4,True,tech+testaccount563


In [None]:
# Number of users per value of days_from_register
user['days_from_register'].value_counts()

9     151
10    100
11    66 
8     48 
7     45 
2     37 
13    30 
3     28 
1     27 
5     25 
4     24 
6     21 
12    16 
0     13 
Name: days_from_register, dtype: int64