In [1]:
pip install sparse_dot_topn



In [2]:
import pandas as pd 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct  # Leading Juice for us
import time
pd.set_option('display.max_colwidth', -1)

def email_split(x):
  token = x.split('@')
  return token[0]


  


In [3]:
user =  pd.read_csv('/content/users from Aug.csv')
df = user[['email','username']].drop(columns = ['username'])
df = df.rename(columns={'email':'RoomTypes'})

# Date settings
df = df.iloc[0:200]

#email splitting
df['RoomTypes'] = df['RoomTypes'].apply(email_split)
df.head()

Unnamed: 0,RoomTypes
0,nepalmondal361
1,vishum333
2,notifyamam123
3,dhamgayeindu
4,abhishekyad969


In [4]:
#  ngrams(here we are taking n = 3 thus 3-gram (trigrams ) as  most room types only contain two or three words
#  used for cleaning and removing some punctuation (dots, comma’s etc) i.e.((,-./)) from a string 
#  and generate and collect all n-grams of the string.  
 
def ngrams(string, n=3):

    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

# Testing ngrams work for verification 

print('All 3-grams in "Deluxroom":')
ngrams('Deluxroom')

All 3-grams in "Deluxroom":


['Del', 'elu', 'lux', 'uxr', 'xro', 'roo', 'oom']

In [5]:
room_types = df['RoomTypes']
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(room_types)

print(tf_idf_matrix[0])

  (0, 91)	0.3135544329180789
  (0, 830)	0.3135544329180789
  (0, 302)	0.3135544329180789
  (0, 459)	0.25215245994294117
  (0, 944)	0.2623425000676693
  (0, 1029)	0.27481410124007394
  (0, 911)	0.27481410124007394
  (0, 858)	0.3135544329180789
  (0, 311)	0.29089279162094606
  (0, 1056)	0.23607376956206896
  (0, 537)	0.29089279162094606
  (0, 954)	0.3135544329180789


In [6]:
# calculate the similarity between two vectors of TF-IDF values the Cosine Similarity is usually used.
# result matrix in a very sparse terms and Scikit-learn deals with this nicely by returning a sparse CSR matrix.

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data,indices,indptr),shape=(M,N))

In [7]:
#  Top 10 with similarity above 0.7
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.70)
t = time.time()-t1
print("SELFTIMED:", t)

SELFTIMED: 0.0011107921600341797


In [8]:
# unpacks the resulting sparse matrix
def get_matches_df(sparse_matrix, name_vector, top=200):
    non_zeros = sparse_matrix.nonzero()
    
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

In [9]:
# store the  matches into new dataframe called matched_df and 
# printing 10 samples
matches_df = get_matches_df(matches, room_types, top=200)
matches_df = matches_df[matches_df['similairity'] < 0.99999] # For removing all exact matches
matches_df.sample(10)

Unnamed: 0,left_side,right_side,similairity
126,tech+testaccount564,tech+testaccount565,0.916392
127,tech+testaccount564,tech+testaccount566,0.916392
122,tech+testaccount565,tech+testaccount564,0.916392
117,tech+testaccount566,tech+testaccount563,0.916392
125,tech+testaccount564,tech+testaccount563,0.916392
119,tech+testaccount566,tech+testaccount565,0.916392
129,tech+testaccount563,tech+testaccount564,0.916392
118,tech+testaccount566,tech+testaccount564,0.916392
131,tech+testaccount563,tech+testaccount566,0.916392
123,tech+testaccount565,tech+testaccount566,0.916392


In [10]:
# printing the matches in sorted order
matches_df.sort_values(['similairity'], ascending=False).head(10)

Unnamed: 0,left_side,right_side,similairity
117,tech+testaccount566,tech+testaccount563,0.916392
118,tech+testaccount566,tech+testaccount564,0.916392
119,tech+testaccount566,tech+testaccount565,0.916392
121,tech+testaccount565,tech+testaccount563,0.916392
122,tech+testaccount565,tech+testaccount564,0.916392
123,tech+testaccount565,tech+testaccount566,0.916392
125,tech+testaccount564,tech+testaccount563,0.916392
126,tech+testaccount564,tech+testaccount565,0.916392
127,tech+testaccount564,tech+testaccount566,0.916392
129,tech+testaccount563,tech+testaccount564,0.916392


In [11]:
matches_df.shape

(12, 3)

In [12]:
df.shape

(200, 1)

In [13]:
user1 = user.iloc[0:200]
for i in user1.index:
  user1.loc[i,'Multiple_Accounts'] = 'False'

user2 = pd.merge(user1,df,left_on= user1.index,right_on=df.index)
user2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0.1,key_0,Unnamed: 0,_id,email,created,isBlocked,isKycComplete,isFullKycComplete,username,enterraUserId,blockReason,blockedBy,blockedOn,Multiple_Accounts,RoomTypes
0,0,1,600c369da868280027776f6b,nepalmondal361@gmail.com,2021-01-23 20:15:49,False,False,False,Nepal123,155119923,,,,False,nepalmondal361
1,1,2,600c34a4a868280027776f69,vishum333@gmail.com,2021-01-23 20:07:24,False,False,False,C0mmander,155119587,,,,False,vishum333
2,2,3,600c28daa868280027776f61,notifyamam123@gmail.com,2021-01-23 19:17:06,False,False,False,igotthenuts,155113168,,,,False,notifyamam123
3,3,4,600c1cb5a868280027776f59,dhamgayeindu@gmail.com,2021-01-23 18:25:17,False,False,False,shetty5,155107318,,,,False,dhamgayeindu
4,4,5,600bfb5ea868280027776f47,abhishekyad969@gmail.com,2021-01-23 16:03:02,False,False,False,Abhi969,155089556,,,,False,abhishekyad969


In [14]:
for i in user2.index:
  for j in matches_df.index:
   if  user2.loc[i,'RoomTypes'] == matches_df.loc[j,'left_side']:
     user2.loc[i,'Multiple_Accounts'] = 'True'
   if  user2.loc[i,'RoomTypes'] == matches_df.loc[j,'right_side']:
     user2.loc[i,'Multiple_Accounts'] = 'True'    

user2.head()

Unnamed: 0.1,key_0,Unnamed: 0,_id,email,created,isBlocked,isKycComplete,isFullKycComplete,username,enterraUserId,blockReason,blockedBy,blockedOn,Multiple_Accounts,RoomTypes
0,0,1,600c369da868280027776f6b,nepalmondal361@gmail.com,2021-01-23 20:15:49,False,False,False,Nepal123,155119923,,,,False,nepalmondal361
1,1,2,600c34a4a868280027776f69,vishum333@gmail.com,2021-01-23 20:07:24,False,False,False,C0mmander,155119587,,,,False,vishum333
2,2,3,600c28daa868280027776f61,notifyamam123@gmail.com,2021-01-23 19:17:06,False,False,False,igotthenuts,155113168,,,,False,notifyamam123
3,3,4,600c1cb5a868280027776f59,dhamgayeindu@gmail.com,2021-01-23 18:25:17,False,False,False,shetty5,155107318,,,,False,dhamgayeindu
4,4,5,600bfb5ea868280027776f47,abhishekyad969@gmail.com,2021-01-23 16:03:02,False,False,False,Abhi969,155089556,,,,False,abhishekyad969


In [15]:
user2.Multiple_Accounts.value_counts()

False    196
True     4  
Name: Multiple_Accounts, dtype: int64

In [16]:
user2[user2['Multiple_Accounts']=='True']

Unnamed: 0.1,key_0,Unnamed: 0,_id,email,created,isBlocked,isKycComplete,isFullKycComplete,username,enterraUserId,blockReason,blockedBy,blockedOn,Multiple_Accounts,RoomTypes
116,116,117,60069aff27fcd60026cb53a6,tech+testaccount566@9stacks.com,2021-01-19 14:10:31,False,False,False,Testaccount566,154620830,testing,farhan@9stacks.com,,True,tech+testaccount566
117,117,118,60069aba27fcd60026cb53a4,tech+testaccount565@9stacks.com,2021-01-19 14:09:22,False,False,False,Testaccount565,154620242,,,,True,tech+testaccount565
118,118,119,60069a7427fcd60026cb53a2,tech+testaccount564@9stacks.com,2021-01-19 14:08:12,False,False,False,Testaccount564,154620181,,,,True,tech+testaccount564
119,119,120,60069a1a27fcd60026cb53a0,tech+testaccount563@9stacks.com,2021-01-19 14:06:42,False,False,False,Testaccount563,154620123,,,,True,tech+testaccount563
