# Detecting cybercrimes using similarity models


## OS

In [1]:
import os # The OS module in Python provides a way of using operating system dependent functionality
import pandas as pd # Read data from file

In [2]:
os.listdir("C:/Users/Alejandra/Desktop/Proyecto") # Reads and shows files in entered location

['.ipynb_checkpoints',
 'data',
 'datos prueba',
 'download_utils.py',
 'GoogleNews-vectors-negative300.bin',
 'grader.py',
 'My First Project-689f409050bd.json',
 'NLP.ipynb',
 'prueba1.xlsx',
 'prueba2.xlsx',
 'setup.py',
 'solution.py',
 'test_embedding.tsv',
 'tweets',
 'util.py',
 'validation.tsv',
 '__pycache__']

## Data

In [3]:
data = pd.read_excel("prueba1.xlsx",sheet_name="Archive") # Importing the database (tweets)

In [4]:
data.head() # Returns the first rows of the database

Unnamed: 0,id_str,from_user,text,created_at,time,geo_coordinates,user_lang,in_reply_to_user_id_str,in_reply_to_screen_name,from_user_id_str,in_reply_to_status_id_str,source,profile_image_url,user_followers_count,user_friends_count,user_location,status_url,entities_str
0,1254475950059896832,elk2812,@Fernand60915270 @farmerguanabana @jflafaurie ...,Sun Apr 26 18:22:28 +0000 2020,2020-04-26 19:22:28,,,1.189782e+18,Fernand60915270,1160001175380484097,1.254359e+18,"<a href=""http://twitter.com/download/android"" ...",http://pbs.twimg.com/profile_images/1160001592...,1708,3296,,http://twitter.com/elk2812/statuses/1254475950...,"{""hashtags"":[],""symbols"":[],""user_mentions"":[{..."
1,1254469890683084800,JhojanObregon,Este señor parece que solo lee el título de no...,Sun Apr 26 17:58:23 +0000 2020,2020-04-26 18:58:23,,,,,201605528,,"<a href=""http://twitter.com/download/iphone"" r...",http://pbs.twimg.com/profile_images/1246568582...,165,634,"Lima: -12.062106, -77.036526",http://twitter.com/JhojanObregon/statuses/1254...,"{""hashtags"":[],""symbols"":[],""user_mentions"":[]..."
2,1254419035208462336,farcila20,El cómplice de Santrich en Colombia goza de to...,Sun Apr 26 14:36:18 +0000 2020,2020-04-26 15:36:18,,,,,1210573841153581057,,"<a href=""http://twitter.com/download/iphone"" r...",http://pbs.twimg.com/profile_images/1210574132...,532,963,,http://twitter.com/farcila20/statuses/12544190...,"{""hashtags"":[],""symbols"":[],""user_mentions"":[]..."
3,1254261736812810241,Tapas667,@Lucho_Pipe10 @Elbatobatillo Pues colombia apo...,Sun Apr 26 04:11:15 +0000 2020,2020-04-26 05:11:15,,,8.199648e+17,Lucho_Pipe10,1062105377637052419,1.25425e+18,"<a href=""http://twitter.com/download/android"" ...",http://pbs.twimg.com/profile_images/1241786730...,24,52,"Bogotá, D.C., Colombia",http://twitter.com/Tapas667/statuses/125426173...,"{""hashtags"":[],""symbols"":[],""user_mentions"":[{..."
4,1254132061771882498,FlorMariaSerna5,@DELAESPRIELLAE @AlexLopezMaya Honorable Dr AB...,Sat Apr 25 19:35:58 +0000 2020,2020-04-25 20:35:58,,,548906700.0,DELAESPRIELLAE,1224110745970991109,1.253837e+18,"<a href=""http://twitter.com/download/android"" ...",http://pbs.twimg.com/profile_images/1251262160...,57,64,,http://twitter.com/FlorMariaSerna5/statuses/12...,"{""hashtags"":[],""symbols"":[],""user_mentions"":[{..."


In [5]:
text=data['text'].tolist() # converting to list the column "text" where it contains the tweets

In [6]:
#print(text) # printing the tweets

In [7]:
len(text) # number of tweets

44

In [8]:
tweet_1 =[text[0]] # accessing to the first tweet from the list 'text'
tweet_1

['@Fernand60915270 @farmerguanabana @jflafaurie @HenaoBernardo @Fedegan @GaulaMilitares @GaulaPolicia @NobelPrize @JuanManSantos @JEP_Colombia @PartidoFARC @IvanDuque @CongresoCol Por La Empresa e Inversión es que Ud y Su Flia Pueden Comer y Disfrutar  de Medios para que uds PARASITOS VIVAN porque Trabaja Mas facil una Pala Empeñada que un Solo MAMERTOS de Uds Se Mueva y Algo por este País Viven de Gorra del Secuestro la Extorsión y Se Camuflan de Lideres']

## Data Preparation

- ## Functions to remove URL links, @mention, #hashtags

In [9]:
import string # String contains methods that allow the use of characters which are considered punctuation characters, 
                # digits, uppercase, lowercase, etc.
import re # Regular expression operations

In [10]:
def strip_links(text): # function to remove/strip URL links
    link_regular_expression = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links = re.findall(link_regular_expression, text)
    for link in links:
        text = text.replace(link[0], ', ')   
    return text

In [11]:
def strip_all_entities(text): # function to remove/strip mentions, hashtags, characters from some users
    entity_prefixes = ['@','#','\\','_']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

source: https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression

In [12]:
def strip_all(list_text): # function that removes all URL links, @mention, #hashtags
    list_stripped = []
    for t in list_text:
        word = strip_all_entities(strip_links(t))
        list_stripped.append(word)
    return(list_stripped)

In [13]:
tweets = strip_all(text)
#tweets  # printing the tweets

- ## Function to convert tweets in lowercase

In [14]:
def lower_case(list_text): # Convert lowercase tweets for language processing
    list_lower_case = []
    for i in list_text:
        word = i.strip()
        new_word = word.lower()
        list_lower_case.append(new_word)
    return(list_lower_case)

In [15]:
tweets = lower_case(tweets)

#tweets  # printing the tweets

- ## Replace the emojis for their names

In [27]:
import emoji # Emoji codes

In [28]:
def replace_name_emoji(list_text): # function that replaces emoticons by their names
    list_name_emoji = []
    for l in list_text:
        list_name_emoji.append(emoji.demojize(l, delimiters=("", "")))
    return(list_name_emoji)

In [29]:
tweets = replace_name_emoji(tweets)

tweets  # printing the tweets

['por la empresa e inversión es que ud y su flia pueden comer y disfrutar de medios para que uds parasitos vivan porque trabaja mas facil una pala empeñada que un solo mamertos de uds se mueva y algo por este país viven de gorra del secuestro la extorsión y se camuflan de lideres   ',
 'este señor parece que solo lee el título de noticia tanto uruguay méxico colombia y chile viene aplicando educación a distancia muchos años atrás aquí un genocida apoyó prensa chicha y mató estudiantes universitarios un suicida secuestró docentes y así hipócrita   ',
 'el cómplice de santrich en colombia goza de total apoyo de los magistrados jurídicamente podría decirse qué hay complicidad con el terrorismo   ',
 'pues colombia apoyó en la guerra de corea tal vez se acuerdan y nos mandan un bombazo   ',
 'honorable dr abelardodelaesprilla muchas gracias por ayudar a salvar a colombia permítame felicitarlo usted es una persona frentera sin miedo para cantarle la tabla de frente al terrorismo mi aprecio 

## Translation

In [35]:
# provides authentication credentials to application code:

os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="C:/Users/Alejandra/Desktop/Proyecto/My First Project-689f409050bd.json"

In [38]:
from google.cloud import translate_v2 as translate

def translate_text(text,target='en'):
    translate_client = translate.Client()
    result = translate_client.translate(text, target_language=target, format_= "text")
    
    return(result['translatedText'])

In [50]:
test_tweets = []

character_count = 0
translated_tweets = 0

for f in tweets:
    tweets_translated = translate_text(f, target = "en")
    test_tweets.append(tweets_translated)
    
    translated_tweets += 1
    character_count += len(f)

In [41]:
# test_tweets

In [53]:
print("Total characters translated: {0}".format(character_count))
print("Translated Tweets: {0}".format(translated_tweets))

Total characters translated: 10295
Translated Tweets: 44


In [42]:
test_tweets[0] # first translated tweet

'for the company and investment is that you and your family can eat and enjoy the means for you parasites to live because a pawned shovel works easier than a single mamertos of you to move and something for this country lives from kidnapping cap extortion and they camouflage themselves as leaders'

In [None]:
# exporting translated tweets

In [43]:
base = pd.DataFrame(test_tweets, columns=["tweets"])

In [44]:
base.to_csv('test.csv', sep = ',')

### Grading

In [46]:
from grader import Grader

In [47]:
grader = Grader()

## Word Embedding

In [48]:
import gensim

#wv_embedding is the embedding loaded
wv_embeddings = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, limit=500000) ######### YOUR CODE HERE #############
type(wv_embeddings )


gensim.models.keyedvectors.Word2VecKeyedVectors

## From word to text embedding

In [55]:
#This function converts a question in a vector
import numpy as np
def tweets_to_vec(tweet, embeddings, dim=300):

    result = np.zeros(dim) #300 dimensional vector for phrase vector
    cnt = 0
    words = tweet.split()
    for word in words: #All word vectors composing the phrase are summed
        if word in embeddings:
            result += np.array(embeddings[word])
            cnt += 1
    if cnt != 0: #This would happen if no word was found in the embedding
        result /= cnt
    return result

In [56]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alejandra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [57]:
import util

In [59]:
tweet2vec_result = []
for tweet in test_tweets:
    tweet = tweet.strip()
    answer = tweets_to_vec(tweet, wv_embeddings)
    tweet2vec_result = np.append(tweet2vec_result, answer)
    print("hola", tweet2vec_result)
    print(tweet2vec_result)

hola [ 0.04995858  0.02655646 -0.00574289  0.10593619 -0.08838039  0.00987504
  0.05028069 -0.06131391  0.03036946  0.03937326 -0.00199778 -0.09451145
 -0.03161174  0.04786496 -0.12766917  0.0726564   0.03869778  0.0789527
  0.02101135 -0.04751475 -0.02324904  0.01966877  0.02232026  0.00822151
  0.02338986 -0.0223825  -0.08459282  0.04632866  0.00975279 -0.01869834
  0.01437973  0.03821694 -0.01785995 -0.01550609  0.01437303 -0.02477599
  0.04552106 -0.04417606  0.03621841  0.05945792  0.07382872 -0.00430456
  0.09858406 -0.00907489 -0.00799895 -0.03894638 -0.02151973  0.046974
  0.00837875 -0.01283339  0.01642413  0.03914735  0.04081466 -0.01486578
 -0.02427264 -0.01215977 -0.06052566 -0.02091645  0.04836552 -0.01816168
 -0.01547539  0.06411148 -0.08274153 -0.03542123 -0.00902874 -0.02392355
 -0.05957552  0.06405157 -0.02107239  0.06262505  0.08007068  0.02645911
  0.06804229 -0.0652436  -0.10048769 -0.103171    0.03488103  0.07500253
  0.0181548   0.04441499 -0.02630764 -0.06802201 

## Evaluation of text similarity

### HitsCount and DCGScore

In [60]:
'''
Metric 1 to validate the precision of a model
'''
def hits_count(dup_ranks, k):
    
    return np.average(np.array(dup_ranks) <= np.array([k]))


In [61]:
'''
Metric 2 to validate the precision of a model
'''

def dcg_score(dup_ranks, k):

    return np.average((np.array(dup_ranks) <= np.array([k]))*1./(np.log2(1. + np.array(dup_ranks))))


## Cosine similarity 

In [62]:
from sklearn.metrics.pairwise import cosine_similarity


In [63]:
tweet_vec= []
for tweet in test_tweets:
    # Strip removes the first space in the string
    tweet = tweet.strip()
    
    answer = tweets_to_vec(tweet, wv_embeddings)
    # Print each question with its vectorization
    tweet_vec.append(answer)

In [64]:
len(tweet_vec)

44

In [66]:
a=cosine_similarity([tweet_vec[0]],[tweet_vec[1]])

In [67]:
def cosine_measure(list_text):
    list_total =[]
    for i in list_text:
        list_cosine =[]
        for j in list_text:
            if (i-j).all():
                a=cosine_similarity([i],[j])
                list_cosine.append(a)
        list_total.append(list_cosine)
    return(list_total)

In [68]:
cosines = cosine_measure(tweet_vec)

In [69]:
cosines

[[array([[0.70207276]]),
  array([[0.72803324]]),
  array([[0.69135575]]),
  array([[0.76820153]]),
  array([[0.75138278]]),
  array([[0.82597124]]),
  array([[0.75621477]]),
  array([[0.75988078]]),
  array([[0.81055416]]),
  array([[0.76671452]]),
  array([[0.72988869]]),
  array([[0.74240722]]),
  array([[0.8504179]]),
  array([[0.87075671]]),
  array([[0.73541807]]),
  array([[0.71118979]]),
  array([[0.81135899]]),
  array([[0.64664508]]),
  array([[0.70092177]]),
  array([[0.84696347]]),
  array([[0.65197291]]),
  array([[0.81843312]]),
  array([[0.7779822]]),
  array([[0.73049599]]),
  array([[0.78577445]]),
  array([[0.74854324]]),
  array([[0.81482561]]),
  array([[0.60668862]]),
  array([[0.7508488]]),
  array([[0.84334356]]),
  array([[0.60442462]]),
  array([[0.70436354]]),
  array([[0.73789919]]),
  array([[0.84620825]]),
  array([[0.74976576]]),
  array([[0.64572982]]),
  array([[0.78208251]]),
  array([[0.83940032]]),
  array([[0.80672969]]),
  array([[0.68024306]]),
  a

In [70]:
len(cosines[0])

43

In [72]:
np.argmin(cosines[0])
np.argmax(cosines[0])
#cosines[0].index(np.array(0.05218427))
cosines[0][1][0][0]

0.7280332383269781

In [73]:
def column_organization_to_validation(list_text):
    final_list=[]
    for i in range(0,len(list_text)):
        temp=[]
        for j in range(0,len(list_text[i])):
            temp.append(list_text[i][j][0][0])
        final_list.append(temp)
    return(final_list)

In [74]:
final_cosines = column_organization_to_validation(cosines)

In [75]:
sorted(final_cosines[2])

[0.45227208297405935,
 0.5802538595630533,
 0.6371996145087332,
 0.6624777101520625,
 0.6757690400581784,
 0.6805019423211722,
 0.6935808088344291,
 0.6939958366209682,
 0.7000554548940222,
 0.7064742095230332,
 0.7081598591284082,
 0.7090969039686411,
 0.7105360741108706,
 0.7108402028441352,
 0.712414664568505,
 0.7142154791525823,
 0.7190918672357104,
 0.726800696675618,
 0.7280332383269781,
 0.7356232499906519,
 0.7379375496736026,
 0.7379375496736026,
 0.7397566681701415,
 0.7464816678123041,
 0.7548297216315121,
 0.7563010059740883,
 0.7564885947623876,
 0.7611432382481054,
 0.7626813722115378,
 0.7632299861567455,
 0.775072024836407,
 0.775667509108605,
 0.7783548213680684,
 0.7868694386770054,
 0.7878231321844543,
 0.7883806378205411,
 0.789558125692486,
 0.7976960506415952,
 0.8034483994189376,
 0.807648151331054,
 0.8199072657534597,
 0.8322362022293913]

In [76]:
a=sorted(final_cosines[0])
a[0]
a.index(a[0])

0

In [79]:
def data_validation_classification(cosines,tweets):
    total=[]
    for i in range(0,len(cosines)):
        #print(i)
        temp=[]
        organized=sorted(cosines[i])
        min1=organized[0]
        min2=organized[1]
        max1=organized[len(organized)-1]
        pos_min_1=cosines[i].index(min1)
        pos_min_2=cosines[i].index(min2)
        pos_max_1=cosines[i].index(max1)
        temp.append(tweets[i])
        if pos_max_1>=i:
            temp.append(tweets[(pos_max_1+1)])
        else:
            temp.append(tweets[pos_max_1])
        if pos_min_1>=i:
            temp.append(tweets[(pos_min_1+1)])
        else:
            temp.append(tweets[pos_min_1])
        if pos_min_2>=i:
            temp.append(tweets[(pos_min_2+1)])
        else:
            temp.append(tweets[pos_min_2])
        total.append(temp)
    return(total)

In [80]:
u=data_validation_classification(final_cosines, test_tweets)

In [81]:
import xlsxwriter as ws

In [125]:
workbook = ws.Workbook('validation.csv') 
worksheet = workbook.add_worksheet() 

In [126]:
row=0
for i in range(0,len(u)):
    worksheet.write(row, 0, u[i][0])
    worksheet.write(row, 1, u[i][1]) 
    worksheet.write(row, 2, u[i][2]) 
    worksheet.write(row, 3, u[i][3])
    row+=1
workbook.close()  

### First solution: pre-trained embeddings
We will work with predefined train and validation:

- train corpus contains similar sentences at the same row.
- validation corpus contains the following columns: tweet, similar tweet, negative example 1, negative example 2.


In [127]:
def read_corpus(filename):
    data = []
    for line in open(filename, encoding='utf-8'):
        data.append(line.strip().split(','))
    return data

In [128]:
validation = read_corpus('validation.csv')

In [140]:
print(validation)

[['\ufefffor the company and investment is that you and your family can eat and enjoy the means for you parasites to live because a pawned shovel works easier than a single mamertos of you to move and something for this country lives from kidnapping cap extortion and they camouflage themselves as leaders;Well', ' to live in Colombia and have a life you have already seen very little in what your country refers to', ' it is easy to talk shit when you do not touch or do not suffer the consequences of corruption and war. nothing;claaaaro are you happy with the invasion of cuba russia china iran eln farc terrorism shit like you etc etc;what was missing from that putrefaction of a terrorism support agency'], ['this man seems to only read the news headline both uruguay mexico colombia and chile has been applying distance education many years ago here a genocide supported the chicha press and killed university students a suicide kidnapped teachers and thus hypocritical;the farc are still alive

In [141]:
def rank_candidates(tweet, candidates, embeddings, dim=300):


    t_vecs = np.array([tweets_to_vec(tweet, embeddings, dim) for i in range(len(candidates))])
    cand_vecs = np.array([tweets_to_vec(candidate, embeddings, dim) for candidate in candidates])
    cosines = np.array(cosine_similarity(t_vecs, cand_vecs)[0])
    merged_list = list(zip(cosines, range(len(candidates)), candidates))
    #print(merged_list)
    sorted_list  = sorted(merged_list, key=lambda x: x[0], reverse=True)
    result = [(b,c) for a,b,c in sorted_list]
    
    return result

In [142]:
wv_ranking = []
for line in validation:
    for l in line:
        t, *ex = l
        ranks = rank_candidates(t, ex, wv_embeddings)
        wv_ranking.append([r[0] for r in ranks].index(0) + 1)

In [143]:
for k in [1, 5, 10, 100, 500, 1000]:
    print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(wv_ranking, k), k, hits_count(wv_ranking, k)))

DCG@   1: 0.764 | Hits@   1: 0.764
DCG@   5: 0.780 | Hits@   5: 0.798
DCG@  10: 0.782 | Hits@  10: 0.803
DCG@ 100: 0.806 | Hits@ 100: 0.921
DCG@ 500: 0.814 | Hits@ 500: 0.983
DCG@1000: 0.816 | Hits@1000: 1.000


In [144]:
from util import text_prepare

In [147]:
prepared_validation = []
for line in validation:
    for l in line:
        prepared_validation.append([text_prepare(sentence) for sentence in l])

In [148]:
wv_prepared_ranking = []
for line in prepared_validation:
    q, *ex = line
    ranks = rank_candidates(q, ex, wv_embeddings)
    wv_prepared_ranking.append([r[0] for r in ranks].index(0) + 1)

In [149]:
for k in [1, 5, 10, 100, 500, 1000]:
    print("DCG@%4d: %.3f | Hits@%4d: %.3f" % (k, dcg_score(wv_prepared_ranking, k), 
                                              k, hits_count(wv_prepared_ranking, k)))

DCG@   1: 0.865 | Hits@   1: 0.865
DCG@   5: 0.879 | Hits@   5: 0.893
DCG@  10: 0.879 | Hits@  10: 0.893
DCG@ 100: 0.891 | Hits@ 100: 0.955
DCG@ 500: 0.897 | Hits@ 500: 1.000
DCG@1000: 0.897 | Hits@1000: 1.000
