## Import Libraries

In [1]:
import pandas as pd
import nltk,string,re,emoji
from pandas import DataFrame
from tqdm import tqdm
import itertools
from collections import Counter
from tqdm import tqdm
import time
from fuzzywuzzy import fuzz,process

## Read data

In [2]:
data=pd.read_excel("input_data.xlsx").dropna()
print(len(data))
data.head()

5105


Unnamed: 0,id,tweet_full_text
0,1,Done is better than perfect. â€” Sheryl Sandbe...
1,2,Shout out to the Great Fire Department and the...
2,3,There are some AMAZINGLY hilarious Nike Ad mem...
3,4,#kapernickeffect #swoosh #justdoit @ Lucas Bis...
4,5,"One Hand, One Dream: The Shaquem Griffin Story..."


## Cleanse data

In [3]:
import re
def url_remove(text):
    t=re.sub(r'https://\S+','',text)
    return t
def punc_remove(text):
    t=re.sub(r'[^\w\s]',' ',text)
    return t
def emoji_remove(text):
    return emoji.get_emoji_regexp().sub("",text).strip()
def hashtag_remove(text):
    reg="#(\w+)"
    return re.sub(reg," ",text)
def mentions_remove(text):
    reg="@(\w+)"
    return re.sub(reg," ",text)

def clean_text(text):
    t=(emoji_remove(punc_remove(mentions_remove(hashtag_remove(url_remove(re.sub("[\n\r\t\xa0]"," ",text).strip()))))))
    return " ".join(t.split())

In [4]:
data['cleaned_tweet']=data['tweet_full_text'].apply(lambda x:clean_text(x))
data.head()

Unnamed: 0,id,tweet_full_text,cleaned_tweet
0,1,Done is better than perfect. â€” Sheryl Sandbe...,Done is better than perfect â Sheryl Sandberg
1,2,Shout out to the Great Fire Department and the...,Shout out to the Great Fire Department and the...
2,3,There are some AMAZINGLY hilarious Nike Ad mem...,There are some AMAZINGLY hilarious Nike Ad mem...
3,4,#kapernickeffect #swoosh #justdoit @ Lucas Bis...,Lucas Bishop s Cigar Lounge
4,5,"One Hand, One Dream: The Shaquem Griffin Story...",One Hand One Dream The Shaquem Griffin Story


## Remove duplicates at the first level using the predefined function

In [5]:
data.drop_duplicates(subset='cleaned_tweet',inplace=True)
print(len(data))
data.dropna()
data.head()

4844


Unnamed: 0,id,tweet_full_text,cleaned_tweet
0,1,Done is better than perfect. â€” Sheryl Sandbe...,Done is better than perfect â Sheryl Sandberg
1,2,Shout out to the Great Fire Department and the...,Shout out to the Great Fire Department and the...
2,3,There are some AMAZINGLY hilarious Nike Ad mem...,There are some AMAZINGLY hilarious Nike Ad mem...
3,4,#kapernickeffect #swoosh #justdoit @ Lucas Bis...,Lucas Bishop s Cigar Lounge
4,5,"One Hand, One Dream: The Shaquem Griffin Story...",One Hand One Dream The Shaquem Griffin Story


## Applying Jaccard similarity

In [6]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

## Applying string similarity on the first 100 records`

In [7]:
cleaned_100=list(data['cleaned_tweet'][:100])
res=dict(zip(data['id'],cleaned_100))

In [9]:

dup=[]
dis=[]

start=time.time()
for i in tqdm(range(len(cleaned_100))):
    
    for j in (range(i+1,(len(cleaned_100)))):
        if(jaccard_similarity(cleaned_100[i],cleaned_100[j])>=0.90):#set threshold of 0.90 for similarity
            dup.append((cleaned_100[i],cleaned_100[j]))#append all duplicates which cross the threshold
        else:
            dis.append((cleaned_100[i],cleaned_100[j]))#append all distinct records which are below the threshold
end=time.time()
print("Time taken to process similarity is:{}".format(end-start))

t=list(itertools.chain.from_iterable(dis))
u=list(itertools.chain.from_iterable(dup))

print(" ")
print("Duplicate pairs which cross the given threshold are {}".format(len(set(u))))



100%|███████████████████████████████████████| 100/100 [00:00<00:00, 555.52it/s]

Time taken to process similarity is:0.19301080703735352
 
Duplicate pairs which cross the given threshold are 29





## Fuzzy wuzzy Implementation-A Comparision of the efficiency of time

In [8]:
#FUZZYWUZZY

dup=[]
dis=[]
import time
start=time.time()
for i in tqdm(range(len(cleaned_100))):
    
    for j in (range(i+1,(len(cleaned_100)))):
        if(fuzz.ratio(cleaned_100[i],cleaned_100[j])>=0.90):
            dup.append((cleaned_100[i],cleaned_100[j]))
        else:
            dis.append((cleaned_100[i],cleaned_100[j]))
end=time.time()
print("Time taken to process similarity is:{}".format(end-start))

t=list(itertools.chain.from_iterable(dis))
u=list(itertools.chain.from_iterable(dup))




100%|███████████████████████████████████████| 100/100 [00:00<00:00, 370.35it/s]

Time taken to process similarity is:0.28601646423339844





In [10]:
#Filter the duplicate records based on their length so as to retain the longer records
long=[]
short=[]
for i in list(set(dup)): 
    if(len(i[0])>len(i[1])):
        long.append(i[0])
        short.append(i[1])
    elif(len(i[1])>len(i[0])):
        long.append(i[1])
        short.append(i[0])
    else:
        continue

In [11]:
#Remove the shorter length records from the original dataset,retaining the records of bigger length to prevent data loss

d2 = dict(res)
for k,v in tqdm(list(res.items())):
    if(v in list(short)):
        del d2[k]

        
print("Records present after removing duplicates using string similarity are {}".format(len(d2)))


100%|█████████████████████████████████████| 100/100 [00:00<00:00, 49997.66it/s]

Records present after removing duplicates using string similarity are 84





## Output data to excel

In [11]:
df=pd.DataFrame(data=d2,index=['Text_with_ID'])
df=(df.T)

df.to_excel('output_id.xlsx')