# 🚀 Import relevant packages and dataset

In [38]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [39]:
#import nlkt libraries
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /home/andre/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/andre/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/andre/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/andre/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [40]:
import pandas as pd
import numpy as np
import string
from num2words import num2words

In [41]:
df = pd.read_csv('Text_Similarity_Dataset.csv')

# 🔎First dataset exploration 

In [42]:
df.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...


In [43]:
df.info

<bound method DataFrame.info of       Unique_ID                                              text1  \
0             0  savvy searchers fail to spot ads internet sear...   
1             1  millions to miss out on the net by 2025  40% o...   
2             2  young debut cut short by ginepri fifteen-year-...   
3             3  diageo to buy us wine firm diageo  the world s...   
4             4  be careful how you code a new european directi...   
...         ...                                                ...   
4018       4018  labour plans maternity pay rise maternity pay ...   
4019       4019  high fuel costs hit us airlines two of the lar...   
4020       4020  britons growing  digitally obese  gadget lover...   
4021       4021  holmes is hit by hamstring injury kelly holmes...   
4022       4022  nuclear dumpsite  plan attacked plans to allow...   

                                                  text2  
0     newcastle 2-1 bolton kieron dyer smashed home ...  
1     nasda

In [44]:
df.shape

(4023, 3)

In [45]:
df.isnull().sum().sum()

0

In [46]:
df.duplicated().sum()

0

# 🧹Cleaning text 

In [47]:
df.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...


In [48]:
# Check if the 'text1' contains digit numbers
contains_digits1 = df['text1'].apply(lambda x: any(char.isdigit() for char in x))
contains_digits1.sum()

3833

In [49]:
## Check if the 'text2' contains digit numbers
contains_digits2 = df['text2'].apply(lambda x: any(char.isdigit() for char in x))
contains_digits2.sum()

3824

In [50]:
#!pip install num2words

In [51]:
df.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...


In [52]:
def basic_cleaning(text):
    # Ensure that text is a string
    text = str(text)
    # 1. Removing whitespaces
    text = text.strip()
    # 2. Lowercasing
    text = text.lower()
    # 3. Changing digits to words
    cleaned_text = []
    temp_num = ""
    for char in text:
        if char.isdigit():
            temp_num += char
        else:
            if temp_num:
                cleaned_text.append(num2words(int(temp_num)))
                temp_num = ""
            if char not in string.punctuation:
                cleaned_text.append(char)
            else:
                cleaned_text.append(" ")
    if temp_num:
        cleaned_text.append(num2words(int(temp_num)))

    # Join the list into a string
    text = ''.join(cleaned_text)

    # 4. Removing extra spaces
    text = ' '.join(text.split())

    return text

# Apply basic_cleaning function to the whole dataset
df['text1'] = df['text1'].apply(basic_cleaning)
df['text2'] = df['text2'].apply(basic_cleaning)

df.head()


Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle two one bolton kieron dyer smashed h...
1,1,millions to miss out on the net by two thousan...,nasdaq planning one hundredm share sale the ow...
2,2,young debut cut short by ginepri fifteen year ...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s ...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket sized devices ...


In [53]:
# Check if the function remove numbers worked
contains_digits1 = df['text1'].apply(lambda x: any(char.isdigit() for char in x))
contains_digits1.sum()

0

In [54]:
# Check if the function remove numbers worked
contains_digits2 = df['text2'].apply(lambda x: any(char.isdigit() for char in x))
contains_digits2.sum()

0

In [55]:
import re

def regex(text):
    if isinstance(text, list):
        return [re.sub('<[^<]+?->', '', word) for word in text]
    else:
        return re.sub('<[^<]+?->', '', text)

# Tokenized  all rows in both columns
df['text1']= df['text1'].apply(regex)
df['text2']= df['text2'].apply(regex)

df.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle two one bolton kieron dyer smashed h...
1,1,millions to miss out on the net by two thousan...,nasdaq planning one hundredm share sale the ow...
2,2,young debut cut short by ginepri fifteen year ...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s ...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket sized devices ...


In [56]:
from nltk.tokenize import word_tokenize

# Function to tokenized
def tokenized(text):
    if isinstance(text, str):
        return word_tokenize(text)
    else:
        return []

# Tokenized all rows in both columns
df['text1'] = df['text1'].apply(tokenized)
df['text2'] = df['text2'].apply(tokenized)

df.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,"[savvy, searchers, fail, to, spot, ads, intern...","[newcastle, two, one, bolton, kieron, dyer, sm..."
1,1,"[millions, to, miss, out, on, the, net, by, tw...","[nasdaq, planning, one, hundredm, share, sale,..."
2,2,"[young, debut, cut, short, by, ginepri, fiftee...","[ruddock, backs, yapp, s, credentials, wales, ..."
3,3,"[diageo, to, buy, us, wine, firm, diageo, the,...","[mci, shares, climb, on, takeover, bid, shares..."
4,4,"[be, careful, how, you, code, a, new, european...","[media, gadgets, get, moving, pocket, sized, d..."


In [57]:
from nltk.corpus import stopwords
from nltk import word_tokenize

stop_words = set(stopwords.words('english'))

# Function to remove StopWords
def remove_stopwords(text):
    if isinstance(text, list):
        return [word for word in text if word.lower() not in stop_words]
    else:
        tokenized = word_tokenize(text)
        without_stopwords = [word for word in tokenized if word.lower() not in stop_words]
        return without_stopwords

# Remove stopwords from all rows in both columns
df['text1'] = df['text1'].apply(remove_stopwords)
df['text2'] = df['text2'].apply(remove_stopwords)

df.head()


Unnamed: 0,Unique_ID,text1,text2
0,0,"[savvy, searchers, fail, spot, ads, internet, ...","[newcastle, two, one, bolton, kieron, dyer, sm..."
1,1,"[millions, miss, net, two, thousand, twenty-fi...","[nasdaq, planning, one, hundredm, share, sale,..."
2,2,"[young, debut, cut, short, ginepri, fifteen, y...","[ruddock, backs, yapp, credentials, wales, coa..."
3,3,"[diageo, buy, us, wine, firm, diageo, world, b...","[mci, shares, climb, takeover, bid, shares, us..."
4,4,"[careful, code, new, european, directive, coul...","[media, gadgets, get, moving, pocket, sized, d..."


In [58]:
from nltk.stem import WordNetLemmatizer

#Create a function to Lemmatize
def lemma(text):
    lemmatizer = WordNetLemmatizer() # Instantiate lemmatizer
    lemmatized = [lemmatizer.lemmatize(word) for word in text] # Lemmatize
    lemmatized_string = " ".join(lemmatized)
    return lemmatized_string

df['text1'] = df['text1'].apply(lemma)
df['text2'] = df['text2'].apply(lemma)

df.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searcher fail spot ad internet search en...,newcastle two one bolton kieron dyer smashed h...
1,1,million miss net two thousand twenty-five fort...,nasdaq planning one hundredm share sale owner ...
2,2,young debut cut short ginepri fifteen year old...,ruddock back yapp credential wale coach mike r...
3,3,diageo buy u wine firm diageo world biggest sp...,mci share climb takeover bid share u phone com...
4,4,careful code new european directive could put ...,medium gadget get moving pocket sized device l...


In [59]:
#checking all functions have worked
df.iloc[1,1]

'million miss net two thousand twenty-five forty uk population still without internet access home say study around twenty-three million briton miss wide range essential service education medical information predicts report telecom giant bt compare twenty-seven million fifty uk currently online idea digital divide evaporate time wishful thinking report concludes study call government telecom industry come new way lure bypassed digital revolution although percentage briton without home access fallen slightly remain digital refuseniks miss report suggests everyday task move online offline service become le comprehensive divide become obvious burdensome got net access predicts gap net nots much talked prediction divide affect future generation le discussed bt set predict future pattern based current information taking account way technology changing optimist predict convergence emergence user friendly technology bridge digital divide could way mark report suggests internet access device te

# 🏗️Preprocessing