# Installation 

## For CPU-based spaCy:

!pip install -U pip setuptools wheel <br>

!pip install -U 'spacy' <br>

!python -m spacy download en_core_web_sm <br>

## For GPU-based spaCy:

!pip install -U pip setuptools wheel <br>


Specify your CUDA version in the extra below (for example, `cuda113` for CUDA 11.3):

!pip install -U 'spacy[cuda113]' <br>

!python -m spacy download en_core_web_sm <br>

# 1. Cleaning Numbers

In [8]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [9]:
#import the library
import spacy
from spacy.symbols import ORTH
from spacy.lang.en.stop_words import STOP_WORDS
import re
import pandas as pd

# Sample text containing ordinal numbers ("23rd", "30th") for the clean_numbers demo.
sentence = "Todays Date is 23rd we need to submit our assignment on 30th of this month."


def clean_numbers(text):
    """Remove every digit from ``text`` after normalising digit/letter boundaries.

    The substitutions are applied in this order:

    1. Insert a space between a digit run and a letter that directly follows
       it (``"5kg"`` -> ``"5 kg"``).
    2. Re-attach an ordinal suffix (``th``/``st``/``nd``/``rd``) that is
       followed by a space (``"23 rd "`` -> ``"23rd "``).
    3. Drop the comma between two digit runs (``"1,000"`` -> ``"1000"``).
    4. Split ``<digits>e<digits>`` into two digit runs separated by a space.
    5. Delete every remaining digit run.

    Because step 5 deletes only the digits, letters that were attached to a
    number are left behind on their own: ``"23rd"`` becomes ``"rd"``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all digits removed.
    """
    # The replacement templates are raw strings: ``\g`` is not a valid string
    # escape, so the non-raw form triggers an invalid-escape-sequence warning
    # on current Python versions. The resulting string value is unchanged.
    text = re.sub(r'(\d+)([a-zA-Z])', r'\g<1> \g<2>', text)
    text = re.sub(r'(\d+) (th|st|nd|rd) ', r'\g<1>\g<2> ', text)
    text = re.sub(r'(\d+),(\d+)', r'\g<1>\g<2>', text)
    text = re.sub(r'(\d+)(e)(\d+)', r'\g<1> \g<3>', text)
    text = re.sub(r'\d+', '', text)
    return text


# Show the sample sentence before and after clean_numbers.
print(f'Before - {sentence}')
print(f'After  - {clean_numbers(sentence)}')

Before - Todays Date is 23rd we need to submit our assignment on 30th of this month.
After  - Todays Date is rd we need to submit our assignment on th of this month.


In [10]:
# Read only the columns 'v1' and 'v2' from the CSV, decoding it as ISO-8859-1.
# NOTE(review): the relative path looks like a Kaggle input directory - confirm
# it exists in the environment where this notebook is run.
data = pd.read_csv(
    '../input/datasetspam/spam.csv',
    encoding='ISO-8859-1',
    usecols=['v1', 'v2'],
)
data.head()

FileNotFoundError: [Errno 2] No such file or directory: '../input/datasetspam/spam.csv'

In [3]:
# (rows, columns) of the loaded DataFrame.
data.shape

(5572, 2)

In [4]:
# Strip digits from every entry of column 'v2' (overwrites the column).
data['v2'] = data['v2'].apply(clean_numbers)

# 2. Removing Punctuation

In [5]:
# Load the trained English pipeline used by every function below.
# You can also try the other English pipelines:
#   en_core_web_md, en_core_web_lg, en_core_web_trf
nlp = spacy.load("en_core_web_sm") 


def remove_punc(sentence):
    """Return ``sentence`` without its punctuation tokens.

    The text is processed with the module-level ``nlp`` pipeline; tokens whose
    ``is_punct`` flag is set are dropped and the texts of the remaining tokens
    are joined with single spaces.
    """
    kept = [tok.text for tok in nlp(sentence) if not tok.is_punct]
    return " ".join(kept)


# Demo: the trailing "!!" is removed by remove_punc.
sentence = "What a great day it is!!"
print(f'Before - {sentence}')
print(f'After  - {remove_punc(sentence)}')

Before - What a great day it is!!
After  - What a great day it is


In [6]:
# Drop punctuation tokens from every entry of column 'v2' (overwrites the column).
data['v2'] = data['v2'].apply(remove_punc)

# 3. Tokenization

In [7]:
doc = nlp("LeonardMaltin gave this film a dreaded BOMB rating in his 1995 Movie and Video Guide. What film was he looking at?")

# Iterating over a Doc yields objects of type spacy.tokens.token.Token.
# token.text is the token as a plain str, on which normal string operations
# can be carried out; print(token) would also print the word.
for token in doc:
    print(token.text,end=', ')

LeonardMaltin, gave, this, film, a, dreaded, BOMB, rating, in, his, 1995, Movie, and, Video, Guide, ., What, film, was, he, looking, at, ?, 

# 3.1 Adding Special Rules in Tokenization

### Let's split `LeonardMaltin` into two separate tokens during tokenization

In [8]:
# Special-case rule: tokenize the exact string "LeonardMaltin" as the two
# tokens "Leonard" and "Maltin".
special_case = [{ORTH: "Leonard"}, {ORTH: "Maltin"}]
# The rule is registered on the shared `nlp` tokenizer, so it also applies to
# every later nlp(...) call in this notebook.
nlp.tokenizer.add_special_case("LeonardMaltin", special_case)
doc = nlp("LeonardMaltin gave this film a dreaded BOMB rating in his 1995 Movie and Video Guide. What film was he looking at?")

# Print the tokens again to show the effect of the new rule.
for token in doc:
    print(token.text,end=', ')

Leonard, Maltin, gave, this, film, a, dreaded, BOMB, rating, in, his, 1995, Movie, and, Video, Guide, ., What, film, was, he, looking, at, ?, 

# 4. Lemmatization

In [9]:
def lemmatization(sentence):
    """Return ``sentence`` with every token replaced by its lemma.

    The text is processed with the module-level ``nlp`` pipeline and the
    ``lemma_`` of each token is joined with single spaces.
    """
    lemmas = []
    for tok in nlp(sentence):
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)


# Demo: four inflected forms of the same verb.
sentence = "change changed changing changes"
print(f'Before - {sentence}')
print(f'After  - {lemmatization(sentence)}')

Before - change changed changing changes
After  - change change change change


In [10]:
# Lemmatize every entry of column 'v2' (overwrites the column).
data['v2'] = data['v2'].apply(lemmatization)

# 5. Stop Word Removal

In [11]:
def remove_stopwords(sentence):
    """Return ``sentence`` without its stop-word tokens.

    The text is processed with the module-level ``nlp`` pipeline; tokens
    flagged as stop words are dropped and the texts of the remaining tokens
    are joined with single spaces. Punctuation tokens are kept.
    """
    doc = nlp(sentence)
    # token.is_stop reads the stop-word flag of the token's own vocabulary
    # entry, so a second lookup through nlp.vocab[token.text] is unnecessary,
    # and `not ...` replaces the `== False` comparison.
    return " ".join(token.text for token in doc if not token.is_stop)

# Demo sentence. "LeonardMaltin" comes out as "Leonard Maltin" because the
# tokenizer special case registered earlier on `nlp` is still active.
sentence = "LeonardMaltin gave this film a dreaded BOMB rating in his 1995 Movie and Video Guide. What film was he looking at?"
print(f'Before - {sentence}')
print(f'After  - {remove_stopwords(sentence)}')

Before - LeonardMaltin gave this film a dreaded BOMB rating in his 1995 Movie and Video Guide. What film was he looking at?
After  - Leonard Maltin gave film dreaded BOMB rating 1995 Movie Video Guide . film looking ?


In [12]:
# Drop stop words from every entry of column 'v2' (overwrites the column).
data['v2'] = data['v2'].apply(remove_stopwords)

# 6. Removing URLs and Email IDs

In [13]:
def remove_urls(sentence):
    """Return ``sentence`` without tokens that look like URLs or e-mail addresses.

    The text is processed with the module-level ``nlp`` pipeline; tokens whose
    ``like_url`` or ``like_email`` flag is set are dropped and the texts of
    the remaining tokens are joined with single spaces.
    """
    kept = [
        tok.text
        for tok in nlp(sentence)
        if not (tok.like_url or tok.like_email)
    ]
    return " ".join(kept)


# Demo sentence containing one URL and one e-mail address.
sentence = "You can access the website here at http://urlremove.com and if any doubts please reach me out on gmail at removedmails@gmail.com "
print(f'Before - {sentence}')
print(f'After  - {remove_urls(sentence)}')

Before - You can access the website here at http://urlremove.com and if any doubts please reach me out on gmail at removedmails@gmail.com 
After  - You can access the website here at and if any doubts please reach me out on gmail at


In [14]:
# Drop URL-like and e-mail-like tokens from every entry of column 'v2' (overwrites the column).
data['v2'] = data['v2'].apply(remove_urls)