In [1]:
import spacy

nlp = spacy.load('en_core_web_sm')  # Load a pre-trained English pipeline

In [2]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1edc56366f0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1edc56375f0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1edc7d8e6c0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1edc7ffc3d0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1edc7ff5750>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1edc7d8e810>)]

In [3]:
# Check if 'ner' is already in the pipeline
if 'ner' not in nlp.pipe_names:
    # Add the 'ner' component using its factory name
    nlp.add_pipe('ner', last=True)
nlp.remove_pipe('ner')

('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1edc7d8e810>)

In [4]:
# Create a simpler pipeline that doesn't use all components
simple_pipe = spacy.load(
 "en_core_web_sm", 
 disable=["parser", "ner"])
simple_pipe.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1edc5668a70>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1edc7d332f0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1edc9cd0190>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1edc9cd8b90>)]

In [5]:
import spacy

nlp = spacy.load('en_core_web_sm')

text = "The cats are running around the house."
doc = nlp(text)

# Tokenization
for token in doc:
    print(token.text)

# Lemmatization
for token in doc:
    print(token.lemma_)

The
cats
are
running
around
the
house
.
the
cat
be
run
around
the
house
.


In [6]:
for token in doc:
    print(token.text, token.pos_)

The DET
cats NOUN
are AUX
running VERB
around ADP
the DET
house NOUN
. PUNCT


In [7]:
import spacy

nlp = spacy.load('en_core_web_sm')

text = "Although last month's measures were mainly targeted at making it easier for middle-class Chinese families to buy homes, analysts still believe the country's 5% growth target is too optimistic. Beijing's central bank has cut interest rates on mortgages and lowered the amount of money lenders must keep on hand. However, experts have warned more substantial reforms are needed if China wants to boost its property sector following the downfall of development firms like Evergrande."
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

last month's DATE
Chinese NORP
5% PERCENT
Beijing GPE
China GPE


In [8]:
text = "The cats are running around the house."
doc = nlp(text)

for token in doc:
    print(token.text, token.dep_)

The det
cats nsubj
are aux
running ROOT
around prep
the det
house pobj
. punct


### Text Classification using spacy

In [9]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [10]:
#Loading Data

df_chatgpt_sent_analysis = pd.read_csv("amazon_cleaned_reviews.csv")

print(f'Shape of data: {df_chatgpt_sent_analysis.shape}')
# Show top 5 records
df_chatgpt_sent_analysis.head()

Shape of data: (17340, 4)


Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score
0,positive,i wish would have gotten one earlier love it a...,19,5
1,neutral,i ve learned this lesson again open the packag...,88,1
2,neutral,it is so slow and lags find better option,9,2
3,neutral,roller ball stopped working within months of m...,12,1
4,neutral,i like the color and size but it few days out ...,21,1


In [11]:
#Data Information

df_chatgpt_sent_analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17340 entries, 0 to 17339
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   sentiments             17340 non-null  object
 1   cleaned_review         17337 non-null  object
 2   cleaned_review_length  17340 non-null  int64 
 3   review_score           17340 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 542.0+ KB


In [12]:
#Tokenizing the Text

import string
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

# Create our list of punchuationmarks
punctuations = string.punctuation

# Create our list of stop words
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vector
parser = English()

# Creating our tokenzer function
def spacy_tokenizer(sentence):
    """This function will accepts a sentence as input and processes the sentence into tokens, performing lemmatization, 
    lowercasing, removing stop words and punctuations."""
    
    # Creating our token object which is used to create documents with linguistic annotations
    mytokens = parser(sentence)
    
    # lemmatizing each token and converting each token in lower case
    # Note that spaCy uses '-PRON-' as lemma for all personal pronouns lkike me, I etc
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    
    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations]
    
    # Return preprocessed list of tokens
    return mytokens    



In [13]:
#Data Cleaning


# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        """Override the transform method to clean text"""
        return [clean_text(text) for text in X]
    
    def fit(self, X, y= None, **fit_params):
        return self
    
    def get_params(self, deep= True):
        return {}

# Basic function to clean the text
def clean_text(text):
    """Removing spaces and converting the text into lowercase"""
    return text.strip().lower()  

In [14]:
#Feature engineering (Vectorization)

bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range = (1,1))


In [15]:
#TF-IDF

tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)


In [16]:
df_chatgpt_sent_analysis['review_score'] = df_chatgpt_sent_analysis['review_score'].astype(float)

In [17]:
#Create Train and Test Datasets

from sklearn.model_selection import train_test_split

X = df_chatgpt_sent_analysis['cleaned_review'] # The features we want to analyse
ylabels = df_chatgpt_sent_analysis['review_score'] # There are labels where we have good, bad and neutral

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size = 0.3, random_state = 1)
print(f'X_train dimension: {X_train.shape}')
print(f'y_train dimension: {y_train.shape}')
print(f'X_test dimension: {X_test.shape}')
print(f'y_train dimension: {y_test.shape}')

X_train dimension: (12138,)
y_train dimension: (12138,)
X_test dimension: (5202,)
y_train dimension: (5202,)


In [19]:
# Sentence Embeddings using Spacy
import spacy
nlp = spacy.load("en_core_web_sm")

# Get the sentence embedding for "This is a sample sentence."
doc = nlp("This is a sample sentence.")
sentence_vector = doc.vector

# Compute similarity between two sentences
sent1 = nlp("This is a big book.")
sent2 = nlp("Here is another book but of red cover unlike the previous blue covered book.")
similarity = sent1.vector.dot(sent2.vector)
print(f"Similarity score: {similarity:.2f}")

Similarity score: 4.31


In [20]:
text = "Apple is looking at buying U.K. startup for $1 billion"

In [21]:
# Create a Doc object
document = simple_pipe(text)

In [22]:
# Create a list of lemmatized strings
cleaned_lemmas = [token.lemma_ for token in document if not 
                  token.is_space | token.is_punct | token.is_stop]

In [23]:
cleaned_lemmas

['Apple', 'look', 'buy', 'U.K.', 'startup', '$', '1', 'billion']

In [24]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
import random

# Sample training data
TRAIN_DATA = [
    ("Who is iphone from Apple?", {"entities": [(19, 23, "ORG")]}),
    ("I like Delhi and Mumbai.", {"entities": [(7, 11, "LOC"), (17, 22, "LOC")]}),
]

# Load a blank model
nlp = spacy.blank("en")

# Create a new NER component and add it to the pipeline
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)

# Add labels to the NER component
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Disable other pipeline components during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for epoch in range(10):
        losses = {}
        # Shuffle the training data
        random.shuffle(TRAIN_DATA)
        # Create batches and iterate over them
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            examples = [Example.from_dict(nlp.make_doc(text), ann) for text, ann in zip(texts, annotations)]
            nlp.update(examples, drop=0.5, losses=losses)
        print(f"Losses at epoch {epoch}: {losses}")

# Save the trained model
nlp.to_disk("custom_ner_model")

# Test the trained model
test_text = "Apple, Delhi sell more iphones than Apple, Mumbai."
doc = nlp(test_text)
for ent in doc.ents:
    print(ent.text, ent.label_)




Losses at epoch 0: {'ner': 8.399999246001244}
Losses at epoch 1: {'ner': 8.124842956662178}
Losses at epoch 2: {'ner': 7.786208093166351}
Losses at epoch 3: {'ner': 7.544182971119881}
Losses at epoch 4: {'ner': 7.04532191157341}
Losses at epoch 5: {'ner': 6.413692831993103}
Losses at epoch 6: {'ner': 5.858181998133659}
Losses at epoch 7: {'ner': 5.725336492061615}
Losses at epoch 8: {'ner': 4.8050887286663055}
Losses at epoch 9: {'ner': 4.566746324300766}


In [25]:
import spacy

# Load a pre-existing model
nlp = spacy.load("en_core_web_sm")

# Check if 'ner' is already in the pipeline
if 'ner' not in nlp.pipe_names:
    # Add the 'ner' component using its factory name
    nlp.add_pipe('ner', last=True)

# Now you can use the nlp object as usual
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
    print(ent.text, ent.label_)


Apple ORG
U.K. GPE
$1 billion MONEY
