In [1]:
import pandas as pd

# Read just the `clean_review` column; nothing else from the CSV is used below.
df = pd.read_csv(
    "../data/clean_reviews.csv",
    usecols=["clean_review"],
)
df.head(5)


Unnamed: 0,clean_review
0,issues
1,purchased device worked advertised never much ...
2,works expected sprung higher capacity think ma...
3,think worked greathad diff bran gb card went s...
4,bought retail packaging arrived legit orange e...


In [2]:
# Draw a fixed-seed sample of 200 rows and renumber the index from 0 so the
# sampled rows can be addressed as 0..199 in later cells.
df_sample = (
    df
    .sample(n=200, random_state=42)
    .reset_index(drop=True)
)
df_sample.head(5)


Unnamed: 0,clean_review
0,superspeed card works great phone havent probl...
1,purchased product knowing might issue galaxy c...
2,bought use samsung hmx w works well good trans...
3,inexpensive way get galaxy note ii gb gb memor...
4,purchased two gbcards two new samsung ss famil...


In [3]:
import spacy

# Shared spaCy pipeline used by every entity-extraction cell below.
nlp = spacy.load(name="en_core_web_sm")


In [4]:
# Sanity check: run the pipeline on the first sampled review only.
text = df_sample["clean_review"].loc[0]
doc = nlp(text)

# (entity text, entity label) for each span the pipeline recognised
[(span.text, span.label_) for span in doc.ents]


[]

In [5]:
def extract_entities(text):
    """Return the named entities the spaCy pipeline ``nlp`` finds in ``text``.

    Parameters
    ----------
    text : str
        Review text to analyse. Non-string values (for example the ``NaN``
        pandas produces for an empty CSV cell) and blank strings are treated
        as containing no entities instead of being passed to the pipeline.

    Returns
    -------
    list of tuple
        ``(entity_text, entity_label)`` pairs, in the order ``doc.ents``
        yields them. Empty when nothing is recognised or the input is
        missing/blank.
    """
    # Guard first: calling nlp() on a float NaN / None raises, which would
    # abort a whole ``Series.apply`` over a column containing missing reviews.
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Run entity extraction on every sampled review and store the result per row.
df_sample["entities"] = df_sample["clean_review"].map(extract_entities)
df_sample.head(5)


Unnamed: 0,clean_review,entities
0,superspeed card works great phone havent probl...,[]
1,purchased product knowing might issue galaxy c...,"[(week two, DATE)]"
2,bought use samsung hmx w works well good trans...,"[(samsung, ORG), (hmx w works, ORG)]"
3,inexpensive way get galaxy note ii gb gb memor...,"[(gb gb, PERSON)]"
4,purchased two gbcards two new samsung ss famil...,"[(two, CARDINAL), (two, CARDINAL), (samsung, O..."


In [6]:
from collections import Counter

# Flatten the per-review entity lists into a single list of lower-cased
# entity texts (labels are dropped), then show the 20 most frequent.
all_entities = [
    entity_text.lower()
    for row_entities in df_sample["entities"]
    for entity_text, _label in row_entities
]

Counter(all_entities).most_common(n=20)


[('one', 29),
 ('first', 15),
 ('gb', 14),
 ('two', 11),
 ('samsung', 9),
 ('sony', 6),
 ('months', 3),
 ('years', 3),
 ('today', 2),
 ('second', 2),
 ('kingston gb', 2),
 ('kb mbsrandom', 2),
 ('week two', 1),
 ('hmx w works', 1),
 ('gb gb', 1),
 ('several weeks', 1),
 ('dozen minute', 1),
 ('sandisk gb', 1),
 ('april', 1),
 ('gb fast microsd', 1)]

In [7]:
def extract_relevant_entities(text, labels=("ORG", "PRODUCT")):
    """Return lower-cased entities from ``text`` whose label is in ``labels``.

    Parameters
    ----------
    text : str
        Review text to analyse. Non-string values (for example the ``NaN``
        pandas produces for an empty CSV cell) and blank strings are treated
        as containing no entities instead of being passed to the pipeline.
    labels : str or iterable of str, optional
        Entity labels to keep. Defaults to ``("ORG", "PRODUCT")``, which is
        the filter this function originally hard-coded. A single label may
        be given as a plain string.

    Returns
    -------
    list of tuple
        ``(entity_text_lowercased, entity_label)`` pairs, in the order
        ``doc.ents`` yields them, restricted to the requested labels.
    """
    # Guard first: calling nlp() on a float NaN / None raises, which would
    # abort a whole ``Series.apply`` over a column containing missing reviews.
    if not isinstance(text, str) or not text.strip():
        return []

    # A bare string would otherwise be split into single characters below.
    if isinstance(labels, str):
        labels = (labels,)
    wanted_labels = frozenset(labels)

    doc = nlp(text)
    return [
        (ent.text.lower(), ent.label_)
        for ent in doc.ents
        if ent.label_ in wanted_labels
    ]

# Keep only the label-filtered entities for each sampled review.
df_sample["filtered_entities"] = df_sample["clean_review"].map(extract_relevant_entities)
df_sample.head(5)


Unnamed: 0,clean_review,entities,filtered_entities
0,superspeed card works great phone havent probl...,[],[]
1,purchased product knowing might issue galaxy c...,"[(week two, DATE)]",[]
2,bought use samsung hmx w works well good trans...,"[(samsung, ORG), (hmx w works, ORG)]","[(samsung, ORG), (hmx w works, ORG)]"
3,inexpensive way get galaxy note ii gb gb memor...,"[(gb gb, PERSON)]",[]
4,purchased two gbcards two new samsung ss famil...,"[(two, CARDINAL), (two, CARDINAL), (samsung, O...","[(samsung, ORG)]"


In [8]:
from collections import Counter

# Flatten the filtered entity lists into one list of entity texts (already
# lower-cased by extract_relevant_entities), then show the 15 most frequent.
filtered_entities = [
    entity_text
    for row_entities in df_sample["filtered_entities"]
    for entity_text, _label in row_entities
]

Counter(filtered_entities).most_common(n=15)


[('samsung', 9),
 ('sony', 6),
 ('hmx w works', 1),
 ('samsung smart', 1),
 ('raspberry pi', 1),
 ('gb micro card', 1),
 ('charmi definitely recommend', 1),
 ('tablet kinda', 1),
 ('pluggesd microsd csrd', 1),
 ('tablet bought', 1),
 ('kingstoncrystaldiskmark c', 1),
 ('sandisk ultra micro hc blazing', 1),
 ('samsung galley tablet job', 1),
 ('sony xperia tablet z', 1),
 ('microsoft', 1)]