In [10]:
import pandas as pd
import spacy
from tqdm import tqdm

In [11]:
# Load spaCy's small English pipeline; the tagging cells below run text through it.
model = spacy.load("en_core_web_sm")

In [13]:
# Read the whole review file into one string. The context manager closes the
# handle even if the read raises, which the manual open()/close() pair did not.
with open("data/Samsung.txt", "r", encoding="utf-8") as file:
    data = file.read()

### 1. Number of reviews (one review per line)

In [16]:
# Treat each line of the file as one review. NOTE: splitting on "\n" keeps
# empty strings, so blank lines (or a trailing newline at the end of the file)
# are included in the count below.
reviews = data.split("\n")
len(reviews)

46355

In [18]:
# Show the first review as a sanity check on the split.
reviews[0]

"I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!"

### 2. Finding the most frequent noun in the first review

In [65]:
# Run the first review through the pipeline once, then pull out the three
# per-token attributes as parallel lists.
first_doc = model(reviews[0])
text = [token.text for token in first_doc]
lemma = [token.lemma_ for token in first_doc]
pos = [token.pos_ for token in first_doc]

# One row per token, columns in the order text / lemma / pos.
nlp_table = pd.DataFrame({"text": text, "lemma": lemma, "pos": pos})
nlp_table.head()

Unnamed: 0,text,lemma,pos
0,I,I,PRON
1,feel,feel,VERB
2,so,so,ADV
3,LUCKY,lucky,ADJ
4,to,to,PART


In [22]:
# How many tokens received each part-of-speech tag.
nlp_table["pos"].value_counts()

VERB     14
PRON     12
PUNCT    11
NOUN     10
ADV      10
ADP       6
CCONJ     6
DET       5
AUX       3
PART      3
ADJ       2
PROPN     2
NUM       2
Name: pos, dtype: int64

In [23]:
# Keep only the tokens tagged NOUN and count how often each lemma occurs.
is_noun = nlp_table["pos"] == "NOUN"
nlp_table.loc[is_noun, "lemma"].value_counts()

phone      3
one        2
honesty    1
year       1
upgrade    1
line       1
seller     1
Name: lemma, dtype: int64

### 3. Finding the most frequent noun in the first 1,000 reviews

In [31]:
# Tag the first 1,000 reviews. `model.pipe` feeds the texts through the
# pipeline in batches instead of making one pipeline call per review; `total`
# is passed explicitly because tqdm cannot take the length of the generator
# that `pipe` returns.
sample_reviews = reviews[:1000]
pos = []
lemma = []
text = []
for doc in tqdm(model.pipe(sample_reviews), total=len(sample_reviews)):
    for tok in doc:
        pos.append(tok.pos_)
        lemma.append(tok.lemma_)
        text.append(tok.text)

# Convert the data into a dataframe object (one row per token).
nlp_table = pd.DataFrame({'text': text, 'lemma': lemma, 'pos': pos})
nlp_table.head()

100%|██████████| 1000/1000 [00:06<00:00, 160.25it/s]


Unnamed: 0,text,lemma,pos
0,I,I,PRON
1,feel,feel,VERB
2,so,so,ADV
3,LUCKY,lucky,ADJ
4,to,to,PART


In [30]:
# Noun lemma frequencies for the current token table.
noun_rows = nlp_table["pos"] == "NOUN"
nlp_table.loc[noun_rows, "lemma"].value_counts()

phone             1208
battery             92
time                90
price               87
screen              86
                  ... 
storage              1
lite                 1
blemish              1
replacement.it       1
techno               1
Name: lemma, Length: 1322, dtype: int64

In [32]:
# Load a second, lighter pipeline with the dependency parser and NER disabled.
# The tagging loops in this notebook only read token text, lemma and POS.
nlp=spacy.load('en_core_web_sm',disable=['parser','ner'])

In [33]:
# Tag every review with `nlp`, the pipeline loaded with the parser and NER
# disabled. The loop previously called `model` (the full pipeline), so the
# lighter pipeline was never actually used. `nlp.pipe` processes the texts in
# batches; `total` is passed because tqdm cannot take the length of the
# generator that `pipe` returns.
pos = []
lemma = []
text = []
for doc in tqdm(nlp.pipe(reviews), total=len(reviews)):
    for tok in doc:
        pos.append(tok.pos_)
        lemma.append(tok.lemma_)
        text.append(tok.text)

# Convert the data into a dataframe object (one row per token).
nlp_table = pd.DataFrame({'text': text, 'lemma': lemma, 'pos': pos})
nlp_table.head()

100%|██████████| 46355/46355 [04:35<00:00, 167.98it/s]


Unnamed: 0,text,lemma,pos
0,I,I,PRON
1,feel,feel,VERB
2,so,so,ADV
3,LUCKY,lucky,ADJ
4,to,to,PART


In [34]:
# Noun lemma frequencies over the whole token table.
noun_mask = nlp_table["pos"] == "NOUN"
nlp_table.loc[noun_mask, "lemma"].value_counts()

phone          43237
battery         4350
product         3907
time            3825
screen          3746
               ...  
edge(+             1
courtesy           1
conbought          1
keyboardCon        1
pic--              1
Name: lemma, Length: 8999, dtype: int64

## Dataset 2: tagged words (tag definitions: https://universaldependencies.org/u/pos/)

In [35]:
# Load the pre-tagged corpus; the cells below use its `word` and `tag` columns.
df = pd.read_csv("data/tagged_words.csv")

In [63]:
# Preview the first ten rows.
df.head(10)

Unnamed: 0,word,tag
0,the,DET
1,fulton,NOUN
2,county,NOUN
3,grand,ADJ
4,jury,NOUN
5,said,VERB
6,friday,NOUN
7,an,DET
8,investigation,NOUN
9,of,ADP


In [37]:
# Total number of rows (one tagged word per row).
len(df)

1161192

In [45]:
# Tag distribution for the ambiguous word "saw".
df.loc[df["word"] == "saw", "tag"].value_counts()

VERB    347
NOUN      5
Name: tag, dtype: int64

In [47]:
# Reuse the tagged-word frame already loaded as `df` instead of parsing the
# same CSV a second time. NOTE: this rebinds `data`, which earlier in the
# notebook held the raw review text, so the review cells cannot be re-run
# after this point without reloading the text file.
data = df
sent = "I saw him running away"

def get_common_tag(data, word):
    """Return the most frequent tag recorded for ``word`` in ``data``.

    The lookup is case-insensitive on the input side: ``word`` is lower-cased
    before it is compared with the ``word`` column.

    Args:
        data: DataFrame with a ``word`` column and a ``tag`` column.
        word: The token to look up.

    Returns:
        ``(word, tag)`` with the original spelling of ``word`` and its most
        frequent tag, or the string ``"<word> not in data"`` when no row
        matches.
    """
    # A boolean mask replaces the f-string query: the query text broke on
    # words containing a quote (e.g. "don't"), and the mask also avoids
    # recomputing data['word'].unique() on every call.
    matches = data['word'] == word.lower()
    if not matches.any():
        return f"{word} not in data"
    # value_counts() sorts by descending frequency, so the first label is the
    # most common tag.
    return word, data.loc[matches, 'tag'].value_counts().index[0]

# Print the most frequent tag for every space-separated token of `sent`.
for token in sent.split(" "):
    print(get_common_tag(data, token))

# Full tag distribution for "saw", for comparison with the lookup above.
data.loc[data["word"] == "saw", "tag"].value_counts()

('I', 'PRON')
('saw', 'VERB')
('him', 'PRON')
('running', 'VERB')
('away', 'ADV')


VERB    347
NOUN      5
Name: tag, dtype: int64

In [48]:
# Same lookup on a second example sentence.
sent = "He wished he was rich"
for token in sent.split(" "):
    tagged = get_common_tag(data, token)
    print(tagged)

('He', 'PRON')
('wished', 'VERB')
('he', 'PRON')
('was', 'VERB')
('rich', 'ADJ')


In [60]:
# Rows where the word "his" carries the PRON tag; count() reports the
# non-null entries per column.
his_as_pron = (df["word"] == "his") & (df["tag"] == "PRON")
df.loc[his_as_pron].count()

word    37
tag     37
dtype: int64

In [61]:
# All rows tagged PRON; count() reports the non-null entries per column.
tagged_pron = df["tag"] == "PRON"
df.loc[tagged_pron].count()

word    49334
tag     49334
dtype: int64

In [62]:
# Share of PRON-tagged rows whose word is "his", i.e. an estimate of
# P(word = "his" | tag = PRON). Both counts are derived from `df` rather than
# typed in from earlier outputs, so the ratio stays correct if the data changes.
is_pron = df["tag"] == "PRON"
his_pron_count = int((is_pron & (df["word"] == "his")).sum())
pron_count = int(is_pron.sum())
round(his_pron_count / pron_count, 3)

0.001