# Experimenting with text pre-processing
## Combine review title and body

In [1]:
import pandas as pd

TINY_FILE = r'baby_train.csv'
tiny_df = pd.read_csv(TINY_FILE)
tiny_df['combined_text'] = tiny_df['review_title'] + '. ' + tiny_df['review_verbatim']
tiny_df.head()

Unnamed: 0,review_title,review_verbatim,role,status,location,date,rating,combined_text
0,"Good work life balance, but lower than average...",Work assignments are challenging and the organ...,Business Analysis Consultant Sr - IT,Current Employee,"Pittsburgh, PA","October 24, 2022",3.0,"Good work life balance, but lower than average..."
1,Not a very stable place at the moment,this bank overall was not that bad to work for...,Bank Teller,Former Employee,"Bristol, PA","October 23, 2022",4.0,Not a very stable place at the moment. this ba...
2,Good Place to Work,this bank was a good place to work at. They ha...,Mortgage Loan Processor III,Former Employee,Downers Grove Illinois,"October 22, 2022",4.0,Good Place to Work. this bank was a good place...
3,4 stars,What is the best part of working at the compan...,Lockbox Processor,Former Employee,"Mount Laurel, NJ","October 22, 2022",4.0,4 stars. What is the best part of working at t...


In [26]:
import spacy

nlp = spacy.load('en_core_web_sm')

text = 'this bank overall was not that bad to work for however, they are doing away with allot of positions and they are making allot of branches cashless. If you are a banker you will be fine but I would not waste my time being a teller. They have said within 5 years over 45% of tellers will be gone. They are doing away with positions and putting the extra work on others without any extra compensation. I woukd not go back'
doc = nlp(text)

print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])

for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['they', 'allot', 'positions', 'they', 'allot', 'branches cashless', 'you', 'a banker', 'you', 'I', 'my time', 'a teller', 'They', '5 years', '45%', 'tellers', 'They', 'positions', 'the extra work', 'others', 'any extra compensation', 'I']
5 years DATE
45% PERCENT


In [44]:
from fastcoref import spacy_component
import spacy

# review = ['this bank overall was not that bad to work for however, they are doing away with allot of positions and they are making allot of branches cashless. If you are a banker you will be fine but I would not waste my time being a teller. They have said within 5 years over 45% of tellers will be gone. They are doing away with positions and putting the extra work on others without any extra compensation. I woukd not go back']
review = tiny_df['combined_text']

nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "ner", "textcat"])
nlp.add_pipe("fastcoref")

docs = nlp.pipe(
   review, 
   component_cfg={"fastcoref": {'resolve_text': True}}
)

changed_list = []
for doc in docs:
   changed_list.append(doc._.resolved_text)

tiny_df['antecedents_replaced'] = changed_list
tiny_df.head()



10/25/2022 21:17:32 - INFO - 	 missing_keys: []
10/25/2022 21:17:32 - INFO - 	 unexpected_keys: []
10/25/2022 21:17:32 - INFO - 	 mismatched_keys: []
10/25/2022 21:17:32 - INFO - 	 error_msgs: []
10/25/2022 21:17:32 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
10/25/2022 21:17:33 - INFO - 	 Tokenize 4 texts...
  0%|          | 0/1 [00:00<?, ?ba/s]
10/25/2022 21:17:33 - INFO - 	 ***** Running Inference on 4 texts *****
Inference: 100%|██████████| 4/4 [00:01<00:00,  2.45it/s]


Unnamed: 0,review_title,review_verbatim,role,status,location,date,rating,combined_text,antecedents_replaced
0,"Good work life balance, but lower than average...",Work assignments are challenging and the organ...,Business Analysis Consultant Sr - IT,Current Employee,"Pittsburgh, PA","October 24, 2022",3.0,"Good work life balance, but lower than average...","Good work life balance, but lower than average..."
1,Not a very stable place at the moment,this bank overall was not that bad to work for...,Bank Teller,Former Employee,"Bristol, PA","October 23, 2022",4.0,Not a very stable place at the moment. this ba...,Not a very stable place at the moment. this ba...
2,Good Place to Work,this bank was a good place to work at. They ha...,Mortgage Loan Processor III,Former Employee,Downers Grove Illinois,"October 22, 2022",4.0,Good Place to Work. this bank was a good place...,Good Place to Work. this bank was a good place...
3,4 stars,What is the best part of working at the compan...,Lockbox Processor,Former Employee,"Mount Laurel, NJ","October 22, 2022",4.0,4 stars. What is the best part of working at t...,4 stars. What is the best part of working at t...


## Now to split each review into individual sentences
Sentencizer from spaCy does basic rule-based sentence boundary detection

In [68]:
review = 'this bank overall was not that bad to work for however, they are doing away with allot of positions and they are making allot of branches cashless. If you are a banker you will be fine but I would not waste my time being a teller. They have said within 5 years over 45% of tellers will be gone. They are doing away with positions and putting the extra work on others without any extra compensation. I woukd not go back'
sentence_doc = nlp(review)
sents_list = []
for sent in sentence_doc.sents:
    sents_list.append(sent.text)

print(sents_list)

TypeError: 'tuple' object is not callable

In [48]:
tiny_df['review_id'] = tiny_df.index
tiny_df.head()

Unnamed: 0,review_title,review_verbatim,role,status,location,date,rating,combined_text,antecedents_replaced,review_id
0,"Good work life balance, but lower than average...",Work assignments are challenging and the organ...,Business Analysis Consultant Sr - IT,Current Employee,"Pittsburgh, PA","October 24, 2022",3.0,"Good work life balance, but lower than average...","Good work life balance, but lower than average...",0
1,Not a very stable place at the moment,this bank overall was not that bad to work for...,Bank Teller,Former Employee,"Bristol, PA","October 23, 2022",4.0,Not a very stable place at the moment. this ba...,Not a very stable place at the moment. this ba...,1
2,Good Place to Work,this bank was a good place to work at. They ha...,Mortgage Loan Processor III,Former Employee,Downers Grove Illinois,"October 22, 2022",4.0,Good Place to Work. this bank was a good place...,Good Place to Work. this bank was a good place...,2
3,4 stars,What is the best part of working at the compan...,Lockbox Processor,Former Employee,"Mount Laurel, NJ","October 22, 2022",4.0,4 stars. What is the best part of working at t...,4 stars. What is the best part of working at t...,3


In [49]:
# Make a new dataframe of the separate sentences, but keep the review_id somehow
review = tiny_df['antecedents_replaced']

docs = list(nlp.pipe(review))
# sents_list = []
# for doc in docs:
#     for sent in doc.sents:
#         sents_list.append(sent)

# print(sents_list)

10/25/2022 21:55:17 - INFO - 	 Tokenize 4 texts...
  0%|          | 0/1 [00:00<?, ?ba/s]
10/25/2022 21:55:18 - INFO - 	 ***** Running Inference on 4 texts *****
Inference: 100%|██████████| 4/4 [00:01<00:00,  2.20it/s]

[Good work life balance, but lower than average compensation., Work assignments are challenging and the organization as a whole shifted focus to advancing technology., Employees who perform and earn high ratings do have opportunities to advance depending on the role., the organization as a whole is flexible with remote work and time off benefits are good., Negative would be that job titles and roles often can be misleading., In many instances business analysts work as certified scrum product owners but do not receive compensation of certified scrum product owners., Analysts are often expected to do the work of job titles and roles, acting as product owner and analyst on Agile development crew., the organization as a whole seems to favor compensating employees who leave and return vs. retaining / rewarding performers., Not a very stable place at the moment., this bank overall was not that bad to work for however, this bank are doing away with allot of positions and this bank are making 




In [62]:
simple_nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "ner", "textcat"])

tiny_df["sentences"] = tiny_df["antecedents_replaced"].apply(lambda x: [sent.text for sent in simple_nlp(x).sents])

tiny_df.to_csv('test1.csv')

## Sentiment from HuggingFace
Need to connect to HuggingFace models such as Bert-base-multilingual-uncased-sentiment

In [69]:
from transformers import pipeline

sentiment_model = pipeline("sentiment-analysis")

data = ['Good work life balance, but lower than average compensation','Work assignments are challenging and the organization as a whole shifted focus to advancing technology.']
sentiment_model(data)

[{'label': 'POSITIVE', 'score': 0.7007424831390381},
 {'label': 'POSITIVE', 'score': 0.9516314268112183}]

In [70]:
# Now can I get a corresponding sentiment for each sentence?
tiny_df["sentiment"] = tiny_df["sentences"].apply(lambda x: sentiment_model(x))

tiny_df.to_csv('test2_sentiment.csv')