# Experimenting with text pre-processing
## Combine review title and body

In [1]:
import pandas as pd
TINY_FILE = r'baby_train.csv'
# TINY_FILE = r'indeed_reviews_processed.csv'
tiny_df = pd.read_csv(TINY_FILE)
# tiny_df['review_id'] = tiny_df.index
tiny_df['combined_text'] = tiny_df['review_title'] + '. ' + tiny_df['review_verbatim']
# New: drop the other text columns that got combined
tiny_df = tiny_df.drop(columns=['review_title','review_verbatim'])
tiny_df.head(2)

ParserError: Error tokenizing data. C error: Expected 7 fields in line 5186, saw 13


In [3]:
# New: reformat the date
from datetime import datetime

tiny_df['date'] = tiny_df['date'].apply(lambda x: datetime.strptime(x, r"%B %d, %Y").strftime(r"%m/%d/%Y"))
tiny_df.head()

Unnamed: 0,role,status,location,date,rating,combined_text
0,Business Analysis Consultant Sr - IT,Current Employee,"Pittsburgh, PA",10/24/2022,3.0,"Good work life balance, but lower than average..."
1,Bank Teller,Former Employee,"Bristol, PA",10/23/2022,4.0,Not a very stable place at the moment. this ba...
2,Mortgage Loan Processor III,Former Employee,Downers Grove Illinois,10/22/2022,4.0,Good Place to Work. this bank was a good place...
3,Lockbox Processor,Former Employee,"Mount Laurel, NJ",10/22/2022,4.0,4 stars. What is the best part of working at t...


In [26]:
import spacy

nlp = spacy.load('en_core_web_sm')

text = 'this bank overall was not that bad to work for however, they are doing away with allot of positions and they are making allot of branches cashless. If you are a banker you will be fine but I would not waste my time being a teller. They have said within 5 years over 45% of tellers will be gone. They are doing away with positions and putting the extra work on others without any extra compensation. I woukd not go back'
doc = nlp(text)

print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])

for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['they', 'allot', 'positions', 'they', 'allot', 'branches cashless', 'you', 'a banker', 'you', 'I', 'my time', 'a teller', 'They', '5 years', '45%', 'tellers', 'They', 'positions', 'the extra work', 'others', 'any extra compensation', 'I']
5 years DATE
45% PERCENT


In [24]:
from fastcoref import spacy_component
import spacy

# review = ['this bank overall was not that bad to work for however, they are doing away with allot of positions and they are making allot of branches cashless. If you are a banker you will be fine but I would not waste my time being a teller. They have said within 5 years over 45% of tellers will be gone. They are doing away with positions and putting the extra work on others without any extra compensation. I woukd not go back']
review = tiny_df['combined_text']

nlp = spacy.load("en_core_web_sm", exclude=["parser", "lemmatizer", "ner", "textcat"])
nlp.add_pipe("fastcoref")

docs = nlp.pipe(
   review, 
   component_cfg={"fastcoref": {'resolve_text': True}}
)

changed_list = []
for doc in docs:
   changed_list.append(doc._.resolved_text)

tiny_df['antecedents_replaced'] = changed_list
tiny_df.head()



10/26/2022 21:48:40 - INFO - 	 missing_keys: []
10/26/2022 21:48:40 - INFO - 	 unexpected_keys: []
10/26/2022 21:48:40 - INFO - 	 mismatched_keys: []
10/26/2022 21:48:40 - INFO - 	 error_msgs: []
10/26/2022 21:48:40 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
10/26/2022 21:48:40 - INFO - 	 Tokenize 4 texts...
  0%|          | 0/1 [00:00<?, ?ba/s]
10/26/2022 21:48:41 - INFO - 	 ***** Running Inference on 4 texts *****
Inference: 100%|██████████| 4/4 [00:01<00:00,  2.50it/s]


Unnamed: 0,role,status,location,date,rating,combined_text,antecedents_replaced
0,Business Analysis Consultant Sr - IT,Current Employee,"Pittsburgh, PA",10/24/2022,3.0,"Good work life balance, but lower than average...","Good work life balance, but lower than average..."
1,Bank Teller,Former Employee,"Bristol, PA",10/23/2022,4.0,Not a very stable place at the moment. this ba...,Not a very stable place at the moment. this ba...
2,Mortgage Loan Processor III,Former Employee,Downers Grove Illinois,10/22/2022,4.0,Good Place to Work. this bank was a good place...,Good Place to Work. this bank was a good place...
3,Lockbox Processor,Former Employee,"Mount Laurel, NJ",10/22/2022,4.0,4 stars. What is the best part of working at t...,4 stars. What is the best part of working at t...


## Now to split each review into individual sentences
Sentencizer from spaCy does basic rule-based sentence boundary detection

In [68]:
review = 'this bank overall was not that bad to work for however, they are doing away with allot of positions and they are making allot of branches cashless. If you are a banker you will be fine but I would not waste my time being a teller. They have said within 5 years over 45% of tellers will be gone. They are doing away with positions and putting the extra work on others without any extra compensation. I woukd not go back'
sentence_doc = nlp(review)
sents_list = []
for sent in sentence_doc.sents:
    sents_list.append(sent.text)

print(sents_list)

TypeError: 'tuple' object is not callable

In [27]:
tiny_df['review_id'] = tiny_df.index
simple_nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "ner", "textcat"])
tiny_df["A"] = tiny_df["antecedents_replaced"].apply(lambda x: [sent.text for sent in simple_nlp(x).sents])

tiny_df.head()

Unnamed: 0,role,status,location,date,rating,combined_text,antecedents_replaced,review_id,A
0,Business Analysis Consultant Sr - IT,Current Employee,"Pittsburgh, PA",10/24/2022,3.0,"Good work life balance, but lower than average...","Good work life balance, but lower than average...",0,"[Good work life balance, but lower than averag..."
1,Bank Teller,Former Employee,"Bristol, PA",10/23/2022,4.0,Not a very stable place at the moment. this ba...,Not a very stable place at the moment. this ba...,1,"[Not a very stable place at the moment., this ..."
2,Mortgage Loan Processor III,Former Employee,Downers Grove Illinois,10/22/2022,4.0,Good Place to Work. this bank was a good place...,Good Place to Work. this bank was a good place...,2,"[Good Place to Work., this bank was a good pla..."
3,Lockbox Processor,Former Employee,"Mount Laurel, NJ",10/22/2022,4.0,4 stars. What is the best part of working at t...,4 stars. What is the best part of working at t...,3,"[4 stars., What is the best part of working at..."


In [29]:
from transformers import pipeline
sentiment_model = pipeline("sentiment-analysis")
tiny_df["B"] = tiny_df["A"].apply(lambda x: sentiment_model(x))
tiny_df.head()

In [30]:
# Make a new dataframe of the separate sentences, but keep the review_id somehow
# tiny_df.rename(columns={"sentences":"A","sentiment":"B"}, inplace=True)
df = tiny_df.explode(list('AB'))
df.head()

Unnamed: 0,role,status,location,date,rating,combined_text,antecedents_replaced,review_id,A,B
0,Business Analysis Consultant Sr - IT,Current Employee,"Pittsburgh, PA",10/24/2022,3.0,"Good work life balance, but lower than average...","Good work life balance, but lower than average...",0,"Good work life balance, but lower than average...","{'label': 'POSITIVE', 'score': 0.8460509777069..."
0,Business Analysis Consultant Sr - IT,Current Employee,"Pittsburgh, PA",10/24/2022,3.0,"Good work life balance, but lower than average...","Good work life balance, but lower than average...",0,Work assignments are challenging and the organ...,"{'label': 'POSITIVE', 'score': 0.9516314268112..."
0,Business Analysis Consultant Sr - IT,Current Employee,"Pittsburgh, PA",10/24/2022,3.0,"Good work life balance, but lower than average...","Good work life balance, but lower than average...",0,Employees who perform and earn high ratings do...,"{'label': 'POSITIVE', 'score': 0.9978299736976..."
0,Business Analysis Consultant Sr - IT,Current Employee,"Pittsburgh, PA",10/24/2022,3.0,"Good work life balance, but lower than average...","Good work life balance, but lower than average...",0,the organization as a whole is flexible with r...,"{'label': 'POSITIVE', 'score': 0.9997872710227..."
0,Business Analysis Consultant Sr - IT,Current Employee,"Pittsburgh, PA",10/24/2022,3.0,"Good work life balance, but lower than average...","Good work life balance, but lower than average...",0,Negative would be that job titles and roles of...,"{'label': 'NEGATIVE', 'score': 0.999702513217926}"


In [62]:
simple_nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "ner", "textcat"])

tiny_df["sentences"] = tiny_df["antecedents_replaced"].apply(lambda x: [sent.text for sent in simple_nlp(x).sents])

tiny_df.to_csv('test1.csv')

## Sentiment from HuggingFace
Need to connect to HuggingFace models such as Bert-base-multilingual-uncased-sentiment

In [69]:
from transformers import pipeline

sentiment_model = pipeline("sentiment-analysis")

data = ['Good work life balance, but lower than average compensation','Work assignments are challenging and the organization as a whole shifted focus to advancing technology.']
sentiment_model(data)

[{'label': 'POSITIVE', 'score': 0.7007424831390381},
 {'label': 'POSITIVE', 'score': 0.9516314268112183}]

In [71]:
# Now can I get a corresponding sentiment for each sentence?
tiny_df["sentiment"] = tiny_df["sentences"].apply(lambda x: sentiment_model(x))

tiny_df.to_csv('test2_sentiment.csv')