# Text pre-processing
## Combine review title and body

In [14]:
import pandas as pd

# Raw review export. It is read with header=None, so the column names are
# assigned explicitly right after loading.
INPUT_FILE = r'reviews_part_aa'
df = pd.read_csv(INPUT_FILE, header=None)
df.columns=["review_title","review_verbatim","role","status","location","date","rating"]

# A missing title or body is loaded as NaN, and NaN + str stays NaN, which
# would leave a non-string value in `combined_text` for the NLP steps that
# consume it. Treat missing parts as empty text and only insert the '. '
# separator when both parts are present; rows with both parts are unchanged.
title_text = df['review_title'].fillna('').astype(str)
body_text = df['review_verbatim'].fillna('').astype(str)
separator = ((title_text != '') & (body_text != '')).map({True: '. ', False: ''})
df['combined_text'] = title_text + separator + body_text
df.head()

Unnamed: 0,review_title,review_verbatim,role,status,location,date,rating,combined_text
0,Great place to work for most,"I have really enjoyed working here, the only i...",Branch Sales and Service Associate II,Current Employee,"Ann Arbor, MI","March 27, 2019",4.0,Great place to work for most. I have really en...
1,It's a job,You go in five days a week do what you need to...,Cook,Current Employee,"San Antonio, TX","June 2, 2022",3.0,It's a job. You go in five days a week do what...
2,Stressful,Banking as a whole has become an awful industr...,Branch Manager,Former Employee,Florida,"June 1, 2022",2.0,Stressful. Banking as a whole has become an aw...
3,Micromanaged and very little room for growth,I worked for this bank for about 10 years and ...,Executive Client Relations,Former Employee,OH,"June 1, 2022",3.0,Micromanaged and very little room for growth. ...
4,Stressful environment,Core service consultant. This place is essenti...,Customer Service Representative,Former Employee,"Kalamazoo, MI","June 1, 2022",1.0,Stressful environment . Core service consultan...


In [15]:
from fastcoref import spacy_component
import spacy

# Coreference step: run every combined review through spaCy with the fastcoref
# component and keep the text exposed as `doc._.resolved_text`.
review = df['combined_text']

# The components listed in `exclude` are dropped from the small English
# pipeline before fastcoref is appended to it.
nlp = spacy.load(
    "en_core_web_sm",
    exclude=["parser", "lemmatizer", "ner", "textcat"],
)
nlp.add_pipe("fastcoref")

# Per-component config forwarded by `nlp.pipe`; 'resolve_text' is switched on
# so that `resolved_text` is available on each processed doc.
fastcoref_cfg = {"fastcoref": {'resolve_text': True}}
docs = nlp.pipe(review, component_cfg=fastcoref_cfg)

# `docs` is consumed lazily; collect one resolved string per review, in order.
changed_list = [doc._.resolved_text for doc in docs]

df['antecedents_replaced'] = changed_list
df.head()

10/26/2022 00:15:13 - INFO - 	 missing_keys: []
10/26/2022 00:15:13 - INFO - 	 unexpected_keys: []
10/26/2022 00:15:13 - INFO - 	 mismatched_keys: []
10/26/2022 00:15:13 - INFO - 	 error_msgs: []
10/26/2022 00:15:13 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
10/26/2022 00:15:14 - INFO - 	 Tokenize 199 texts...
  0%|          | 0/1 [00:01<?, ?ba/s]
10/26/2022 00:15:16 - INFO - 	 ***** Running Inference on 199 texts *****
Inference: 100%|██████████| 199/199 [00:51<00:00,  3.88it/s]


Unnamed: 0,review_title,review_verbatim,role,status,location,date,rating,combined_text,antecedents_replaced
0,Great place to work for most,"I have really enjoyed working here, the only i...",Branch Sales and Service Associate II,Current Employee,"Ann Arbor, MI","March 27, 2019",4.0,Great place to work for most. I have really en...,Great place to work for most. I have really en...
1,It's a job,You go in five days a week do what you need to...,Cook,Current Employee,"San Antonio, TX","June 2, 2022",3.0,It's a job. You go in five days a week do what...,It's a job. You go in five days a week do what...
2,Stressful,Banking as a whole has become an awful industr...,Branch Manager,Former Employee,Florida,"June 1, 2022",2.0,Stressful. Banking as a whole has become an aw...,Stressful. Banking as a whole has become an aw...
3,Micromanaged and very little room for growth,I worked for this bank for about 10 years and ...,Executive Client Relations,Former Employee,OH,"June 1, 2022",3.0,Micromanaged and very little room for growth. ...,Micromanaged and very little room for growth. ...
4,Stressful environment,Core service consultant. This place is essenti...,Customer Service Representative,Former Employee,"Kalamazoo, MI","June 1, 2022",1.0,Stressful environment . Core service consultan...,Stressful environment . Core service consultan...


## Now to split each review into individual sentences
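Sentence boundaries are taken from `doc.sents` of spaCy's `en_core_web_sm` pipeline, which is loaded below with its dependency parser kept, so the splits come from the parser. spaCy's rule-based `Sentencizer` is a simpler, faster alternative for basic sentence boundary detection.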

In [16]:
# Second pipeline used only for sentence splitting. The parser is not in the
# exclude list, so `doc.sents` is available on the processed docs.
simple_nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "ner", "textcat"])

# Stream the texts through `pipe` so spaCy can process them in batches rather
# than running one full pipeline call per row. `pipe` yields docs in input
# order, so the result is realigned with `df` through its index.
df["sentences"] = pd.Series(
    [
        [sent.text for sent in doc.sents]
        for doc in simple_nlp.pipe(df["antecedents_replaced"])
    ],
    index=df.index,
)

df.head()

Unnamed: 0,review_title,review_verbatim,role,status,location,date,rating,combined_text,antecedents_replaced,sentences
0,Great place to work for most,"I have really enjoyed working here, the only i...",Branch Sales and Service Associate II,Current Employee,"Ann Arbor, MI","March 27, 2019",4.0,Great place to work for most. I have really en...,Great place to work for most. I have really en...,"[Great place to work for most., I have really ..."
1,It's a job,You go in five days a week do what you need to...,Cook,Current Employee,"San Antonio, TX","June 2, 2022",3.0,It's a job. You go in five days a week do what...,It's a job. You go in five days a week do what...,"[It's a job., You go in five days a week do wh..."
2,Stressful,Banking as a whole has become an awful industr...,Branch Manager,Former Employee,Florida,"June 1, 2022",2.0,Stressful. Banking as a whole has become an aw...,Stressful. Banking as a whole has become an aw...,"[Stressful., Banking as a whole has become an ..."
3,Micromanaged and very little room for growth,I worked for this bank for about 10 years and ...,Executive Client Relations,Former Employee,OH,"June 1, 2022",3.0,Micromanaged and very little room for growth. ...,Micromanaged and very little room for growth. ...,[Micromanaged and very little room for growth....
4,Stressful environment,Core service consultant. This place is essenti...,Customer Service Representative,Former Employee,"Kalamazoo, MI","June 1, 2022",1.0,Stressful environment . Core service consultan...,Stressful environment . Core service consultan...,"[Stressful environment ., Core service consult..."


## Sentiment from HuggingFace
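This step needs to download a model from the HuggingFace Hub. The cell below uses the default model of the `sentiment-analysis` pipeline, which labels each input POSITIVE or NEGATIVE (see the output further down); other models, such as Bert-base-multilingual-uncased-sentiment, could be substituted.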

In [17]:
from transformers import pipeline

# Pin the checkpoint instead of relying on the pipeline's implicit default, so
# a library upgrade cannot silently change the model behind these results.
# NOTE(review): this is the checkpoint the "sentiment-analysis" task is expected
# to default to (POSITIVE/NEGATIVE labels) -- confirm against the load log.
SENTIMENT_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_model = pipeline("sentiment-analysis", model=SENTIMENT_MODEL_NAME)

# Quick manual check on two hand-written sentences (uncomment to try):
# data = ['Good work life balance, but lower than average compensation','Work assignments are challenging and the organization as a whole shifted focus to advancing technology.']
# sentiment_model(data)

In [18]:
# Attach a sentiment prediction to every sentence of every review.
def score_sentences(sentences):
    """Return the sentiment model's predictions for a list of sentences.

    Parameters
    ----------
    sentences : list of str
        Sentences belonging to one review.

    Returns
    -------
    list
        Whatever `sentiment_model` returns for the batch (one entry per
        sentence), or an empty list when there are no sentences to score.
    """
    # Nothing to classify: skip the model call entirely.
    if not sentences:
        return []
    # truncation=True is forwarded to the tokenizer so that an unusually long
    # "sentence" is cut to the model's maximum input length instead of failing.
    return sentiment_model(sentences, truncation=True)


df["sentiment"] = df["sentences"].apply(score_sentences)

df.head()

Unnamed: 0,review_title,review_verbatim,role,status,location,date,rating,combined_text,antecedents_replaced,sentences,sentiment
0,Great place to work for most,"I have really enjoyed working here, the only i...",Branch Sales and Service Associate II,Current Employee,"Ann Arbor, MI","March 27, 2019",4.0,Great place to work for most. I have really en...,Great place to work for most. I have really en...,"[Great place to work for most., I have really ...","[{'label': 'POSITIVE', 'score': 0.999877214431..."
1,It's a job,You go in five days a week do what you need to...,Cook,Current Employee,"San Antonio, TX","June 2, 2022",3.0,It's a job. You go in five days a week do what...,It's a job. You go in five days a week do what...,"[It's a job., You go in five days a week do wh...","[{'label': 'POSITIVE', 'score': 0.999699831008..."
2,Stressful,Banking as a whole has become an awful industr...,Branch Manager,Former Employee,Florida,"June 1, 2022",2.0,Stressful. Banking as a whole has become an aw...,Stressful. Banking as a whole has become an aw...,"[Stressful., Banking as a whole has become an ...","[{'label': 'POSITIVE', 'score': 0.651394128799..."
3,Micromanaged and very little room for growth,I worked for this bank for about 10 years and ...,Executive Client Relations,Former Employee,OH,"June 1, 2022",3.0,Micromanaged and very little room for growth. ...,Micromanaged and very little room for growth. ...,[Micromanaged and very little room for growth....,"[{'label': 'NEGATIVE', 'score': 0.999523639678..."
4,Stressful environment,Core service consultant. This place is essenti...,Customer Service Representative,Former Employee,"Kalamazoo, MI","June 1, 2022",1.0,Stressful environment . Core service consultan...,Stressful environment . Core service consultan...,"[Stressful environment ., Core service consult...","[{'label': 'POSITIVE', 'score': 0.996458709239..."


In [19]:
# Persist the enriched frame without the index column. List-valued columns are
# written to CSV as their string representation.
OUTPUT_FILE = 'review_post.csv'
df.to_csv(OUTPUT_FILE, index=False)