# Text pre-processing
## Combine review title and body

In [11]:
import pandas as pd

INPUT_FILE = r'review_chunks/reviews_part_ag'
# The file is read with header=None, so the column names are supplied here.
REVIEW_COLUMNS = ["review_title", "review_verbatim", "role", "status", "location", "date", "rating"]

df = pd.read_csv(INPUT_FILE, header=None)
df.columns = REVIEW_COLUMNS

# Build one text per review: "<title>. <body>".
# A plain `title + '. ' + body` turns the whole result into NaN as soon as one
# of the two parts is missing, and the NLP pipeline further down only accepts
# strings. Missing parts are therefore replaced by '' and the '. ' separator is
# inserted only when both parts are present, so rows that have both a title
# and a body get exactly the same text as before.
title = df['review_title'].fillna('').astype(str)
body = df['review_verbatim'].fillna('').astype(str)
both_present = title.ne('') & body.ne('')
separator = both_present.map({True: '. ', False: ''})
df['combined_text'] = title + separator + body
df.head()

Unnamed: 0,review_title,review_verbatim,role,status,location,date,rating,combined_text
0,you will be busy!,Micro managed to the T. Score cards for everyt...,Underwriting Analyst,Former Employee,"Pittsburgh, PA","September 26, 2019",4.0,you will be busy!. Micro managed to the T. Sco...
1,Easy going,"very easy going,easy jobs, very nice and fun p...",Mold Press Operator,Former Employee,"Polo, IL","September 26, 2019",3.0,"Easy going. very easy going,easy jobs, very ni..."
2,Great place to work if you want to meet new pe...,My experience had a lot of ups and downs. Typi...,TELLER SUPERVISOR,Former Employee,Maryland,"September 26, 2019",3.0,Great place to work if you want to meet new pe...
3,Na,Fast paced. Customer service. Universal banker...,Services Associate I,Current Employee,"Charlotte, NC","September 25, 2019",2.0,Na. Fast paced. Customer service. Universal ba...
4,It was great,It was a great place to work a lot of advancem...,Customer Service Representative,Former Employee,"Columbus, OH","September 25, 2019",5.0,It was great. It was a great place to work a l...


In [12]:
from fastcoref import spacy_component
import spacy

# Coreference step: run every combined review through spaCy with the
# fastcoref component and keep the text it produces in `resolved_text`.
# Only the components not listed in `exclude` are loaded for this pipeline.
nlp = spacy.load(
    "en_core_web_sm",
    exclude=["parser", "lemmatizer", "ner", "textcat"],
)
nlp.add_pipe("fastcoref")

review = df['combined_text']

# `resolve_text` is switched on per call through component_cfg; the loop below
# then reads `doc._.resolved_text` from each processed document.
coref_cfg = {"fastcoref": {'resolve_text': True}}
docs = nlp.pipe(review, component_cfg=coref_cfg)

# nlp.pipe yields documents in input order, so the list lines up with df's rows.
changed_list = [doc._.resolved_text for doc in docs]

df['antecedents_replaced'] = changed_list
df.head()

10/26/2022 01:29:33 - INFO - 	 missing_keys: []
10/26/2022 01:29:33 - INFO - 	 unexpected_keys: []
10/26/2022 01:29:33 - INFO - 	 mismatched_keys: []
10/26/2022 01:29:33 - INFO - 	 error_msgs: []
10/26/2022 01:29:33 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
10/26/2022 01:29:34 - INFO - 	 Tokenize 200 texts...
  0%|          | 0/1 [00:01<?, ?ba/s]
10/26/2022 01:29:36 - INFO - 	 ***** Running Inference on 200 texts *****
Inference: 100%|██████████| 200/200 [00:51<00:00,  3.91it/s]


Unnamed: 0,review_title,review_verbatim,role,status,location,date,rating,combined_text,antecedents_replaced
0,you will be busy!,Micro managed to the T. Score cards for everyt...,Underwriting Analyst,Former Employee,"Pittsburgh, PA","September 26, 2019",4.0,you will be busy!. Micro managed to the T. Sco...,you will be busy!. Micro managed to the T. Sco...
1,Easy going,"very easy going,easy jobs, very nice and fun p...",Mold Press Operator,Former Employee,"Polo, IL","September 26, 2019",3.0,"Easy going. very easy going,easy jobs, very ni...","Easy going. very easy going,easy jobs, very ni..."
2,Great place to work if you want to meet new pe...,My experience had a lot of ups and downs. Typi...,TELLER SUPERVISOR,Former Employee,Maryland,"September 26, 2019",3.0,Great place to work if you want to meet new pe...,Great place to work if you want to meet new pe...
3,Na,Fast paced. Customer service. Universal banker...,Services Associate I,Current Employee,"Charlotte, NC","September 25, 2019",2.0,Na. Fast paced. Customer service. Universal ba...,Na. Fast paced. Customer service. Universal ba...
4,It was great,It was a great place to work a lot of advancem...,Customer Service Representative,Former Employee,"Columbus, OH","September 25, 2019",5.0,It was great. It was a great place to work a l...,It was great. It was a great place to work a l...


## Now to split each review into individual sentences
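Sentence boundaries come from the `en_core_web_sm` pipeline, which is loaded below with its dependency parser still enabled. (spaCy's rule-based `Sentencizer` is a lighter alternative for basic sentence boundary detection, but it is not added to the pipeline in this cell.)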

In [13]:
# Second spaCy pipeline, used only for sentence splitting. The parser is not
# in the exclude list here, so it stays loaded.
# NOTE(review): presumably the parser is what supplies `.sents` - confirm.
simple_nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "ner", "textcat"])


def _split_sentences(text):
    """Return the text of each sentence spaCy finds in `text`, in document order."""
    parsed = simple_nlp(text)
    return [span.text for span in parsed.sents]


# One list of sentence strings per review.
df["sentences"] = df["antecedents_replaced"].apply(_split_sentences)

df.head()

Unnamed: 0,review_title,review_verbatim,role,status,location,date,rating,combined_text,antecedents_replaced,sentences
0,you will be busy!,Micro managed to the T. Score cards for everyt...,Underwriting Analyst,Former Employee,"Pittsburgh, PA","September 26, 2019",4.0,you will be busy!. Micro managed to the T. Sco...,you will be busy!. Micro managed to the T. Sco...,"[you will be busy!., Micro managed to the T. S..."
1,Easy going,"very easy going,easy jobs, very nice and fun p...",Mold Press Operator,Former Employee,"Polo, IL","September 26, 2019",3.0,"Easy going. very easy going,easy jobs, very ni...","Easy going. very easy going,easy jobs, very ni...","[Easy going., very easy going,easy jobs, very ..."
2,Great place to work if you want to meet new pe...,My experience had a lot of ups and downs. Typi...,TELLER SUPERVISOR,Former Employee,Maryland,"September 26, 2019",3.0,Great place to work if you want to meet new pe...,Great place to work if you want to meet new pe...,[Great place to work if you want to meet new p...
3,Na,Fast paced. Customer service. Universal banker...,Services Associate I,Current Employee,"Charlotte, NC","September 25, 2019",2.0,Na. Fast paced. Customer service. Universal ba...,Na. Fast paced. Customer service. Universal ba...,"[Na., Fast paced., Customer service., Universa..."
4,It was great,It was a great place to work a lot of advancem...,Customer Service Representative,Former Employee,"Columbus, OH","September 25, 2019",5.0,It was great. It was a great place to work a l...,It was great. It was a great place to work a l...,"[It was great., It was a great place to work a..."


## Sentiment from HuggingFace
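This step downloads a pretrained model from the HuggingFace Hub (for example `nlptown/bert-base-multilingual-uncased-sentiment`) and runs it through the `transformers` `sentiment-analysis` pipeline.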

In [14]:
from transformers import pipeline

# Pin the checkpoint explicitly. Without `model=...` the pipeline falls back to
# whatever default the installed transformers version ships (and warns about
# it), so results could change silently after an upgrade. This checkpoint emits
# the POSITIVE / NEGATIVE labels seen in the outputs below.
SENTIMENT_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

sentiment_model = pipeline("sentiment-analysis", model=SENTIMENT_MODEL_NAME)

# Quick smoke test (uncomment to try the model on two sample sentences):
# data = ['Good work life balance, but lower than average compensation','Work assignments are challenging and the organization as a whole shifted focus to advancing technology.']
# sentiment_model(data)

In [15]:
def _score_sentences(sentences):
    """Return one sentiment result per sentence of a single review.

    Parameters
    ----------
    sentences : list of str
        The sentences of one review.

    Returns
    -------
    list
        Whatever `sentiment_model` returns for the batch; the recorded output
        shows dicts with 'label' and 'score' keys. An empty input gives an
        empty list without calling the model.
    """
    if not sentences:
        return []
    # truncation=True keeps an over-long "sentence" (e.g. an unpunctuated
    # review) from exceeding the model's maximum input length and aborting the
    # whole run; inputs that already fit are unaffected.
    return sentiment_model(sentences, truncation=True)


# One list of sentiment results per review, aligned with df["sentences"].
df["sentiment"] = df["sentences"].apply(_score_sentences)

df.head()

Unnamed: 0,review_title,review_verbatim,role,status,location,date,rating,combined_text,antecedents_replaced,sentences,sentiment
0,you will be busy!,Micro managed to the T. Score cards for everyt...,Underwriting Analyst,Former Employee,"Pittsburgh, PA","September 26, 2019",4.0,you will be busy!. Micro managed to the T. Sco...,you will be busy!. Micro managed to the T. Sco...,"[you will be busy!., Micro managed to the T. S...","[{'label': 'NEGATIVE', 'score': 0.997491240501..."
1,Easy going,"very easy going,easy jobs, very nice and fun p...",Mold Press Operator,Former Employee,"Polo, IL","September 26, 2019",3.0,"Easy going. very easy going,easy jobs, very ni...","Easy going. very easy going,easy jobs, very ni...","[Easy going., very easy going,easy jobs, very ...","[{'label': 'POSITIVE', 'score': 0.987976372241..."
2,Great place to work if you want to meet new pe...,My experience had a lot of ups and downs. Typi...,TELLER SUPERVISOR,Former Employee,Maryland,"September 26, 2019",3.0,Great place to work if you want to meet new pe...,Great place to work if you want to meet new pe...,[Great place to work if you want to meet new p...,"[{'label': 'POSITIVE', 'score': 0.999869465827..."
3,Na,Fast paced. Customer service. Universal banker...,Services Associate I,Current Employee,"Charlotte, NC","September 25, 2019",2.0,Na. Fast paced. Customer service. Universal ba...,Na. Fast paced. Customer service. Universal ba...,"[Na., Fast paced., Customer service., Universa...","[{'label': 'NEGATIVE', 'score': 0.994416475296..."
4,It was great,It was a great place to work a lot of advancem...,Customer Service Representative,Former Employee,"Columbus, OH","September 25, 2019",5.0,It was great. It was a great place to work a l...,It was great. It was a great place to work a l...,"[It was great., It was a great place to work a...","[{'label': 'POSITIVE', 'score': 0.999874949455..."


In [16]:
from pathlib import Path

OUTPUT_FILE = Path('review_post.csv')

# Each input chunk is appended to the same CSV. The header row is written only
# when the file does not exist yet (or is empty), which replaces the manual
# switch between a "first run" and an "append" version of this line.
# NOTE: re-running this cell for the same chunk appends its rows again.
write_header = (not OUTPUT_FILE.exists()) or OUTPUT_FILE.stat().st_size == 0
df.to_csv(OUTPUT_FILE, mode='a', index=False, header=write_header)

In [21]:
# Baseline memory footprint of the full frame; 'deep' also counts the contents
# of the object (string / list) columns rather than just their pointers.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   review_title          199 non-null    object 
 1   review_verbatim       199 non-null    object 
 2   role                  199 non-null    object 
 3   status                199 non-null    object 
 4   location              199 non-null    object 
 5   date                  199 non-null    object 
 6   rating                199 non-null    float64
 7   combined_text         199 non-null    object 
 8   antecedents_replaced  199 non-null    object 
 9   sentences             199 non-null    object 
 10  sentiment             199 non-null    object 
dtypes: float64(1), object(10)
memory usage: 350.5 KB


In [23]:
# Shrink the only numeric column to half precision and measure again.
# NOTE(review): float16 support in pandas is limited - confirm that later
# steps which use `rating` accept this dtype.
rating_half = df['rating'].astype('float16')
df['rating'] = rating_half
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   review_title          199 non-null    object 
 1   review_verbatim       199 non-null    object 
 2   role                  199 non-null    object 
 3   status                199 non-null    object 
 4   location              199 non-null    object 
 5   date                  199 non-null    object 
 6   rating                199 non-null    float16
 7   combined_text         199 non-null    object 
 8   antecedents_replaced  199 non-null    object 
 9   sentences             199 non-null    object 
 10  sentiment             199 non-null    object 
dtypes: float16(1), object(10)
memory usage: 349.3 KB


In [24]:
# Memory used by the combined text column on its own.
# The column is bound to its own name instead of overwriting `df`: rebinding
# `df` to a Series makes this cell fail with a KeyError when it is run a second
# time and breaks every earlier cell that expects the full DataFrame.
combined_text = df['combined_text']
combined_text.info(memory_usage='deep')

<class 'pandas.core.series.Series'>
RangeIndex: 199 entries, 0 to 198
Series name: combined_text
Non-Null Count  Dtype 
--------------  ----- 
199 non-null    object
dtypes: object(1)
memory usage: 80.2 KB
