In [None]:
import pandas as pd
# Load the balanced dataset from the CSV file
df_balanced = pd.read_csv('balanced_sample.csv')

# Display the first few rows of the dataset
print("First 5 rows of the balanced sample:")
print(df_balanced.head())

# List all column names
print("\nColumn names in the balanced sample dataset:")
print(df_balanced.columns.tolist())


In [3]:
# Check for any missing values
missing_data = df_balanced.isnull().sum()
print(missing_data)

# Drop rows where key columns (like full_review) are missing
df_balanced_cleaned = df_balanced.dropna(subset=['full_review', 'rating', 'treated'])

# Verify the cleaning step
print(df_balanced_cleaned.head())


rating                          0
title                          15
status                          0
pros                            0
cons                            0
advice                       4018
Recommend                       0
CEO Approval                    0
Business Outlook                0
Career Opportunities          796
Compensation and Benefits     805
Senior Management             845
Work/Life Balance             812
Culture & Values              951
Diversity & Inclusion        2697
firm_link                       0
date                            0
job                             0
index                        4018
full_review                     0
treated                         0
dtype: int64
   rating                                             title  \
0     5.0                 Project Engineering Summer Intern   
1     3.0  Opportunities for advancement is in short supply   
2     4.0                                     Solid Company   
3     4.0          Gr

In [4]:
# Create a dictionary to map categorical ratings to numerical sentiment values
sentiment_mapping = {
    'v': 1,    # Positive
    'r': 0.5,  # Mild
    'x': -1,   # Negative
    'o': 0     # No opinion
}

# Map the columns (CEO Approval and Business Outlook) to numeric sentiment values
df_balanced_cleaned['CEO_Approval_Sentiment'] = df_balanced_cleaned['CEO Approval'].map(sentiment_mapping)
df_balanced_cleaned['Business_Outlook_Sentiment'] = df_balanced_cleaned['Business Outlook'].map(sentiment_mapping)

# Check the first few rows to see the transformed data
print(df_balanced_cleaned[['CEO Approval', 'CEO_Approval_Sentiment', 'Business Outlook', 'Business_Outlook_Sentiment']].head())


  CEO Approval  CEO_Approval_Sentiment Business Outlook  \
0            o                     0.0                o   
1            r                     0.5                r   
2            v                     1.0                v   
3            v                     1.0                v   
4            o                     0.0                v   

   Business_Outlook_Sentiment  
0                         0.0  
1                         0.5  
2                         1.0  
3                         1.0  
4                         1.0  


In [6]:
# Display the unique categories in the 'status' column
unique_status = df_balanced['status'].unique()
# Display the unique status categories
unique_status

array(['Former Employee', 'Current Employee',
       'Former Employee, more than 3 years',
       'Current Employee, more than 8 years',
       'Current Employee, more than 1 year',
       'Former Employee, less than 1 year',
       'Current Employee, more than 5 years',
       'Current Employee, more than 10 years',
       'Current Employee, more than 3 years',
       'Former Employee, more than 1 year',
       'Former Employee, more than 10 years',
       'Current Employee, less than 1 year',
       'Former Employee, more than 8 years',
       'Former Employee, more than 5 years',
       'Current Intern, less than 1 year',
       'Former Intern, less than 1 year', 'Former Temporary Employee'],
      dtype=object)

In [9]:
## function to map seniorrty levels to use as a control variable
def map_seniority(status):
    if isinstance(status, str):
        # Handle 'Intern' as entry level
        if 'Intern' in status:
            return 'Entry-Level'
        
        # Handle employees with less than 1 year or around 1 year experience
        elif 'less than 1 year' in status:
            return 'Junior'
        
        # Handle employees with experience from 1 to 3 years (Junior)
        elif 'more than 1 year' in status and 'more than 3 years' not in status:
            return 'Junior'
        
        # Handle employees with 3 to 5 years of experience (Mid-Level)
        elif 'more than 3 years' in status and 'more than 5 years' not in status:
            return 'Mid-Level'
        
        # Handle employees with 5 to 10 years of experience (Senior)
        elif 'more than 5 years' or 'more than 8 years' in status and 'more than 10 years' not in status:
            return 'Senior'
        
        # Handle employees with more than 10 years of experience (Very Senior)
        elif 'more than 10 years' in status:
            return 'Very Senior'
        
        # Handle 'Former Employee' status
        elif 'Former' in status:
            return 'Former Employee'
        
    return 'Unknown'  # Default if no match is found

# Apply the updated mapping logic
df_balanced_cleaned['seniority_level'] = df_balanced_cleaned['status'].apply(map_seniority)

# Check the results
df_balanced_cleaned[['status', 'seniority_level']].head(20)



Unnamed: 0,status,seniority_level
0,Former Employee,Senior
1,Current Employee,Senior
2,"Former Employee, more than 3 years",Mid-Level
3,"Current Employee, more than 8 years",Senior
4,Current Employee,Senior
5,"Current Employee, more than 1 year",Junior
6,"Former Employee, more than 3 years",Mid-Level
7,"Former Employee, less than 1 year",Junior
8,"Current Employee, more than 5 years",Senior
9,Current Employee,Senior


In [10]:
print(df_balanced_cleaned.head())

   rating                                             title  \
0     5.0                 Project Engineering Summer Intern   
1     3.0  Opportunities for advancement is in short supply   
2     4.0                                     Solid Company   
3     4.0          Great Company going through growth spurt   
4     4.0                                Good place to work   

                                status  \
0                      Former Employee   
1                     Current Employee   
2   Former Employee, more than 3 years   
3  Current Employee, more than 8 years   
4                     Current Employee   

                                                pros  \
0  Very approachable and kind coworkers. Made sur...   
1  Excellent medical, dental, vision benefits. Ge...   
2  Company operates in a very professional manner...   
3  Structured approach High standardization Growt...   
4  Great benefits, employees only pay 20% from pa...   

                               

In [11]:
pd.set_option('display.max_colwidth', None)

# Display the first few rows of the 'full_review' column for inspection
df_balanced_cleaned[['full_review']].head(10)

Unnamed: 0,full_review
0,"Very approachable and kind coworkers. Made sure you weren't bored or had nothing to do. Tasks they gave were interesting. Learned a lot. Honestly, none. Thought it was a great experience."
1,"Excellent medical, dental, vision benefits. Generous PTO Lack of career growth opportunities Lack of diversity in executive and senior leadership Those with accelerated career growth seems to be within the same circle Lack of support from some managers Lack of pay transparency"
2,Company operates in a very professional manner Management is sound On-site cafeteria serves good food at $5 a plate IT dept has exceptional leadership at the Vance location and in the US. Most departments waste money on an absurd level with little to no discipline or oversight. This money does not get put back into the betterment of the employees.
3,Structured approach High standardization Growth potential Clean environment High expectations Slow change curve
4,"Great benefits, employees only pay 20% from paycheck for benefits and company pays 80%, we have vacation and PTO, onsite cafeteria and fitness center. Company cares about their employees and will work with you when you have a problem instead of going directly to disciplinary action. No public transportation, not many local restaurants near by this makes onsite cafeteria appealing"
5,Propper management setup. Tier 2 supplier of automotive seats. Relatable job responsibilities should be framed according to the job description.
6,Great place to begin early in the career. Work-life balance is very minimal.
7,"Good pay, flexible schedules On-site cafeteria No benefits Micro-management Very short breaks Some Supervisors are insensitive and don’t demonstrate human decency"
8,"Really good benefits and awesome people to work with at the working level Understaffed, too many working hours, and poorly managed at project levels."
9,I love it there really gonna miss the people and the place when it shutdown…😣 The shut downs and the cap out rate


In [12]:
column_names = df_balanced_cleaned.columns.tolist()

# Display the list of column names
print(column_names)

['rating', 'title', 'status', 'pros', 'cons', 'advice', 'Recommend', 'CEO Approval', 'Business Outlook', 'Career Opportunities', 'Compensation and Benefits', 'Senior Management', 'Work/Life Balance', 'Culture & Values', 'Diversity & Inclusion', 'firm_link', 'date', 'job', 'index', 'full_review', 'treated', 'CEO_Approval_Sentiment', 'Business_Outlook_Sentiment', 'seniority_level']


In [13]:
# List of columns to be removed
columns_to_drop = ['title', 'status', 'pros', 'cons', 'advice', 'CEO Approval', 
                  'Career Opportunities', 'Compensation and Benefits', 
                  'Senior Management', 'Work/Life Balance', 'Culture & Values', 
                  'Diversity & Inclusion', 'index']

# Remove the specified columns from the DataFrame
df_balanced_cleaned = df_balanced_cleaned.drop(columns=columns_to_drop)

# Display the first few rows of the updated DataFrame to confirm
df_balanced_cleaned.head()


Unnamed: 0,rating,Recommend,Business Outlook,firm_link,date,job,full_review,treated,CEO_Approval_Sentiment,Business_Outlook_Sentiment,seniority_level
0,5.0,o,o,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-04-06,Project Engineering Intern,"Very approachable and kind coworkers. Made sure you weren't bored or had nothing to do. Tasks they gave were interesting. Learned a lot. Honestly, none. Thought it was a great experience.",1,0.0,0.0,Senior
1,3.0,x,r,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-04-07,Project Engineer,"Excellent medical, dental, vision benefits. Generous PTO Lack of career growth opportunities Lack of diversity in executive and senior leadership Those with accelerated career growth seems to be within the same circle Lack of support from some managers Lack of pay transparency",1,0.5,0.5,Senior
2,4.0,v,v,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-03-03,IT Engineer,Company operates in a very professional manner Management is sound On-site cafeteria serves good food at $5 a plate IT dept has exceptional leadership at the Vance location and in the US. Most departments waste money on an absurd level with little to no discipline or oversight. This money does not get put back into the betterment of the employees.,1,1.0,1.0,Mid-Level
3,4.0,v,v,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-03-19,Mechanical Engineer,Structured approach High standardization Growth potential Clean environment High expectations Slow change curve,1,1.0,1.0,Senior
4,4.0,v,v,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-03-02,Anonymous Employee,"Great benefits, employees only pay 20% from paycheck for benefits and company pays 80%, we have vacation and PTO, onsite cafeteria and fitness center. Company cares about their employees and will work with you when you have a problem instead of going directly to disciplinary action. No public transportation, not many local restaurants near by this makes onsite cafeteria appealing",1,0.0,1.0,Senior


In [19]:
# Define the updated EU Companies dictionary
eu_companies = {
    'brose': 'EU',  # Germany
    'edeka': 'EU',  # Germany
    'swarovski': 'EU',  # Austria
    'magirus international': 'EU',  # Germany
    'leuco': 'EU',  # Germany
    'zeppelin systems': 'EU',  # Germany
    'postbank': 'EU',  # Germany
    'autorola': 'EU',  # Denmark
    'nordsee': 'EU',  # Germany
    'flsmidth|flsmidth s.a.': 'EU',  # Denmark
}

# Function to classify as EU or Non-EU based on the company name
def assign_treatment(firm_link):
    # Extract the company name from the firm link (assuming it follows the format of "Reviews/{company_name}-Reviews")
    if isinstance(firm_link, str):
        firm_link = firm_link.lower().replace(' ', '-')
        # Check if the company name matches the EU dictionary
        for company in eu_companies.keys():
            if company in firm_link:
                return 1  # EU company, treated
        # If the company name is not in the EU list, classify as Non-EU (treated = 0)
        return 0
    return 0  # Default if firm link is not recognized

# Function to extract the company name from the firm_link
def extract_company_name(firm_link):
    if isinstance(firm_link, str):
        # Assuming company name is the text between "Reviews/" and "-Reviews"
        start_idx = firm_link.find("Reviews/") + len("Reviews/")
        end_idx = firm_link.find("-Reviews")
        if start_idx != -1 and end_idx != -1:
            return firm_link[start_idx:end_idx].replace('-', ' ').title()
    return 'Unknown'

# Apply the function to create the 'company_name' and 'EU_or_NonEU' columns
df_balanced_cleaned['company_name'] = df_balanced_cleaned['firm_link'].apply(extract_company_name)
df_balanced_cleaned['EU_or_NonEU'] = df_balanced_cleaned['firm_link'].apply(lambda x: 'EU' if assign_treatment(x) == 1 else 'Non-EU')

# Display the first few rows of the updated DataFrame to verify
df_balanced_cleaned[['company_name', 'EU_or_NonEU']].head(2)


Unnamed: 0,company_name,EU_or_NonEU
0,Brose,EU
1,Brose,EU


In [18]:
df_balanced_cleaned.head(10)

Unnamed: 0,rating,Recommend,Business Outlook,firm_link,date,job,full_review,treated,CEO_Approval_Sentiment,Business_Outlook_Sentiment,seniority_level,company_name,EU_or_NonEU
0,5.0,o,o,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-04-06,Project Engineering Intern,"Very approachable and kind coworkers. Made sure you weren't bored or had nothing to do. Tasks they gave were interesting. Learned a lot. Honestly, none. Thought it was a great experience.",1,0.0,0.0,Senior,Brose,EU
1,3.0,x,r,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-04-07,Project Engineer,"Excellent medical, dental, vision benefits. Generous PTO Lack of career growth opportunities Lack of diversity in executive and senior leadership Those with accelerated career growth seems to be within the same circle Lack of support from some managers Lack of pay transparency",1,0.5,0.5,Senior,Brose,EU
2,4.0,v,v,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-03-03,IT Engineer,Company operates in a very professional manner Management is sound On-site cafeteria serves good food at $5 a plate IT dept has exceptional leadership at the Vance location and in the US. Most departments waste money on an absurd level with little to no discipline or oversight. This money does not get put back into the betterment of the employees.,1,1.0,1.0,Mid-Level,Brose,EU
3,4.0,v,v,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-03-19,Mechanical Engineer,Structured approach High standardization Growth potential Clean environment High expectations Slow change curve,1,1.0,1.0,Senior,Brose,EU
4,4.0,v,v,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-03-02,Anonymous Employee,"Great benefits, employees only pay 20% from paycheck for benefits and company pays 80%, we have vacation and PTO, onsite cafeteria and fitness center. Company cares about their employees and will work with you when you have a problem instead of going directly to disciplinary action. No public transportation, not many local restaurants near by this makes onsite cafeteria appealing",1,0.0,1.0,Senior,Brose,EU
5,5.0,v,v,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-03-12,Quality Engineer,Propper management setup. Tier 2 supplier of automotive seats. Relatable job responsibilities should be framed according to the job description.,1,1.0,1.0,Junior,Brose,EU
6,4.0,o,o,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-03-15,Quality Engineer,Great place to begin early in the career. Work-life balance is very minimal.,1,0.0,0.0,Mid-Level,Brose,EU
7,3.0,o,o,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-03-27,Production Operator,"Good pay, flexible schedules On-site cafeteria No benefits Micro-management Very short breaks Some Supervisors are insensitive and don’t demonstrate human decency",1,0.0,0.0,Junior,Brose,EU
8,3.0,x,x,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-03-16,Engineer,"Really good benefits and awesome people to work with at the working level Understaffed, too many working hours, and poorly managed at project levels.",1,-1.0,-1.0,Senior,Brose,EU
9,5.0,v,r,https://www.glassdoor.com/Reviews/Brose-Reviews-E348696.htm,2023-02-12,Production Assembler,I love it there really gonna miss the people and the place when it shutdown…😣 The shut downs and the cap out rate,1,0.5,0.5,Senior,Brose,EU


In [20]:
df_balanced_cleaned.to_csv('df_balanced_cleaned.csv', index=False)
