# Sentiment Analysis
> Employer Reviews

In [1]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')
import csv
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from googletrans import Translator
from langdetect import detect
from IPython.display import *

## 1. Data Cleaning

In [2]:
# Import indeed reviews data
# Change file name accordingly
# Semicolon-separated file without a header row; the two columns are named here.
# csv.QUOTE_ALL is the named constant for the quoting value 1.
reviews_1 = pd.read_csv(
    "reviews_ind (backup).csv",
    sep=";",
    header=None,
    names=['Company', 'Review'],
    quoting=csv.QUOTE_ALL,
    skipinitialspace=True,
)

In [3]:
# Total number of reviews
# Row count of the raw reviews table
print('Total number of reviews: {}'.format(len(reviews_1)))

# First five rows as a quick sanity check
display(reviews_1.head())

Total number of reviews: 832368


Unnamed: 0,Company,Review
0,BAE Systems,BAE was an interesting company to work for. Th...
1,BAE Systems,I absolutely loved working for Bae Systems. It...
2,BAE Systems,"Good company, but certain departments are diff..."
3,BAE Systems,BAE was the worst job experience I have ever h...
4,BAE Systems,The work schedule was typically seven days a w...


In [28]:
# Format reviews dataframe

# Remove '\r\n' from reviews
def format_review(string):
    """Replace every CRLF line break in a review with a full stop.

    Parameters:
        string: the review text. Any non-string value (for example a
            missing-value placeholder) is accepted as well.

    Returns:
        The text with each carriage-return/line-feed pair replaced by '.',
        or the input itself when it is not a string.
    """
    # isinstance is the idiomatic type test; str.replace already leaves the
    # text untouched when the pattern is absent, so no pre-check is needed.
    if isinstance(string, str):
        return string.replace('\r\n', '.')
    return string
    
# Replace ".." with "." in reviews
def format_review2(string):
    """Collapse each pair of consecutive full stops in a review into one.

    The replacement is a single left-to-right pass, so a run of three dots
    still leaves two dots behind ('a...b' becomes 'a..b').

    Parameters:
        string: the review text; non-string values are accepted as well.

    Returns:
        The text with every non-overlapping '..' replaced by '.', or the
        input itself when it is not a string.
    """
    # isinstance is the idiomatic type test; str.replace is a no-op when the
    # pattern is absent, so the separate membership check is unnecessary.
    if isinstance(string, str):
        return string.replace('..', '.')
    return string

# Replace "\n" with "\n " in reviews
def format_review3(string):
    """Insert a space after every line feed in a review.

    Parameters:
        string: the review text; non-string values are accepted as well.

    Returns:
        The text with a single space added after each line-feed character,
        or the input itself when it is not a string.
    """
    # isinstance is the idiomatic type test; str.replace is a no-op when the
    # pattern is absent, so the separate membership check is unnecessary.
    if isinstance(string, str):
        return string.replace('\n', '\n ')
    return string

# Remove links in company names
def format_name(string):
    """Strip a trailing ';https...' link from a company name.

    Parameters:
        string: the company name; non-string values are accepted as well.

    Returns:
        Everything before the first ';https' in the name (the whole name
        when the marker is absent), or the input itself when it is not a
        string.
    """
    # isinstance is the idiomatic type test. partition returns the full text
    # as its first element when the separator is missing, so no membership
    # check is needed.
    if isinstance(string, str):
        return string.partition(';https')[0]
    return string

# Apply functions
def _respace_sentences(text):
    """Drop empty '.'-separated pieces and rejoin the rest with '. ' plus a final '.'."""
    if type(text) != str:
        return text
    pieces = [piece for piece in text.split('.') if piece != '']
    return '. '.join(pieces) + '.'


def _collapse_double_spaces(text):
    """Replace each pair of consecutive spaces with a single space."""
    if type(text) != str:
        return text
    return text.replace('  ', ' ')


# The cleaning steps run in this fixed order on the Review column
for cleaning_step in (format_review, _respace_sentences, _collapse_double_spaces,
                      format_review2, format_review3):
    reviews_1.Review = reviews_1.Review.map(cleaning_step)

# Company names only need their trailing link removed
reviews_1.Company = reviews_1.Company.map(format_name)

# Display reviews dataframe
display(reviews_1.head())

Unnamed: 0,Company,Review
0,BAE Systems,BAE was an interesting company to work for. Th...
1,BAE Systems,I absolutely loved working for Bae Systems. It...
2,BAE Systems,"Good company, but certain departments are diff..."
3,BAE Systems,BAE was the worst job experience I have ever h...
4,BAE Systems,The work schedule was typically seven days a w...


In [134]:
# Import sentiment analysis score (already saved as csv)
# Load the previously saved average scores (csv.QUOTE_ALL is the quoting value 1)
score_df = pd.read_csv(
    'Average Vader scores (alphabetical, no index).csv',
    sep=",",
    quoting=csv.QUOTE_ALL,
    skipinitialspace=True,
)

# Remove the row at position 197 and renumber the remaining rows from 0
score_df = score_df.drop(score_df.index[[197]]).reset_index(drop=True)

# Manual correction of one company name, then flag every row as contained
score_df.at[19, 'Company'] = 'Arvato SCM Ireland Limited'
score_df['Contained in future_bs'] = True
score_df

Unnamed: 0,Company,Score,Original number of reviews,Contained in future_bs
0,ABN AMRO,0.667300,837,True
1,Abtran,0.745081,806,True
2,Accenture,0.676800,12657,True
3,Adobe,0.739533,319,True
4,Age UK,0.720132,878,True
5,Airbus Group,0.650908,827,True
6,Aktiesport,0.753056,800,True
7,AkzoNobel,0.585241,907,True
8,Alcoa,0.540460,1239,True
9,ALDI,0.407706,2831,True


In [None]:
# Company names, uncleaned
# Load the employer/company pairs and build one row per distinct company with
# empty placeholders for the matching results.
future_bs_original = pd.read_csv("future_bs.csv", sep = ",", usecols = ['employer','company'],
                        quoting = 1, skipinitialspace = True, encoding = 'latin-1')
future_bs = pd.DataFrame(future_bs_original['company'].unique(), columns = ['Company'])
future_bs['Employer'] = None
future_bs['Review Score'] = np.nan
future_bs['Other name'] = None
future_bs['Employer of business schools graduates'] = False

# Copy the first 'employer' value recorded for each company.
for i in future_bs['Company']:
    # Selecting one column with a boolean mask yields a Series, which has a
    # single axis: its first element is .iloc[0]. The former .iloc[0,0] passed
    # two indexers to a one-dimensional object and raised an IndexingError.
    future_bs.loc[future_bs['Company'] == i, 'Employer'] = future_bs_original.loc[future_bs_original['company'] == i, 'employer'].iloc[0]

import re
def getWords(text):
    """Return every run of word characters found in ``text``, in order.

    Parameters:
        text: the string to split into words.

    Returns:
        A list of the non-overlapping matches of the regular expression
        ``\\w+`` (for example "Barnardo's" gives ['Barnardo', 's']).
    """
    # Raw string: '\w' in a normal string literal is an invalid escape
    # sequence that newer Python versions warn about.
    return re.findall(r'\w+', text)

# Match every scored company in score_df against the employer data.
# For each scored company i:
#   * its name is split into words and filler words are removed;
#   * every row m of future_bs_original is inspected;
#   * a match copies the score and the scored name into future_bs and sets the
#     'Employer of business schools graduates' flag on the matched row(s).
# All `in` tests below are case-sensitive substring tests (assuming the cells
# hold strings), not whole-word comparisons.
for i in range(len(score_df['Company'])):
    wordlist = getWords(score_df['Company'][i])
    redundant_words = ['a', 'an', 'the', 's', 've', 'en', 'van','S', 'N', 'O', 'A', 'Limited', 'Ltd', 'LTD']
    for word in redundant_words:
        try:
            # list.remove deletes only the first occurrence of the word
            wordlist.remove(word)
        except ValueError:
            # word not present in this name: nothing to remove
            None
    contained = False
    corresponding = None
    
    for m in range(len(future_bs_original['employer'])):
        # Case 1: the raw row has an employer name that differs from its company name
        if future_bs_original['employer'][m] != future_bs_original['company'][m]:
            if score_df['Company'][i] in future_bs_original['employer'][m]: #and future_bs.loc[future_bs['Company'] == future_bs_original['company'][m], 'Employer of business schools graduates'] == False:
                # Scored name is a substring of the employer name: write the
                # result to every future_bs row for the corresponding company
                # and stop scanning raw rows for this scored company.
                contained = True
                corresponding = future_bs_original['company'][m]
                future_bs.loc[future_bs['Company'] == corresponding, 'Review Score'] = score_df['Score'][i]
                future_bs.loc[future_bs['Company'] == corresponding, 'Other name'] = score_df['Company'][i]
                future_bs.loc[future_bs['Company'] == corresponding, 'Employer of business schools graduates'] = True
                break
            else:
                # Disabled word-by-word fallback, kept commented out by the author
                #for k in range(len(wordlist)):
                #    boolean = True
                #    if (wordlist[k] not in future_bs_original['employer'][m]):# and (future_bs.loc[future_bs['Company'] == future_bs_original['company'][m], 'Employer of business schools graduates'] == False):
                #        boolean = False
                #        break
                #    else:
                #        continue
                #if boolean == True and future_bs.loc[future_bs['Company'] == future_bs_original['company'][m], 'Employer of business schools graduates'] == False:
                #    contained = True
                #    corresponding = future_bs_original['company'][m]
                #    future_bs.loc[future_bs['Company'] == corresponding, 'Review Score'] = score_df['Score'][i]
                #    future_bs.loc[future_bs['Company'] == corresponding, 'Other name'] = score_df['Company'][i]
                #    future_bs.loc[future_bs['Company'] == corresponding, 'Employer of business schools graduates'] = True
                #else:
                #    None
                None
        # Case 2: employer and company are identical in the raw row
        else:
            # NOTE(review): this scan of future_bs does not depend on m, so it
            # is repeated for every raw row that reaches this branch.
            for j in range(len(future_bs['Company'])):
                # Names containing the "<U+" marker are skipped
                if '<U+' in future_bs['Company'][j]:
                    None
                else:
                    # Full scored name found inside an unclaimed company name:
                    # claim the row and stop scanning future_bs
                    if score_df['Company'][i] in future_bs['Company'][j] and future_bs.at[j, 'Employer of business schools graduates'] == False:
                        future_bs.at[j, 'Review Score'] = score_df['Score'][i]
                        future_bs.at[j, 'Other name'] = score_df['Company'][i]
                        future_bs.at[j, 'Employer of business schools graduates'] = True
                        contained = True
                        break
                    else:
                        # Word-by-word fallback: boolean stays True only if no
                        # remaining word is missing from the unclaimed name
                        for k in range(len(wordlist)):
                            boolean = True
                            if wordlist[k] not in future_bs['Company'][j] and future_bs.at[j, 'Employer of business schools graduates'] == False:
                                boolean = False
                                break
                            else:
                                continue
                        # NOTE(review): boolean is only assigned inside the k
                        # loop; when wordlist is empty it keeps the value from an
                        # earlier iteration (or is undefined on the first one).
                        # This branch has no break, so further rows can be
                        # claimed by the same scored company.
                        if boolean == True and future_bs.at[j, 'Employer of business schools graduates'] == False:
                            future_bs.at[j, 'Review Score'] = score_df['Score'][i]
                            future_bs.at[j, 'Other name'] = score_df['Company'][i]
                            future_bs.at[j, 'Employer of business schools graduates'] = True
                            contained = True
                        else:
                            None
            # NOTE(review): this check sits inside the m loop, so the flag is
            # set to False as soon as one Case-2 row is processed without a
            # match so far, and it is never set back to True afterwards.
            if contained == False:
                score_df.at[i,'Contained in future_bs'] = False
# Preview the first 20 rows of the result
future_bs.head(20)

In [136]:
# Company names, uncleaned
# Raw employer/company pairs (csv.QUOTE_ALL is the named constant for 1)
future_bs_original = pd.read_csv(
    "future_bs.csv",
    sep=",",
    usecols=['employer', 'company'],
    quoting=csv.QUOTE_ALL,
    skipinitialspace=True,
    encoding='latin-1',
)

# One row per distinct company, with empty placeholders for the match results
future_bs = pd.DataFrame(future_bs_original['company'].unique(), columns=['Company'])
future_bs['Employer'] = None
future_bs['Review Score'] = np.nan
future_bs['Other name'] = None
future_bs['Employer of business schools graduates'] = False

# Fill in the first employer value recorded for each company
for i in future_bs['Company']:
    employers_of_company = future_bs_original.loc[future_bs_original['company'] == i, 'employer']
    first_employer = employers_of_company.tolist()[0]
    future_bs.loc[future_bs['Company'] == i, 'Employer'] = first_employer

import re
def getWords(text):
    """Return every run of word characters found in ``text``, in order.

    Parameters:
        text: the string to split into words.

    Returns:
        A list of the non-overlapping matches of the regular expression
        ``\\w+`` (for example "Barnardo's" gives ['Barnardo', 's']).
    """
    # Raw string: '\w' in a normal string literal is an invalid escape
    # sequence that newer Python versions warn about.
    return re.findall(r'\w+', text)

# Match every scored company in score_df against the rows of future_bs.
# A future_bs row can be claimed once: each test also requires its
# 'Employer of business schools graduates' flag to still be False. Claiming a
# row copies the score and the scored name into it and sets the flag.
# All `in` tests are case-sensitive substring tests (assuming the cells hold
# strings), not whole-word comparisons.
for i in range(len(score_df['Company'])):
    wordlist = getWords(score_df['Company'][i])
    redundant_words = ['a', 'an', 'the', 's', 've', 'en', 'van','S', 'N', 'O', 'A', 'T', 'Limited', 'Ltd', 'LTD']
    for word in redundant_words:
        try:
            # list.remove deletes only the first occurrence of the word
            wordlist.remove(word)
        except ValueError:
            # word not present in this name: nothing to remove
            None
    contained = False
    
    for j in range(len(future_bs['Company'])):
        # Company names containing the "<U+" marker are skipped
        if '<U+' in future_bs['Company'][j]:
            None
        else:
            # 1) Full scored name found inside the Company or Employer text of
            #    an unclaimed row: claim it and stop scanning future_bs.
            if ((score_df['Company'][i] in future_bs['Company'][j]) or (score_df['Company'][i] in future_bs['Employer'][j])) and future_bs.at[j, 'Employer of business schools graduates'] == False:
                future_bs.at[j, 'Review Score'] = score_df['Score'][i]
                future_bs.at[j, 'Other name'] = score_df['Company'][i]
                future_bs.at[j, 'Employer of business schools graduates'] = True
                contained = True
                break
            else:
                for k in range(len(wordlist)):
                    # 2) A single all-upper-case word found in the Company or
                    #    Employer text of an unclaimed row is enough to claim it.
                    #    This break leaves only the word loop; the scan over
                    #    future_bs continues with the next row.
                    if (wordlist[k].isupper() == True) and (wordlist[k] in future_bs['Company'][j] or wordlist[k] in future_bs['Employer'][j]) and (future_bs.at[j, 'Employer of business schools graduates'] == False):
                        future_bs.at[j, 'Review Score'] = score_df['Score'][i]
                        future_bs.at[j, 'Other name'] = score_df['Company'][i]
                        future_bs.at[j, 'Employer of business schools graduates'] = True
                        contained = True
                        break
                    else:
                        # 3) Otherwise track whether a word is missing from both
                        #    texts of an unclaimed row; stop at the first one.
                        boolean = True
                        if ((wordlist[k] not in future_bs['Company'][j]) and (wordlist[k] not in future_bs['Employer'][j])) and future_bs.at[j, 'Employer of business schools graduates'] == False:
                            boolean = False
                            break
                        else:
                            continue
                # NOTE(review): boolean is only assigned inside the word loop.
                # With an empty wordlist, or when the first word takes the
                # upper-case branch, it keeps the value left by an earlier row
                # (or is undefined on the very first one).
                # This claim has no break, so the same scored company can go on
                # to claim further rows.
                if boolean == True and future_bs.at[j, 'Employer of business schools graduates'] == False:
                    future_bs.at[j, 'Review Score'] = score_df['Score'][i]
                    future_bs.at[j, 'Other name'] = score_df['Company'][i]
                    future_bs.at[j, 'Employer of business schools graduates'] = True
                    contained = True
                else:
                    None
    # No row was claimed for this scored company
    if contained == False:
        score_df.at[i,'Contained in future_bs'] = False
# Preview the first 20 rows of the result
future_bs.head(20)

Unnamed: 0,Company,Employer,Review Score,Other name,Employer of business schools graduates
0,YPF,YPF,,,False
1,IAE Business School,IAE Business School,,,False
2,Banco Galicia,Banco Galicia,,,False
3,"Universidad Austral, Buenos Aires","Universidad Austral, Buenos Aires",,,False
4,Buenos Aires City Government,Buenos Aires City Government,,,False
5,Molinos Rio de la Plata,Molinos Rio de la Plata,,,False
6,Telef<U+00CC>_nica Argentina,Telef<U+00CC>_nica Argentina,,,False
7,Ternium,Ternium,,,False
8,Banco Supervielle,Banco Supervielle,,,False
9,Grupo OSDE,Grupo OSDE,,,False


In [137]:
# Display the current contents of score_df
score_df

Unnamed: 0,Company,Score,Original number of reviews,Contained in future_bs
0,ABN AMRO,0.667300,837,True
1,Abtran,0.745081,806,False
2,Accenture,0.676800,12657,True
3,Adobe,0.739533,319,False
4,Age UK,0.720132,878,False
5,Airbus Group,0.650908,827,False
6,Aktiesport,0.753056,800,False
7,AkzoNobel,0.585241,907,False
8,Alcoa,0.540460,1239,False
9,ALDI,0.407706,2831,False


In [55]:
# Company names in which at least one space-separated word contains a
# character that is not alphanumeric; each such name is listed once.
names_with_weird_stuff = [
    name
    for name in future_bs['Company']
    if any(not character.isalnum()
           for word in name.split(' ')
           for character in word)
]
names_with_weird_stuff

['Universidad Austral, Buenos Aires',
 'Telef<U+00CC>_nica Argentina',
 'Cablevisi<U+00CC>_n - Fibertel',
 'FV - Grifer<U+00CC>_a de alta tecnolog<U+00CC>_a',
 'Pontificia Universidad Cat<U+00CC>_lica Argentina (UCA)',
 'Department of Education and Training, Victoria',
 'The Royal Melbourne Hospital & NorthWestern Mental Health',
 'klooster-ransom | public relations',
 'Department of Treasury and Finance, Victoria',
 'QUT (Queensland University of Technology)',
 'Department of Education and Training (Queensland)',
 "McDonald's",
 'Department of Education, Western Australia',
 'Department of Health (WA Health)',
 "Laing O'Rourke",
 'Traffic Barricade "Trusted In Traffic Safety"',
 'Department for Education and Child Development, South Australia',
 'Department of Planning, Transport and Infrastructure (DPTI)',
 'University of Sydney Union (USU)',
 'WU (Vienna University of Economics and Business)',
 "L'Or<U+00CC><U+00A9>al",
 'McKinsey & Company',
 'The Boston Consulting Group (BCG)',
 '

In [58]:
# Print the word list extracted from every scored company name
for scored_name in score_df['Company']:
    words_in_name = getWords(scored_name)
    print(words_in_name)

['ABN', 'AMRO']
['Abtran']
['Accenture']
['Adobe']
['Age', 'UK']
['Airbus', 'Group']
['Aktiesport']
['AkzoNobel']
['Alcoa']
['ALDI']
['ALH', 'Group']
['Amec', 'Foster', 'Wheeler']
['American', 'Express']
['Anadarko', 'Petroleum', 'Corporation']
['Anglicare']
['Anytime', 'Fitness']
['ANZ', 'Banking', 'Group']
['Apple']
['Argos']
['Arvato', 'SCM', 'Ireland', 'Limited']
['ASML']
['Australian', 'Army']
['Australian', 'Defence', 'Force']
['Australian', 'Electoral', 'Commission']
['Australian', 'Taxation', 'Office']
['BAE', 'Systems']
['Bank', 'of', 'Ireland']
['Barclays']
['Barnardo', 's']
['BBC']
['Bechtel']
['Behavioral', 'Health', 'Group']
['BGC', 'Contracting']
['Birmingham', 'City', 'Council']
['Blue', 'Care']
['Blue', 'Inc']
['Boots']
['Boston', 'Scientific']
['Bourne', 'Leisure']
['BP']
['Bras', 'N', 'Things']
['British', 'Airways']
['British', 'Army']
['British', 'Heart', 'Foundation']
['British', 'Red', 'Cross']
['British', 'Telecom']
['Brumby', 's', 'Bakery']
['Build', 'A', 'Bear'

In [None]:
# For every company in counts, check whether any of its words occurs
# (case-insensitively, as a substring) in a future_bs company name.
#   counter - number of (counts row, future_bs row) pairs that match
#   boolean - exactly one flag per counts row: True if it matched any row
#
# The previous version appended True once per matching future_bs row and
# appended False only when the last future_bs row failed to match, so the
# list had zero or several entries per counts row and could not be used as a
# column of counts.

# Lower-case the target names once instead of once per comparison
target_names = [name.lower() for name in future_bs['Company']]

counter = 0
boolean = []
for company in counts['Company']:
    # split() without arguments drops the empty tokens that repeated spaces
    # would produce; an empty token is a substring of every name and would
    # match everything.
    words = [word.lower() for word in company.split()]
    matching_rows = sum(
        1 for name in target_names if any(word in name for word in words)
    )
    counter += matching_rows
    boolean.append(matching_rows > 0)
counter

In [None]:
# Store the containment flags as a column. boolean must hold exactly one entry
# per row of counts, otherwise this assignment fails with a length mismatch.
counts['Contained'] = boolean
counts

In [None]:
# Number of entries
row_total = len(future_bs)
print("Number of rows = {}".format(row_total))

**'employer'**

In [None]:
# Distinct employer names, sorted case-insensitively.
# NOTE(review): this reads a lower-case 'employer' column, while the future_bs
# frame built in the matching cells has 'Company'/'Employer' columns -
# presumably this cell expects future_bs to be the raw file; confirm.
distinct_employers = future_bs['employer'].unique().tolist()
test = sorted(distinct_employers, key=str.lower)
len(test)

In [None]:
# Split the sorted names on the "<U+" marker: clean names stay in test2 (same
# order as test), marked names go to abnormal_names (collected back to front).
test2 = [name for name in test if "<U+" not in name]
abnormal_names = [name for name in reversed(test) if "<U+" in name]
counter = len(abnormal_names)
print(counter)

In [None]:
# Put the names containing "<U+" into a one-column table, write it to a CSV
# file without the index column, and preview the first rows.
pd_abnormal = pd.DataFrame(abnormal_names, columns = ['Unformatted Employer Name'])
pd_abnormal.to_csv("Corrupted company names (Indeed, 'employer').csv", index = False)
pd_abnormal.head()

**'company'**

In [None]:
# Distinct company names, sorted case-insensitively.
# NOTE(review): this reads a lower-case 'company' column, while the future_bs
# frame built in the matching cells has a 'Company' column - presumably this
# cell expects future_bs to be the raw file; confirm.
distinct_companies = future_bs['company'].unique().tolist()
test = sorted(distinct_companies, key=str.lower)
len(test)

In [None]:
# Split the sorted names on the "<U+" marker: clean names stay in test2 (same
# order as test), marked names go to abnormal_names (collected back to front).
test2 = [name for name in test if "<U+" not in name]
abnormal_names = [name for name in reversed(test) if "<U+" in name]
counter = len(abnormal_names)
print(counter)

In [None]:
# Put the names containing "<U+" into a one-column table, write it to a CSV
# file without the index column, and preview the first rows.
pd_abnormal = pd.DataFrame(abnormal_names, columns = ['Unformatted Company Name'])
pd_abnormal.to_csv("Corrupted company names (Indeed, 'company').csv", index = False)
pd_abnormal.head()

## 2. Sentiment Analysis by Sentence (Original code)

In [None]:
# Reviews of one company, renumbered from 0, without rows that have missing values
is_abn_amro = reviews_1.Company == 'ABN AMRO'
ABN_AMRO = reviews_1.loc[is_abn_amro].reset_index(drop=True).dropna().copy()

# Detected language code of every remaining review
ABN_AMRO['Language'] = ABN_AMRO.Review.map(detect)
ABN_AMRO.head()

### a) English reviews

In [None]:
# Keep the English reviews and score each one with VADER's compound value
analyzer = SentimentIntensityAnalyzer()
ABN_AMRO_en = ABN_AMRO.loc[ABN_AMRO.Language == 'en'].copy()
full_review = ' '.join(ABN_AMRO_en.Review.tolist())
ABN_AMRO_en['Compound'] = [
    analyzer.polarity_scores(review_text)['compound']
    for review_text in ABN_AMRO_en.Review
]

# Box plot of the compound scores
fig, ax = plt.subplots(figsize=(4, 6))
sns.set_style('whitegrid')
ax.set_title('ABN AMRO', size=15)
sns.boxplot(data=ABN_AMRO_en['Compound'], ax=ax, palette="Set3", linewidth=1)
sns.despine(left=True, bottom=True, ax=ax)
ax.set_xticklabels([])
ax.set_xlabel('Polarity')
ax.set_ylabel('Compound Score')
plt.tight_layout()

In [None]:
# Average
# Mean compound score of the English reviews.
# ABN_AMRO_en only receives a 'Compound' column in this notebook, so the
# former label slice starting at 'Negative' pointed at a column that does not
# exist; the value it was after (the mean of the last score column) is the
# mean of 'Compound'.
average = ABN_AMRO_en['Compound'].mean()
average

In [None]:
# Function
def polarity_average(company, random_state=None):
    """Return the mean VADER compound score of a company's English reviews.

    Parameters:
        company: value compared against ``reviews_1.Company`` to select the
            reviews.
        random_state: seed passed to ``DataFrame.sample`` when the company has
            1000 or more reviews. Defaults to None (unseeded) so that callers
            passing only the company name keep working.

    Returns:
        The arithmetic mean of the compound scores of all reviews detected as
        English, or NaN when no review is detected as English.
    """
    # Dataframe (all languages)
    df = reviews_1.loc[reviews_1.Company == company].reset_index(drop=True)

    # Sample the dataframe (cap at 1000)
    # NOTE(review): replace=True draws with replacement, so the 1000 rows can
    # contain duplicates - confirm this is intended for a simple cap.
    if df.shape[0] >= 1000:
        df = df.sample(n=1000, replace=True, random_state=random_state)

    # Rows with missing values are dropped after sampling, as before
    df = df.dropna()

    # VADER sentiment analysis: score each English review exactly once
    # (previously every review was scored four times, once per output column)
    analyzer = SentimentIntensityAnalyzer()
    compound_scores = []
    for review_text in df['Review']:
        try:
            language = detect(review_text)
        except Exception:
            # Language detection failed for this text; it is treated as
            # non-English. Exception (not a bare except) keeps Ctrl-C working
            # during long runs.
            continue
        if language == 'en':
            compound_scores.append(analyzer.polarity_scores(review_text)['compound'])

    if not compound_scores:
        return float('nan')
    return np.mean(compound_scores)

In [None]:
# Compute average polarity score for all companies
# Average polarity score of every company, in case-insensitive alphabetical order
company_names = sorted(reviews_1.Company.unique().tolist(), key=str.lower)
scores = []
for i in company_names:
    # polarity_average takes a random_state argument; passing None explicitly
    # leaves the sampling unseeded and avoids calling it with a missing argument.
    average = polarity_average(i, None)
    scores.append(average)
    print(i)
    print(average)
print("Polarity scores:\n")
print(scores)

In [None]:
# Start with the scores of the first pass, then add nine seeded passes
temp = [scores]
ordered_companies = sorted(reviews_1.Company.unique().tolist(), key=str.lower)

for j in range(1, 10):
    round_seed = j*100+1
    scores2 = [polarity_average(name, round_seed) for name in ordered_companies]
    temp.append(scores2)
    print('Round {} finished.'.format(j))

In [None]:
# Per-company mean across all rounds (zip(*temp) regroups the rounds by company)
per_company_rounds = zip(*temp)
average_10 = [sum(round_values)/len(round_values) for round_values in per_company_rounds]

In [None]:
# Build a two-column table (Company, Score) from two parallel lists and save it
# without the index column.
# NOTE(review): `companies` is not defined in the visible part of this
# notebook - presumably a frame with one row per company in the same
# alphabetical order as average_10; confirm.
average_df = pd.DataFrame([companies['Company'].tolist(),average_10], index = ['Company','Score']).transpose()
average_df.to_csv('Average Vader scores (alphabetical, no index).csv', index = False)

In [None]:
new_df = pd.DataFrame([companies['Company'].tolist(),average_10], index = ['Company','Score']).transpose()

# Number of rows each company has in reviews_1, in case-insensitive alphabetical order
number_of_reviews = [
    reviews_1.loc[reviews_1.Company == company_name].shape[0]
    for company_name in sorted(reviews_1.Company.unique().tolist(), key=str.lower)
]
new_df['Original number of reviews'] = number_of_reviews

# Save once without and once with the index column
new_df.to_csv('Average Vader scores (alphabetical, no index).csv', index = False)
new_df.to_csv('Average Vader scores (alphabetical).csv')
In [None]:
# Highest score first; the same ordering is written without and with the index
scores_descending = new_df.sort_values(by = 'Score', ascending = False)
scores_descending.to_csv('Average Vader scores (alphabetical, descending, no index).csv', index = False)
scores_descending.to_csv('Average Vader scores (alphabetical, descending).csv')

In [None]:
import matplotlib
from matplotlib import cm
# One coolwarm colour per averaged score
test_cm = list(map(cm.coolwarm, average_10))

def format_xticklabels(subplot_axes):
    """Put every word of each x tick label of ``subplot_axes`` on its own line.

    The current tick label texts are read, each space is turned into a line
    break, and the resulting list is set as the new x tick labels.
    """
    stacked_labels = [
        tick.get_text().replace(' ', '\n')
        for tick in subplot_axes.get_xticklabels()
    ]
    subplot_axes.set_xticklabels(stacked_labels)

fig, ax = plt.subplots(6,1,figsize = (50,30))
plt.suptitle('Scores of Company Reviews', size = 20, y =1.01)
sns.set_style('white')

# One bar chart per subplot: 37 companies each, the last subplot takes the rest
companies_per_row = 37
for row_number, axis in enumerate(ax):
    first = row_number * companies_per_row
    last = None if row_number == len(ax) - 1 else first + companies_per_row
    sns.barplot(x = new_df['Company'][first:last],
                y = new_df['Score'][first:last],
                ax = axis, palette = test_cm[first:last])

# Same limits, axis titles and stacked tick labels everywhere; each bar gets
# its height (rounded to 3 decimals) written just above it
for axis in ax:
    axis.set_ylim(-1,1)
    axis.set_xlabel('Company', size = 15)
    axis.set_ylabel('Score', size = 15)
    format_xticklabels(axis)
    for bar in axis.patches:
        bar_height = bar.get_height()
        bar_centre = bar.get_x()+bar.get_width()/2.
        axis.annotate(np.round(bar_height,decimals=3), (bar_centre, bar_height),
                      ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.tight_layout()

In [None]:
# Function
def polarity(company):
    """Return the VADER compound score of each English review of a company.

    Parameters:
        company: value compared against ``reviews_1.Company`` to select the
            reviews.

    Returns:
        A list with one compound score per review detected as English, in the
        order the reviews are processed; empty when there is none.
    """
    # Dataframe (all languages)
    df = reviews_1.loc[reviews_1.Company == company].reset_index(drop=True)

    # Sample the dataframe (cap at 1000)
    # NOTE(review): replace=True draws with replacement, so the 1000 rows can
    # contain duplicates - confirm this is intended for a simple cap.
    if df.shape[0] >= 1000:
        df = df.sample(n=1000, replace=True)

    # Rows with missing values are dropped after sampling, as before
    df = df.dropna()

    # VADER sentiment analysis: only the compound value is returned, so each
    # English review is scored once instead of four times
    analyzer = SentimentIntensityAnalyzer()
    compound_scores = []
    for review_text in df['Review']:
        try:
            language = detect(review_text)
        except Exception:
            # Language detection failed for this text; it is treated as
            # non-English. Exception (not a bare except) keeps Ctrl-C working
            # during long runs.
            continue
        if language == 'en':
            compound_scores.append(analyzer.polarity_scores(review_text)['compound'])

    return compound_scores

In [None]:
# Compute average polarity score for all companies
# Individual compound scores of every company, in case-insensitive alphabetical order
company_names = sorted(reviews_1.Company.unique().tolist(), key=str.lower)
boxplot_score = []
for company_name in company_names:
    boxplot_score.append(polarity(company_name))
    print(company_name)

In [None]:
# One column of individual scores per company (companies were the rows before transposing)
alphabetical_companies = sorted(reviews_1.Company.unique().tolist(), key=str.lower)
boxplot_data = pd.DataFrame(boxplot_score, index = alphabetical_companies).transpose()

fig, ax = plt.subplots(6,1,figsize = (50,50))
plt.suptitle('Distribution of Individual Scores of Company Reviews', size = 20, y =1.01)
sns.set_style('white')

# One box plot per subplot: 37 companies each, the last subplot takes the rest
companies_per_row = 37
for row_number, axis in enumerate(ax):
    first = row_number * companies_per_row
    last = None if row_number == len(ax) - 1 else first + companies_per_row
    sns.boxplot(data = boxplot_data.iloc[:,first:last], ax = axis, palette = 'Set3')

# Same limits, axis titles and stacked tick labels everywhere; every patch gets
# its height (rounded to 3 decimals) written just above it
for axis in ax:
    axis.set_ylim(-1.1,1.1)
    axis.set_xlabel('Company', size = 15)
    axis.set_ylabel('Score', size = 15)
    format_xticklabels(axis)
    for patch in axis.patches:
        patch_height = patch.get_height()
        patch_centre = patch.get_x()+patch.get_width()/2.
        axis.annotate(np.round(patch_height,decimals=3), (patch_centre, patch_height),
                      ha='center', va='center', xytext=(0, 10), textcoords='offset points')

plt.tight_layout()

In [None]:
# Save the per-review scores twice: without and with the index column
boxplot_data.to_csv('Reviews polarities (capped at 1000 reviews) (No index).csv', index = False)
boxplot_data.to_csv('Reviews polarities (capped at 1000 reviews).csv')