In [1]:
# Dependencies
import os
import string

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

import pandas as pd
from matplotlib import pylab
import matplotlib.pyplot as plt

from collections import Counter

from wordcloud import WordCloud
from Resources.CustomFreqDist import CustomFreqDist

In [2]:
# Load the raw employee review data from the Resources folder.
reviews_path = os.path.join('.','Resources','employee_reviews.csv')
csv_data = pd.read_csv(reviews_path)
df = pd.DataFrame(csv_data)

# Slice out one dataframe per company that is analysed below.
company_col = df['company']
google_df = df[company_col.eq('google')]
microsoft_df = df[company_col.eq('microsoft')]
amazon_df = df[company_col.eq('amazon')]
apple_df = df[company_col.eq('apple')]
facebook_df = df[company_col.eq('facebook')]

In [3]:
# Sanity check: compare the rows captured by the five company frames with the
# total row count (per the original note, all missing rows are for Netflix).
microsoft, amazon, apple, facebook, google = (
    frame['company'].count()
    for frame in (microsoft_df, amazon_df, apple_df, facebook_df, google_df)
)
print(sum((microsoft, amazon, apple, facebook, google)))
print(df['company'].count())

66719
67529


In [4]:
# Build the English stop-word set, plus an upper-cased copy of every entry so
# it can be compared against the upper-cased review tokens.
stop_words = set(stopwords.words("english"))
upper_table = str.maketrans(string.ascii_lowercase, string.ascii_uppercase)
stopcaps = [word.translate(upper_table) for word in stop_words]

In [8]:
# Define function for splitting reviews to text file.
def getreviews(import_df, company_name, prosorcons):
    """Write one review column of a company dataframe to a text file.

    Each non-missing value of the ``prosorcons`` column is written on its own
    line to ``Resources/text_<prosorcons>/<company_name>_<prosorcons>.txt``,
    overwriting any existing file.

    Args:
        import_df: DataFrame holding the reviews; must contain a column named
            after ``prosorcons``.
        company_name: Name used as the file-name prefix.
        prosorcons: ``'pros'`` or ``'cons'``. Any other value is ignored and
            nothing is written (same as the previous behaviour).

    Returns:
        None.
    """
    if prosorcons not in ('pros', 'cons'):
        return

    out_path = f'Resources/text_{prosorcons}/{company_name}_{prosorcons}.txt'
    # open() does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    with open(out_path, 'w') as text:
        # Only the one column is needed; missing reviews are skipped because
        # they cannot be written as text.
        for review in import_df[prosorcons].dropna():
            text.write(f'{review}\n')
        
# Define function to clean text.
def cleantext(company_name, prosorcons):
    """Tokenise a company's exported review text and count the remaining words.

    Reads ``Resources/text_<prosorcons>/<company_name>_<prosorcons>.txt``,
    replaces ASCII punctuation and digits with spaces, upper-cases ASCII
    letters, tokenises with ``nltk.word_tokenize`` and drops every token found
    in the module-level ``stopcaps`` collection.

    Args:
        company_name: File-name prefix of the exported review text.
        prosorcons: ``'pros'`` or ``'cons'``. Any other value is ignored.

    Returns:
        A ``CustomFreqDist`` built from the filtered tokens, or ``None`` when
        ``prosorcons`` is neither ``'pros'`` nor ``'cons'``.
    """
    if prosorcons not in ('pros', 'cons'):
        return None

    # One table covers all three substitutions. The character groups are
    # disjoint and none of the replacements is itself remapped, so a single
    # pass gives the same text as three sequential translate() calls.
    to_blank = string.punctuation + string.digits
    clean_table = str.maketrans(
        to_blank + string.ascii_lowercase,
        ' ' * len(to_blank) + string.ascii_uppercase,
    )

    with open(f'Resources/text_{prosorcons}/{company_name}_{prosorcons}.txt', 'r') as source:
        cleaned = source.read().translate(clean_table)

    tokens = nltk.word_tokenize(cleaned)

    # A set gives constant-time membership tests; scanning the stop-word list
    # once per token is needlessly slow on large review files.
    blocked = set(stopcaps)
    kept = [token for token in tokens if token not in blocked]

    return CustomFreqDist(kept)

# Define wordcloud function
def show_wc(company_name, prosorcons, title = None):
    """Draw a word cloud from a company's exported review text.

    Reads ``Resources/text_<prosorcons>/<company_name>_<prosorcons>.txt`` and
    renders the cloud onto pyplot figure number 1 with the axes hidden.

    Args:
        company_name: File-name prefix of the exported review text.
        prosorcons: ``'pros'`` or ``'cons'``. Any other value is ignored.
        title: Optional figure title; when given it is added as a suptitle.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can chain
        ``savefig``/``clf`` on the current figure, or ``None`` when
        ``prosorcons`` is neither ``'pros'`` nor ``'cons'``.
    """
    if prosorcons not in ('pros', 'cons'):
        return None

    # Read the text first so the file is closed before any rendering happens.
    with open(f'Resources/text_{prosorcons}/{company_name}_{prosorcons}.txt', 'r') as source:
        review_text = source.read()

    # random_state is fixed so repeated runs produce the same layout.
    wordcloud = WordCloud(
        background_color='white',
        max_words=200,
        max_font_size=50,
        scale=3,
        random_state=1,
    ).generate(review_text)

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        # NOTE(review): top=2.3 is outside the usual 0-1 figure fraction;
        # kept as-is, confirm this placement is intended.
        fig.subplots_adjust(top=2.3)
    plt.imshow(wordcloud)
    return plt

In [9]:
# Export the pros, then the cons, of every company dataframe to text files.
company_frames = {
    'google': google_df,
    'microsoft': microsoft_df,
    'amazon': amazon_df,
    'apple': apple_df,
    'facebook': facebook_df,
}
for section in ('pros', 'cons'):
    for company, frame in company_frames.items():
        getreviews(frame, company, section)

In [10]:
# Clean each exported review file and keep the resulting frequency
# distribution object per company and section.
distribution_order = ('google', 'microsoft', 'amazon', 'facebook', 'apple')

google_pros, microsoft_pros, amazon_pros, facebook_pros, apple_pros = (
    cleantext(company, 'pros') for company in distribution_order
)
google_cons, microsoft_cons, amazon_cons, facebook_cons, apple_cons = (
    cleantext(company, 'cons') for company in distribution_order
)

In [11]:
# Plots word distributions and exports images - PROS, then CONS.
# Every chart follows the same plot -> save -> clear steps, so the companies
# are driven from a table instead of ten copy-pasted stanzas.
TOP_WORDS = 30  # number of most frequent words passed to custom_plot
CHART_DIR = "Images"
os.makedirs(CHART_DIR, exist_ok=True)  # savefig does not create missing folders

freq_dists = {
    'Pros': {
        'Microsoft': microsoft_pros,
        'Google': google_pros,
        'Apple': apple_pros,
        'Amazon': amazon_pros,
        'Facebook': facebook_pros,
    },
    'Cons': {
        'Microsoft': microsoft_cons,
        'Google': google_cons,
        'Apple': apple_cons,
        'Amazon': amazon_cons,
        'Facebook': facebook_cons,
    },
}

for section, dists in freq_dists.items():
    for company, dist in dists.items():
        chart = dist.custom_plot(TOP_WORDS, title=f'{company} - {section}')
        chart.savefig(
            f"{CHART_DIR}/{company.lower()}_{section.lower()}_chart.png",
            bbox_inches="tight",
        )
        # Clear the figure so the next chart starts from an empty canvas.
        chart.clf()

<Figure size 432x288 with 0 Axes>

In [12]:
# Plots word clouds - PROS, then CONS.
# The frequency charts in this notebook are saved under "Images/"; the clouds
# previously used "images/", which is a different folder on case-sensitive
# filesystems. Both now share the same folder name.
CLOUD_DIR = "Images"
os.makedirs(CLOUD_DIR, exist_ok=True)  # savefig does not create missing folders

for section in ('pros', 'cons'):
    for company in ('microsoft', 'google', 'amazon', 'apple', 'facebook'):
        cloud = show_wc(company, section)
        cloud.savefig(f"{CLOUD_DIR}/{company}_{section}_cloud.png")
        # show_wc always draws on figure 1, so clear it before the next cloud.
        cloud.clf()

<Figure size 864x864 with 0 Axes>