In [1]:
import pandas as pd
import re,string
import nltk
from patsy import dmatrices
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
import matplotlib.pyplot as plt
import warnings
%pylab inline
import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances

Populating the interactive namespace from numpy and matplotlib


In [2]:
apple_data = pd.read_csv('apples_reviews.csv')

In [3]:
apple_data.head()

Unnamed: 0,Title,Date Written,Rating,Current/Former,Job Title,Location,Recommendation?,Outlook,Main Text,Pros,Cons,Advice to management
0,"""Product Specialist""","Apr 23, 2018",5.0,Current Employee,Product Specialist,"Louisville, KY",Recommends,Positive Outlook,I have been working at Apple part-time (Less t...,I love working for Apple! The pay and benefits...,I don't have anything negative to say about ap...,
1,"""Challenging, Rewarding, but ZERO work/life ba...","Aug 28, 2014",4.0,Current Employee,Anonymous Employee,,Recommends,Positive Outlook,Show More,"We work with geniuses - in every department, W...",ZERO ZERO ZERO work/life balance. Execs have b...,
2,"""A Company as Meticulous as Their Products!""","Mar 10, 2017",5.0,Current Employee,Specialist,"Tempe, AZ",Recommends,Positive Outlook,I have been working at Apple full-time (More t...,"Competitive Pay, Great Benefits, Amazing Peopl...",None! I had one of the best times ever at this...,You guys are awesome. Keep up the good work!
3,"""At-Home-Advisor""","May 13, 2015",5.0,Current Employee,Apple At Home Advisor,"Lakewood, CO",,,Show More,The position is at home. The company is AMAZIN...,The schedule changes every 90 days and you hav...,The management consists of a team manager and ...
4,"""Changing the world (in a small way)""","Jan 22, 2016",5.0,Current Employee,Software Engineer IV,"Cupertino, CA",Recommends,Positive Outlook,I have been working at Apple full-time (More t...,Working at Apple means that the things you wor...,"It's a lot of work, teams are usually strapped...","Keep fighting for ""what's right"". Renewable en..."


### Step 1: pros/cons data pre-processing

In [4]:
#clean punctuation and white spaces
apple_data["pros_clean"] = apple_data.Pros.apply(lambda x:re.sub(r'[^\w\s]', ' ', x.lower()))
apple_data["pros_clean"] = apple_data.pros_clean.apply(lambda x:re.sub(r'(\\n+)', ' ', x))
apple_data["pros_clean"] = apple_data.pros_clean.apply(lambda x:re.sub(r'^b[\'\"\'"\"''\s]', '', x.lower()))
apple_data["pros_clean"] = apple_data.pros_clean.apply(lambda x:re.sub(r'\s+', ' ', x))

apple_data["cons_clean"] = apple_data.Cons.apply(lambda x:re.sub(r'[^\w\s]', ' ', x.lower()))
apple_data["cons_clean"] = apple_data.cons_clean.apply(lambda x:re.sub(r'(\\n+)', ' ', x))
apple_data["cons_clean"] = apple_data.cons_clean.apply(lambda x:re.sub(r'^b[\'\"\'"\"''\s]', '', x.lower()))
apple_data["cons_clean"] = apple_data.cons_clean.apply(lambda x:re.sub(r'\s+', ' ', x))

#tokenizing and removing stop words
stop = set(stopwords.words('english'))
punc = string.punctuation
apple_data['pros_clean'] = apple_data.pros_clean.apply(lambda x: [word for word in word_tokenize(x) if word not in stop])
apple_data['pros_clean'] = apple_data['pros_clean'].apply(lambda x: [word for word in x if word not in punc])
apple_data['cons_clean'] = apple_data.cons_clean.apply(lambda x: [word for word in word_tokenize(x) if word not in stop])
apple_data['cons_clean'] = apple_data['cons_clean'].apply(lambda x: [word for word in x if word not in punc])

apple_data.head()

Unnamed: 0,Title,Date Written,Rating,Current/Former,Job Title,Location,Recommendation?,Outlook,Main Text,Pros,Cons,Advice to management,pros_clean,cons_clean
0,"""Product Specialist""","Apr 23, 2018",5.0,Current Employee,Product Specialist,"Louisville, KY",Recommends,Positive Outlook,I have been working at Apple part-time (Less t...,I love working for Apple! The pay and benefits...,I don't have anything negative to say about ap...,,"[love, working, apple, pay, benefits, great, m...","[anything, negative, say, apple, except, lot, ..."
1,"""Challenging, Rewarding, but ZERO work/life ba...","Aug 28, 2014",4.0,Current Employee,Anonymous Employee,,Recommends,Positive Outlook,Show More,"We work with geniuses - in every department, W...",ZERO ZERO ZERO work/life balance. Execs have b...,,"[work, geniuses, every, department, create, in...","[zero, zero, zero, work, life, balance, execs,..."
2,"""A Company as Meticulous as Their Products!""","Mar 10, 2017",5.0,Current Employee,Specialist,"Tempe, AZ",Recommends,Positive Outlook,I have been working at Apple full-time (More t...,"Competitive Pay, Great Benefits, Amazing Peopl...",None! I had one of the best times ever at this...,You guys are awesome. Keep up the good work!,"[competitive, pay, great, benefits, amazing, p...","[none, one, best, times, ever, company, hopefu..."
3,"""At-Home-Advisor""","May 13, 2015",5.0,Current Employee,Apple At Home Advisor,"Lakewood, CO",,,Show More,The position is at home. The company is AMAZIN...,The schedule changes every 90 days and you hav...,The management consists of a team manager and ...,"[position, home, company, amazing, limitless, ...","[schedule, changes, every, 90, days, bid, shif..."
4,"""Changing the world (in a small way)""","Jan 22, 2016",5.0,Current Employee,Software Engineer IV,"Cupertino, CA",Recommends,Positive Outlook,I have been working at Apple full-time (More t...,Working at Apple means that the things you wor...,"It's a lot of work, teams are usually strapped...","Keep fighting for ""what's right"". Renewable en...","[working, apple, means, things, work, get, han...","[lot, work, teams, usually, strapped, resource..."


### Step 2: get pros/cons attributes

In [8]:
#get POS
get_pros = apple_data['pros_clean'].apply(lambda x: [word for word in nltk.pos_tag(x)])
get_cons = apple_data['cons_clean'].apply(lambda x: [word for word in nltk.pos_tag(x)])

pro_adj = []
con_adj = []

def get_adj(empty_list,pos_data):
    for review in pos_data:
        for word,pos in review:
            if pos == 'JJ' or pos == 'JJR' or pos == 'JJS': # if the POS-tag is adjective
                empty_list.append(word)
get_adj(pro_adj,get_pros)
get_adj(con_adj,get_cons)

In [6]:
from collections import Counter
Counter(pro_adj).most_common(30)

[('great', 6348),
 ('good', 2694),
 ('best', 760),
 ('retail', 757),
 ('nice', 437),
 ('excellent', 433),
 ('awesome', 420),
 ('new', 411),
 ('flexible', 380),
 ('many', 343),
 ('smart', 323),
 ('amazing', 299),
 ('decent', 272),
 ('high', 256),
 ('friendly', 231),
 ('full', 227),
 ('easy', 226),
 ('fantastic', 225),
 ('positive', 218),
 ('salary', 211),
 ('co', 194),
 ('free', 187),
 ('different', 180),
 ('cool', 174),
 ('supportive', 169),
 ('personal', 167),
 ('competitive', 167),
 ('incredible', 162),
 ('corporate', 159),
 ('wonderful', 156)]

In [9]:
Counter(con_adj).most_common(30)

[('retail', 1824),
 ('hard', 800),
 ('much', 665),
 ('difficult', 654),
 ('many', 645),
 ('long', 574),
 ('little', 546),
 ('good', 497),
 ('high', 489),
 ('great', 487),
 ('corporate', 418),
 ('busy', 367),
 ('bad', 355),
 ('low', 323),
 ('new', 319),
 ('stressful', 307),
 ('full', 302),
 ('poor', 300),
 ('best', 216),
 ('different', 214),
 ('big', 200),
 ('personal', 167),
 ('tough', 165),
 ('salary', 163),
 ('competitive', 151),
 ('due', 144),
 ('enough', 142),
 ('political', 140),
 ('real', 136),
 ('terrible', 135)]

In [10]:
#manually took some attributes from the top frequent pros words
pros_attribute = ['great','good','happy','nice','decent','excellent','best','ethical','strong','flexible','new',
                  'easy','friendly','positive','different','professional','high','solid','corporate',
                  'smart','stable','large']

In [11]:
#manually took some attributes from the top frequent cons words
cons_attribute = ['low','little','hard','difficult','long','poor','limited','bad','slow','terrible','conservative'
                 ,'horrible','different','less','senior','political','bureaucratic']

### Step 3: lemmentize -> get replacement

In [18]:
replacement = {'work_life_balance':['time','life','balance','sabbatical','sabbaticals','focus','hour','day','health','flexible','week'
                                    ,'vacation','schedule','overtime'],
                'culture_value':['people','culture','team','care','value','product','coworkers','atmosphere','competitive'
                                ,'family','collaboration','respect','community','colleague','supportive','vision','diversity'],
                'career_oppotunity':['opportunity','learn','industry','career','license','training','train','growth','grow'
                                    ,'level','position','development','advancement','advance','study','build','skill','resource'
                                    ,'education','potential'],
                'company_benefit':['company','benefit','pay','financial','financially','provide','salary','bonus','offer'
                                  ,'401k','package','stock','compensation','invest','investment','money','performance','reward'
                                  ,'retirement','promote','insurance'],
                'senior_management':['place','environment','management','help','manager','experience','match'
                                                ,'plan','office','support','location','leadership','treat','helpful','senior'
                                                ,'manage','leader','communication']}

In [19]:
def getKeysByValue(dictOfElements, valueToFind):
    for k,v  in dictOfElements.items():
        if valueToFind in v:
            return(k)
    return  valueToFind

def replace_attributes(s):
    return([getKeysByValue(replacement,y) for y in s])

In [20]:
apple_data['pros_replace'] = apple_data['pros_clean'].map(replace_attributes)
apple_data['cons_replace'] = apple_data['cons_clean'].map(replace_attributes)

### Step 4: Lift Score

In [21]:
def ratio(x,y):
    if x==0:
        return float(y)
    if y==0:
        return float(x)
    return(float(x)*float(y))

def get_lift(a,b,tokenized_data):
    '''Function to calculate lift scores given any two words from a list of tokenized words'''
    if (a==b):
        return 1
    p_a = len([i for i in tokenized_data if a in i])
    p_b = len([i for i in tokenized_data if (b in i)])
    p_a_b = len([i for i in tokenized_data if a in i if b in i])
    n = len(tokenized_data)
    return float(float(n)*float(p_a_b)/ratio(p_a,p_b))

In [23]:
pros_list = replacement.keys()
lift_score = [get_lift(x,y,apple_data.pros_replace) for x in pros_attribute for y in pros_list]
formatted_lift_score = [ round(elem,2) for elem in lift_score ]

In [24]:
pd.DataFrame(reshape(formatted_lift_score,(len(pros_attribute),len(pros_list))),index =pros_attribute , columns = pros_list )

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
great,1.2,1.19,1.21,1.09,1.13
good,0.95,1.49,1.07,1.14,1.05
happy,1.41,1.02,1.65,1.46,0.7
nice,1.13,1.12,1.29,0.98,0.71
decent,0.88,2.29,0.94,1.42,1.01
excellent,1.12,1.41,1.46,1.27,2.0
best,1.22,1.18,1.31,1.21,1.34
ethical,1.12,2.91,0.8,0.0,1.87
strong,1.44,1.06,1.43,1.63,1.26
flexible,0.0,0.0,0.0,0.0,0.0


In [25]:
cons_list = replacement.keys()
lift_score = [get_lift(x,y,apple_data.cons_replace) for x in cons_attribute for y in cons_list]
formatted_lift_score = [ round(elem,2) for elem in lift_score ]

In [26]:
pd.DataFrame(reshape(formatted_lift_score,(len(cons_attribute),len(cons_list))),index =cons_attribute , columns = cons_list )

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
low,0.87,3.77,1.25,0.97,1.56
little,1.39,1.47,1.36,1.24,2.01
hard,1.29,1.21,1.05,1.27,1.32
difficult,1.19,1.22,1.15,1.41,1.59
long,1.09,0.95,0.86,1.24,0.94
poor,1.26,1.69,2.23,1.69,1.62
limited,0.77,1.14,0.97,1.0,3.18
bad,1.15,1.42,1.27,1.06,0.98
slow,0.82,1.31,0.9,1.24,2.55
terrible,1.45,1.4,1.87,1.63,1.26


### Lift for mission Statement

    for Pros

In [30]:
apple_values = ['avoid complexity', 'believe', 'selectivity', 'collaboration', 'simplicity', 'courage','products','make']

In [31]:
mission_lift = [get_lift(x,y,apple_data.pros_replace) for x in apple_values for y in pros_list]
formatted_lift_score = [ round(elem,2) for elem in mission_lift ]

In [32]:
pd.DataFrame(reshape(formatted_lift_score,(len(apple_values),len(pros_list))),index =apple_values , columns = pros_list )

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
avoid complexity,0.0,0.0,0.0,0.0,0.0
believe,1.34,1.49,1.07,1.14,1.44
selectivity,0.0,0.0,0.0,0.0,0.0
collaboration,0.0,0.0,0.0,0.0,0.0
simplicity,0.0,2.91,0.0,5.69,0.0
courage,0.0,0.0,0.0,0.0,0.0
products,1.19,0.93,0.99,0.96,1.06
make,1.47,1.04,1.41,1.38,1.43


    for Cons

In [33]:
mission_lift = [get_lift(x,y,apple_data.cons_replace) for x in apple_values for y in cons_list]
formatted_lift_score = [ round(elem,2) for elem in mission_lift ]

In [34]:
pd.DataFrame(reshape(formatted_lift_score,(len(apple_values),len(cons_list))),index =apple_values , columns = cons_list )

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
avoid complexity,0.0,0.0,0.0,0.0,0.0
believe,1.99,2.73,1.17,1.36,1.51
selectivity,0.0,0.0,0.0,0.0,0.0
collaboration,0.0,0.0,0.0,0.0,0.0
simplicity,0.0,0.0,0.0,0.0,0.0
courage,0.0,0.0,0.0,0.0,0.0
products,1.67,1.43,1.05,1.26,1.29
make,1.51,1.93,1.56,1.38,1.38
