In [47]:
import pandas as pd
import re,string
import nltk
from patsy import dmatrices
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
import matplotlib.pyplot as plt
import warnings
%pylab inline
import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances

Populating the interactive namespace from numpy and matplotlib


In [54]:
# Load the review export; the path is relative to the notebook's working directory.
# NOTE(review): the preview output shows an "Unnamed: 0" column, which usually means the
# CSV was written together with its index - consider index_col=0 if that column is not
# needed. TODO confirm against the file.
amazon_data = pd.read_csv('amazon.csv')

In [49]:
# Preview the first rows of the loaded frame.
amazon_data.head()

Unnamed: 0,Title,Date Written,Rating,Current/Former,Job Title,Location,Recommendation?,Outlook,Main Text,Pros,Cons,Advice to management
0,"""Account Manager""","Sep 9, 2018",3.0,Former Employee,Anonymous Employee,,Recommends,,- Fun environment and excellent team,- Proactive scenario,Almost none opportunities to grow or to move t...,
1,"""All the ""bad"" things you heard about Amazon i...","Sep 9, 2018",1.0,Former Employee,Software Developer,"Seattle, WA",Doesn't Recommend,Negative Outlook,I worked at Amazon full-time (More than a year),You get to work a LOT of hours.,Work life balance is extremely bad. Work here ...,Hopeless
2,"""Amazon is giving hard time to the employees""","Sep 9, 2018",1.0,Current Employee,Fruad Prevention,Hyderabad (India),Doesn't Recommend,Negative Outlook,Becoming worst day by day,All I can say is Amazon has been changed drast...,Working at Amazon is the con,
3,"""Amazon Warehouse Operative""","Sep 9, 2018",1.0,Current Employee,Anonymous Employee,,Doesn't Recommend,Negative Outlook,I have been working at Amazon full-time,Weekly pay was a plus,"Treated as a number, long hours and no flexibi...",Treat staff equal
4,"""Amazon was the easiest job and I was surround...","Sep 9, 2018",5.0,Former Employee,Associate,"Essex, MD",Recommends,Positive Outlook,I worked at Amazon full-time (Less than a year),"Fun, easy, positive energy, surrounded by cool...",Long hours but youâll get used to it,


# Pros


### Step 1: pros/cons data pre-processing

In [59]:
#clean punctuation and white spaces
def _clean_text(raw):
    """Lower-case one review field and reduce it to words separated by single spaces.

    Missing values (NaN/None) are returned as an empty string so that they do not
    end up as the literal token 'nan' after tokenizing.
    """
    if pd.isnull(raw):
        return ''
    text = str(raw).lower()
    # Literal backslash-n sequences have to go before punctuation is stripped:
    # afterwards the backslash is already gone and a stray 'n' token would remain.
    text = re.sub(r'(\\n+)', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    # Drop a leading "b" followed by a quote or whitespace
    # (presumably the remains of a bytes repr such as b'...' - TODO confirm).
    text = re.sub(r'''^b['"\s]''', '', text)
    return re.sub(r'\s+', ' ', text)


#tokenizing and removing stop words
stop = set(stopwords.words('english'))
punc = string.punctuation
# Set of single punctuation characters; testing `word in punc` against the string
# itself would be a substring test rather than a membership test.
_punc_chars = set(punc)


def _tokenize(text):
    """Split cleaned text into tokens, dropping English stop words and punctuation marks."""
    return [word for word in word_tokenize(text)
            if word not in stop and word not in _punc_chars]


# The same pipeline is applied to both free-text columns.
amazon_data["pros_clean"] = amazon_data.Pros.apply(_clean_text).apply(_tokenize)
amazon_data["cons_clean"] = amazon_data.Cons.apply(_clean_text).apply(_tokenize)

amazon_data.head()

Unnamed: 0,Title,Date Written,Rating,Current/Former,Job Title,Location,Recommendation?,Outlook,Main Text,Pros,Cons,Advice to management,pros_clean,cons_clean
0,"""Account Manager""","Sep 9, 2018",3.0,Former Employee,Anonymous Employee,,Recommends,,- Fun environment and excellent team,proactive scenario,Almost none opportunities to grow or to move t...,,"[proactive, scenario]","[almost, none, opportunities, grow, move, anot..."
1,"""All the ""bad"" things you heard about Amazon i...","Sep 9, 2018",1.0,Former Employee,Software Developer,"Seattle, WA",Doesn't Recommend,Negative Outlook,I worked at Amazon full-time (More than a year),you get work lot hours,Work life balance is extremely bad. Work here ...,Hopeless,"[get, work, lot, hours]","[work, life, balance, extremely, bad, work, pl..."
2,"""Amazon is giving hard time to the employees""","Sep 9, 2018",1.0,Current Employee,Fruad Prevention,Hyderabad (India),Doesn't Recommend,Negative Outlook,Becoming worst day by day,all can say amazon has been changed drasticall...,Working at Amazon is the con,,"[say, amazon, changed, drastically, trying, im...","[working, amazon, con]"
3,"""Amazon Warehouse Operative""","Sep 9, 2018",1.0,Current Employee,Anonymous Employee,,Doesn't Recommend,Negative Outlook,I have been working at Amazon full-time,weekly pay was plus,"Treated as a number, long hours and no flexibi...",Treat staff equal,"[weekly, pay, plus]","[treated, number, long, hours, flexibility, work]"
4,"""Amazon was the easiest job and I was surround...","Sep 9, 2018",5.0,Former Employee,Associate,"Essex, MD",Recommends,Positive Outlook,I worked at Amazon full-time (Less than a year),fun easy positive energy surrounded cool people,Long hours but you’ll get used to it,,"[fun, easy, positive, energy, surrounded, cool...","[long, hours, get, used]"


### Step 2: get pros/cons attributes

In [60]:
# Hand-picked descriptive words, chosen from the most frequent tokens in the pros text.
# NOTE(review): 'flexible' is also listed under replacement['work_life_balance'], so once
# the tokens are mapped to category names it no longer occurs and its lift row is all
# zeros - confirm whether it should stay in both places.
pros_attribute = [
    'great', 'good', 'happy', 'nice', 'decent', 'excellent',
    'best', 'ethical', 'strong', 'flexible', 'new',
    'easy', 'friendly', 'positive', 'different', 'professional',
    'high', 'solid', 'corporate',
    'smart', 'stable', 'large',
]

In [61]:
# Hand-picked descriptive words, chosen from the most frequent tokens in the cons text.
# NOTE(review): 'senior' is also listed under replacement['senior_management'], so after
# the tokens are mapped to category names it can no longer be found - confirm intent.
cons_attribute = [
    'low', 'little', 'hard', 'difficult', 'long', 'poor',
    'limited', 'bad', 'slow', 'terrible', 'conservative',
    'horrible', 'different', 'less', 'senior', 'political', 'bureaucratic',
]

### Step 3: lemmatize -> get replacement

In [62]:
# Category name -> keywords that are collapsed into that category.
# Key order is kept as written because it determines the column order of the lift tables.
replacement = {
    'work_life_balance': [
        'time', 'life', 'balance', 'sabbatical', 'sabbaticals', 'focus', 'hour',
        'day', 'health', 'flexible', 'week', 'vacation', 'schedule', 'overtime',
    ],
    'culture_value': [
        'people', 'culture', 'team', 'care', 'value', 'product', 'coworkers',
        'atmosphere', 'competitive', 'family', 'collaboration', 'respect',
        'community', 'colleague', 'supportive', 'vision', 'diversity',
    ],
    'career_oppotunity': [
        'opportunity', 'learn', 'industry', 'career', 'license', 'training',
        'train', 'growth', 'grow', 'level', 'position', 'development',
        'advancement', 'advance', 'study', 'build', 'skill', 'resource',
        'education', 'potential',
    ],
    'company_benefit': [
        'company', 'benefit', 'pay', 'financial', 'financially', 'provide',
        'salary', 'bonus', 'offer', '401k', 'package', 'stock', 'compensation',
        'invest', 'investment', 'money', 'performance', 'reward', 'retirement',
        'promote', 'insurance',
    ],
    'senior_management': [
        'place', 'environment', 'management', 'help', 'manager', 'experience',
        'match', 'plan', 'office', 'support', 'location', 'leadership', 'treat',
        'helpful', 'senior', 'manage', 'leader', 'communication',
    ],
}

In [63]:
def getKeysByValue(dictOfElements, valueToFind):
    """Return the first key of `dictOfElements` whose value contains `valueToFind`.

    Keys are visited in the dict's iteration order. When no value contains
    `valueToFind`, `valueToFind` itself is returned unchanged.
    """
    matching_keys = (
        key for key, members in dictOfElements.items() if valueToFind in members
    )
    return next(matching_keys, valueToFind)

def replace_attributes(s):
    """Map every token in `s` to its category name from the module-level `replacement` dict.

    Tokens that belong to no category are kept as they are. A new list is returned.
    """
    mapped = []
    for token in s:
        mapped.append(getKeysByValue(replacement, token))
    return mapped

In [64]:
# Replace category keywords in the token lists with their category names.
# NOTE(review): no lemmatization call is visible between tokenizing and this mapping, so
# plural tokens such as 'hours' will not match singular keywords such as 'hour' -
# confirm whether a lemmatization cell is missing.
for _source, _target in (('pros_clean', 'pros_replace'), ('cons_clean', 'cons_replace')):
    amazon_data[_target] = amazon_data[_source].map(replace_attributes)

### Step 4: Lift Score

In [65]:
def ratio(x,y):
    """Return float(x)*float(y), except that a zero operand is skipped.

    If `x` is 0 the result is float(y); otherwise if `y` is 0 the result is
    float(x). Both zero therefore yields 0.0.
    """
    if x != 0 and y != 0:
        return float(x) * float(y)
    return float(y) if x == 0 else float(x)

def _term_in_tokens(term, tokens):
    """Return True when `term` occurs in `tokens`, either as one element or, for a
    term made of several whitespace-separated words, as a contiguous run of elements."""
    if term in tokens:
        return True
    words = term.split() if isinstance(term, str) else []
    width = len(words)
    if width < 2:
        return False
    token_list = list(tokens)
    return any(token_list[start:start + width] == words
               for start in range(len(token_list) - width + 1))


def get_lift(a,b,tokenized_data):
    '''Calculate the lift score of two terms over a collection of token lists.

    lift = n * count(a and b) / (count(a) * count(b)), where n is the number of
    token lists and each count is the number of lists containing the term(s).

    a, b: terms to look for. A multi-word term such as 'think big' matches when
        its words appear next to each other in a token list.
    tokenized_data: sized iterable of token lists (for example a pandas Series).

    Returns 1.0 when a == b, and 0.0 when either term never occurs (this also
    covers the case where both are absent, which used to divide by zero).
    '''
    if a == b:
        return 1.0
    count_a = 0
    count_b = 0
    count_both = 0
    # Single pass over the data instead of one scan per count.
    for tokens in tokenized_data:
        has_a = _term_in_tokens(a, tokens)
        has_b = _term_in_tokens(b, tokens)
        if has_a:
            count_a += 1
        if has_b:
            count_b += 1
        if has_a and has_b:
            count_both += 1
    if count_a == 0 or count_b == 0:
        # No co-occurrence is possible, so report zero lift rather than dividing by zero.
        return 0.0
    n = len(tokenized_data)
    return float(n) * float(count_both) / (float(count_a) * float(count_b))

In [67]:
# Category labels are the keys of `replacement`, materialised as a list so that the
# order is a fixed snapshot reused for the table columns (dict.keys() is a live view
# on Python 3).
pros_list = list(replacement.keys())
# One score per (attribute, category) pair, attribute-major, so the flat list can be
# reshaped row by row.
lift_score = [get_lift(attribute, category, amazon_data.pros_replace)
              for attribute in pros_attribute
              for category in pros_list]
formatted_lift_score = [round(score, 2) for score in lift_score]

In [68]:
# Lay the flat score list out as an (attribute x category) table. np.reshape is called
# explicitly because the bare `reshape` name only exists through `%pylab inline`.
pd.DataFrame(
    np.reshape(formatted_lift_score, (len(pros_attribute), len(pros_list))),
    index=pros_attribute,
    columns=list(pros_list),
)

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
great,1.34,1.28,1.39,1.03,1.22
good,1.03,1.46,1.19,1.04,0.92
happy,1.49,1.01,1.46,1.67,1.17
nice,1.54,1.16,1.51,1.1,0.73
decent,0.74,2.51,0.66,1.64,0.73
excellent,1.35,1.21,1.54,1.06,1.31
best,1.26,1.0,1.62,0.86,1.43
ethical,1.17,2.04,0.6,1.75,1.53
strong,2.14,1.21,1.55,0.96,1.48
flexible,0.0,0.0,0.0,0.0,0.0


In [70]:
# Category labels are the keys of `replacement`, materialised as a list so that the
# order is a fixed snapshot reused for the table columns (dict.keys() is a live view
# on Python 3).
cons_list = list(replacement.keys())
# One score per (attribute, category) pair, attribute-major, so the flat list can be
# reshaped row by row.
lift_score = [get_lift(attribute, category, amazon_data.cons_replace)
              for attribute in cons_attribute
              for category in cons_list]
formatted_lift_score = [round(score, 2) for score in lift_score]

In [71]:
# Lay the flat score list out as an (attribute x category) table. np.reshape is called
# explicitly because the bare `reshape` name only exists through `%pylab inline`.
pd.DataFrame(
    np.reshape(formatted_lift_score, (len(cons_attribute), len(cons_list))),
    index=cons_attribute,
    columns=list(cons_list),
)

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
low,0.89,3.97,1.03,0.86,1.78
little,1.44,1.57,1.43,1.34,1.88
hard,1.38,1.23,1.12,1.18,1.27
difficult,1.23,0.94,1.35,1.35,1.7
long,0.82,0.86,0.85,1.38,0.83
poor,1.29,1.42,2.28,1.49,1.7
limited,0.98,1.28,0.94,1.27,3.04
bad,1.67,1.34,1.69,1.17,1.06
slow,0.9,1.13,0.7,0.75,3.44
terrible,2.1,1.62,2.41,1.45,1.54


### Lift for mission Statement

    for Pros

In [87]:
# Terms whose lift against each category is computed in the cells that follow.
amazon_values = [
    'simplicity',
    'ownership',
    'courage',
    'frugality',
    'trust',
    'think big',
    'customer obsession',
]

In [88]:
# Lift of every company value against every category in the pros text,
# value-major so the flat list can be reshaped row by row.
mission_lift = [
    get_lift(value, category, amazon_data.pros_replace)
    for value in amazon_values
    for category in pros_list
]
formatted_lift_score = [round(score, 2) for score in mission_lift]

In [89]:
# Lay the flat score list out as a (value x category) table. np.reshape is called
# explicitly because the bare `reshape` name only exists through `%pylab inline`.
pd.DataFrame(
    np.reshape(formatted_lift_score, (len(amazon_values), len(pros_list))),
    index=amazon_values,
    columns=list(pros_list),
)

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
simplicity,0.0,3.06,0.0,0.0,4.6
ownership,1.31,0.49,1.04,0.66,1.54
courage,0.0,0.0,0.0,0.0,0.0
frugality,1.91,1.67,0.99,0.48,2.09
trust,1.61,1.1,1.85,0.67,1.3
think big,0.0,0.0,0.0,0.0,0.0
customer obsession,0.0,0.0,0.0,0.0,0.0


    for Cons

In [90]:
# Lift of every company value against every category in the cons text,
# value-major so the flat list can be reshaped row by row.
mission_lift = [
    get_lift(value, category, amazon_data.cons_replace)
    for value in amazon_values
    for category in cons_list
]
formatted_lift_score = [round(score, 2) for score in mission_lift]

In [91]:
# Lay the flat score list out as a (value x category) table. np.reshape is called
# explicitly because the bare `reshape` name only exists through `%pylab inline`.
pd.DataFrame(
    np.reshape(formatted_lift_score, (len(amazon_values), len(cons_list))),
    index=amazon_values,
    columns=list(cons_list),
)

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
simplicity,0.0,0.0,0.0,0.0,0.0
ownership,2.09,0.93,1.72,1.35,1.44
courage,2.88,0.0,2.06,0.0,0.0
frugality,1.74,2.04,1.22,0.94,0.93
trust,2.88,1.2,2.27,1.03,1.87
think big,0.0,0.0,0.0,0.0,0.0
customer obsession,0.0,0.0,0.0,0.0,0.0
