In [1]:
import pandas as pd
import re,string
import nltk
from patsy import dmatrices
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
import matplotlib.pyplot as plt
import warnings
%pylab inline
import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances

Populating the interactive namespace from numpy and matplotlib


In [160]:
charles_data = pd.read_csv('Charles.csv')

In [161]:
charles_data.head()

Unnamed: 0,Title,Date Written,Rating,Current/Former,Job Title,Location,Recommendation?,Outlook,Main Text,Pros,Cons,Advice to management
0,"""Senior Manager""","Oct 4, 2018",5.0,Current Employee,Senior Manager,"San Francisco, CA",Recommends,Positive Outlook,Benefits are comparable to other financial ser...,Great work/life balance,Company isn't doing enough to ensure equal rep...,Create a formal policy for pay parity
1,"""Specialist - Operations""","Oct 2, 2018",5.0,Current Employee,Anonymous Employee,"Lonetree, CO",Recommends,Positive Outlook,I have been working at Charles Schwab full-tim...,"Culture, Sabbaticals, Volunteering opportuniti...",Very top heavy for management.,
2,"""Business Analyst""","Sep 30, 2018",5.0,Current Employee,Business Analyst,"Austin, TX",Recommends,Positive Outlook,I have been working at Charles Schwab full-tim...,Charles Schwab is a an excellent Company. Even...,Something vertical movement can be a little slow.,
3,"""Great Culture""","Sep 26, 2018",5.0,Current Employee,Talent Sourcing Advisor,"Chicago, IL",Recommends,Positive Outlook,I have been working at Charles Schwab full-tim...,Great management that truly cares about its em...,not a large campus in Chicago,
4,"""Job Review""","Sep 26, 2018",5.0,Current Employee,Anonymous Employee,,,,I have been working at Charles Schwab full-time,Very nice environment to work in.,no cons come to my mind,Keep up the good work


# Steps 
1. tokenize each pros/cons review
2. get POS, lemmentize and find pros_attibutes/ cons_attributes by getting adj POS
3. categorize 1. into 5 categories which match to glassdoor's rating categories
4. calculate lift score form 2 and 3

# Pros

### step 1

In [162]:
#clean punctuation and white spaces
charles_data["pros_clean"] = charles_data.Pros.apply(lambda x:re.sub(r'[^\w\s]', ' ', x.lower()))
charles_data["pros_clean"] = charles_data.pros_clean.apply(lambda x:re.sub(r'(\\n+)', ' ', x))
charles_data["pros_clean"] = charles_data.pros_clean.apply(lambda x:re.sub(r'^b[\'\"\'"\"''\s]', '', x.lower()))
charles_data["pros_clean"] = charles_data.pros_clean.apply(lambda x:re.sub(r'\s+', ' ', x))

#tokenizing and removing stop words
stop = set(stopwords.words('english'))
punc = string.punctuation
charles_data['pros_clean'] = charles_data.pros_clean.apply(lambda x: [word for word in word_tokenize(x) if word not in stop])
charles_data['pros_clean'] = charles_data['pros_clean'].apply(lambda x: [word for word in x if word not in punc])


charles_data.head()

Unnamed: 0,Title,Date Written,Rating,Current/Former,Job Title,Location,Recommendation?,Outlook,Main Text,Pros,Cons,Advice to management,pros_clean
0,"""Senior Manager""","Oct 4, 2018",5.0,Current Employee,Senior Manager,"San Francisco, CA",Recommends,Positive Outlook,Benefits are comparable to other financial ser...,Great work/life balance,Company isn't doing enough to ensure equal rep...,Create a formal policy for pay parity,"[great, work, life, balance]"
1,"""Specialist - Operations""","Oct 2, 2018",5.0,Current Employee,Anonymous Employee,"Lonetree, CO",Recommends,Positive Outlook,I have been working at Charles Schwab full-tim...,"Culture, Sabbaticals, Volunteering opportuniti...",Very top heavy for management.,,"[culture, sabbaticals, volunteering, opportuni..."
2,"""Business Analyst""","Sep 30, 2018",5.0,Current Employee,Business Analyst,"Austin, TX",Recommends,Positive Outlook,I have been working at Charles Schwab full-tim...,Charles Schwab is a an excellent Company. Even...,Something vertical movement can be a little slow.,,"[charles, schwab, excellent, company, even, th..."
3,"""Great Culture""","Sep 26, 2018",5.0,Current Employee,Talent Sourcing Advisor,"Chicago, IL",Recommends,Positive Outlook,I have been working at Charles Schwab full-tim...,Great management that truly cares about its em...,not a large campus in Chicago,,"[great, management, truly, cares, employees, c..."
4,"""Job Review""","Sep 26, 2018",5.0,Current Employee,Anonymous Employee,,,,I have been working at Charles Schwab full-time,Very nice environment to work in.,no cons come to my mind,Keep up the good work,"[nice, environment, work]"


### step 2

In [250]:
#get POS
get_pos = charles_data['pros_clean'].apply(lambda x: [word for word in nltk.pos_tag(x)])

adj_list = []
for review in get_pos:
    #print review
    for word,pos in review:
        if pos == 'JJ' or pos == 'JJR' or pos == 'JJS': # if the POS-tag is adjective
            adj_list.append(word)

In [255]:
from collections import Counter
Counter(adj_list).most_common(30)

[('great', 866),
 ('good', 743),
 ('nice', 131),
 ('financial', 118),
 ('sabbatical', 87),
 ('decent', 84),
 ('many', 83),
 ('excellent', 73),
 ('best', 70),
 ('ethical', 67),
 ('schwab', 63),
 ('salary', 58),
 ('strong', 56),
 ('flexible', 56),
 ('new', 50),
 ('easy', 49),
 ('friendly', 45),
 ('positive', 43),
 ('different', 43),
 ('right', 39),
 ('professional', 35),
 ('high', 34),
 ('co', 34),
 ('solid', 32),
 ('corporate', 32),
 ('smart', 32),
 ('stable', 31),
 ('large', 30),
 ('willing', 30),
 ('hard', 30)]

In [256]:
#manually took some attributes from the top frequent pros words
pros_attribute = ['great','good','happy','nice','decent','excellent','best','ethical','strong','flexible','new',
                  'easy','friendly','positive','different','professional','high','solid','corporate',
                  'smart','stable','large']

### step 3

    Find the unique words list

In [257]:
#create a function that would return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
         return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN
    
wnl = WordNetLemmatizer()

#Lemmentization
def wn_pos(filtered_pos,empty_list):
    for word,pos in filtered_pos:
        empty_list.append(wnl.lemmatize(word,get_wordnet_pos(pos)))
        #print pos
        #print get_wordnet_pos(pos)
    return empty_list

In [259]:
get_pos[:3]

0    [(great, JJ), (work, NN), (life, NN), (balance...
1    [(culture, NN), (sabbaticals, NNS), (volunteer...
2    [(charles, NNS), (schwab, VBP), (excellent, JJ...
Name: pros_clean, dtype: object

In [264]:
pros_lem = []
for review in get_pos:
    for word,pos in review:
        wn_pos(review,pros_lem)

In [266]:
pros_words = nltk.FreqDist(pros_lem)
rslt_unique = pd.DataFrame.from_dict(pros_words,orient='index').reset_index()
rslt_unique.columns = ['word','frequency']
pd.DataFrame(pros_words.most_common(500)).to_csv("most common pros.csv")

In [285]:
pros_replace = {'work_life_balance':['time','life','balance','sabbatical','sabbaticals','focus','hour','day','health','flexible','week'
                                    ,'vacation','schedule','overtime'],
                'culture_value':['people','culture','team','care','value','product','coworkers','atmosphere','competitive'
                                ,'family','collaboration','respect','community','colleague','supportive','vision','diversity'],
                'career_oppotunity':['opportunity','learn','industry','career','license','training','train','growth','grow'
                                    ,'level','position','development','advancement','advance','study','build','skill','resource'
                                    ,'education','potential'],
                'company_benefit':['company','benefit','pay','financial','financially','provide','salary','bonus','offer'
                                  ,'401k','package','stock','compensation','invest','investment','money','performance','reward'
                                  ,'retirement','promote','insurance'],
                'senior_management':['place','environment','management','help','manager','experience','match'
                                                ,'plan','office','support','location','leadership','treat','helpful','senior'
                                                ,'manage','leader','communication']}

    replace the original reviews with those categories

In [286]:
def getKeysByValue(dictOfElements, valueToFind):
    for k,v  in dictOfElements.items():
        if valueToFind in v:
            return(k)
    return  valueToFind

def replace_attributes(s):
    return([getKeysByValue(pros_replace,y) for y in s])

In [287]:
charles_data['pros_replace'] = charles_data['pros_clean'].map(replace_attributes)

In [288]:
charles_data['pros_replace'].head()

0    [great, work, work_life_balance, work_life_bal...
1    [culture_value, work_life_balance, volunteerin...
2    [charles, schwab, excellent, company_benefit, ...
3    [great, senior_management, truly, cares, emplo...
4                      [nice, senior_management, work]
Name: pros_replace, dtype: object

### Step 4: Lift Score for Pros

In [289]:
def ratio(x,y):
    if x==0:
        return float(y)
    if y==0:
        return float(x)
    return(float(x)*float(y))

def get_lift(a,b,tokenized_data):
    '''Function to calculate lift scores given any two words from a list of tokenized words'''
    if (a==b):
        return 1
    p_a = len([i for i in tokenized_data if a in i])
    p_b = len([i for i in tokenized_data if (b in i)])
    p_a_b = len([i for i in tokenized_data if a in i if b in i])
    n = len(tokenized_data)
    return float(float(n)*float(p_a_b)/ratio(p_a,p_b))

In [290]:
pros_list = pros_replace.keys()
lift_score = [get_lift(x,y,charles_data.pros_replace) for x in pros_attribute for y in pros_list]
formatted_lift_score = [ round(elem,2) for elem in lift_score ]

In [291]:
pd.DataFrame(reshape(formatted_lift_score,(len(pros_attribute),len(pros_list))),index =pros_attribute , columns = pros_list )

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
great,1.26,1.08,1.17,1.1,1.2
good,1.02,1.22,1.19,1.17,1.04
nice,1.54,0.81,1.11,1.17,0.71
decent,0.71,1.53,1.26,1.67,1.17
excellent,1.19,1.43,1.43,1.13,1.14
best,1.07,1.57,1.34,1.36,1.35
ethical,0.89,1.63,0.87,0.77,1.15
strong,1.49,1.01,1.26,1.19,0.83
flexible,0.0,0.0,0.0,0.0,0.0
new,1.27,1.09,1.42,1.18,2.18


# Cons

### Step 1

In [292]:
#clean punctuation and white spaces
charles_data["cons_clean"] = charles_data.Cons.apply(lambda x:re.sub(r'[^\w\s]', ' ', x.lower()))
charles_data["cons_clean"] = charles_data.cons_clean.apply(lambda x:re.sub(r'(\\n+)', ' ', x))
charles_data["cons_clean"] = charles_data.cons_clean.apply(lambda x:re.sub(r'^b[\'\"\'"\"''\s]', '', x.lower()))
charles_data["cons_clean"] = charles_data.cons_clean.apply(lambda x:re.sub(r'\s+', ' ', x))

#tokenizing and removing stop words
stop = set(stopwords.words('english'))
punc = string.punctuation
charles_data['cons_clean'] = charles_data.cons_clean.apply(lambda x: [word for word in word_tokenize(x) if word not in stop])
charles_data['cons_clean'] = charles_data['cons_clean'].apply(lambda x: [word for word in x if word not in punc])


charles_data.head()

Unnamed: 0,Title,Date Written,Rating,Current/Former,Job Title,Location,Recommendation?,Outlook,Main Text,Pros,Cons,Advice to management,pros_clean,pros_replace,cons_clean
0,"""Senior Manager""","Oct 4, 2018",5.0,Current Employee,Senior Manager,"San Francisco, CA",Recommends,Positive Outlook,Benefits are comparable to other financial ser...,Great work/life balance,Company isn't doing enough to ensure equal rep...,Create a formal policy for pay parity,"[great, work, life, balance]","[great, work, work_life_balance, work_life_bal...","[company, enough, ensure, equal, representatio..."
1,"""Specialist - Operations""","Oct 2, 2018",5.0,Current Employee,Anonymous Employee,"Lonetree, CO",Recommends,Positive Outlook,I have been working at Charles Schwab full-tim...,"Culture, Sabbaticals, Volunteering opportuniti...",Very top heavy for management.,,"[culture, sabbaticals, volunteering, opportuni...","[culture_value, work_life_balance, volunteerin...","[top, heavy, management]"
2,"""Business Analyst""","Sep 30, 2018",5.0,Current Employee,Business Analyst,"Austin, TX",Recommends,Positive Outlook,I have been working at Charles Schwab full-tim...,Charles Schwab is a an excellent Company. Even...,Something vertical movement can be a little slow.,,"[charles, schwab, excellent, company, even, th...","[charles, schwab, excellent, company_benefit, ...","[something, vertical, movement, little, slow]"
3,"""Great Culture""","Sep 26, 2018",5.0,Current Employee,Talent Sourcing Advisor,"Chicago, IL",Recommends,Positive Outlook,I have been working at Charles Schwab full-tim...,Great management that truly cares about its em...,not a large campus in Chicago,,"[great, management, truly, cares, employees, c...","[great, senior_management, truly, cares, emplo...","[large, campus, chicago]"
4,"""Job Review""","Sep 26, 2018",5.0,Current Employee,Anonymous Employee,,,,I have been working at Charles Schwab full-time,Very nice environment to work in.,no cons come to my mind,Keep up the good work,"[nice, environment, work]","[nice, senior_management, work]","[cons, come, mind]"


### Step 2

In [293]:
#get POS
get_cons = charles_data['cons_clean'].apply(lambda x: [word for word in nltk.pos_tag(x)])

adj_list = []
for review in get_cons:
    #print review
    for word,pos in review:
        if pos == 'JJ' or pos == 'JJR' or pos == 'JJS': # if the POS-tag is adjective
            adj_list.append(word)

In [295]:
from collections import Counter
Counter(adj_list).most_common(50)

[('many', 166),
 ('low', 123),
 ('much', 111),
 ('good', 102),
 ('new', 97),
 ('little', 96),
 ('great', 90),
 ('high', 90),
 ('salary', 83),
 ('hard', 82),
 ('difficult', 77),
 ('financial', 77),
 ('long', 63),
 ('poor', 63),
 ('schwab', 57),
 ('limited', 55),
 ('large', 54),
 ('senior', 54),
 ('bad', 53),
 ('competitive', 51),
 ('slow', 50),
 ('san', 49),
 ('corporate', 49),
 ('lower', 44),
 ('upper', 39),
 ('constant', 38),
 ('top', 37),
 ('due', 37),
 ('big', 37),
 ('francisco', 36),
 ('best', 33),
 ('different', 33),
 ('terrible', 30),
 ('real', 30),
 ('last', 28),
 ('middle', 27),
 ('less', 27),
 ('average', 26),
 ('internal', 26),
 ('enough', 26),
 ('current', 25),
 ('political', 24),
 ('better', 23),
 ('red', 22),
 ('overall', 22),
 ('conservative', 22),
 ('certain', 21),
 ('full', 20),
 ('upward', 20),
 ('horrible', 20)]

In [311]:
#manually took some attributes from the top frequent cons words
cons_attribute = ['low','little','hard','difficult','long','poor','limited','bad','slow','terrible','conservative'
                 ,'horrible','different','less','senior','political','bureaucratic']

### Step 3

In [312]:
cons_lem = []
for review in get_cons:
    for word,pos in review:
        wn_pos(review,cons_lem)

In [313]:
cons_words = nltk.FreqDist(cons_lem)
rslt_unique = pd.DataFrame.from_dict(cons_words,orient='index').reset_index()
rslt_unique.columns = ['word','frequency']
pd.DataFrame(cons_words.most_common(500)).to_csv("most common cons.csv")

    It seems like the cons words are almost the same as pros so we can just use the same replacement words as pros.

In [314]:
def getKeysByValue(dictOfElements, valueToFind):
    for k,v  in dictOfElements.items():
        if valueToFind in v:
            return(k)
    return  valueToFind

def replace_attributes(s):
    return([getKeysByValue(pros_replace,y) for y in s])

In [315]:
charles_data['cons_replace'] = charles_data['cons_clean'].map(replace_attributes)

In [316]:
charles_data['cons_replace'].head()

0    [company_benefit, enough, ensure, equal, repre...
1                      [top, heavy, senior_management]
2        [something, vertical, movement, little, slow]
3                             [large, campus, chicago]
4                                   [cons, come, mind]
Name: cons_replace, dtype: object

### Step 4

In [317]:
cons_list = pros_replace.keys()
lift_score = [get_lift(x,y,charles_data.cons_replace) for x in cons_attribute for y in cons_list]
formatted_lift_score = [ round(elem,2) for elem in lift_score ]

In [318]:
pd.DataFrame(reshape(formatted_lift_score,(len(cons_attribute),len(cons_list))),index =cons_attribute , columns = cons_list )

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
low,0.92,2.08,1.11,1.31,1.19
little,1.14,1.39,1.32,1.1,1.67
hard,1.55,1.2,0.92,1.17,1.24
difficult,0.86,0.77,1.31,1.46,1.14
long,1.36,1.0,1.21,2.59,1.35
poor,1.67,1.34,2.07,0.92,1.52
limited,0.57,0.96,0.92,0.99,2.18
bad,1.01,1.16,1.5,0.94,0.83
slow,0.77,0.73,0.76,0.8,1.5
terrible,1.67,1.49,2.07,1.36,0.88


### Lift for mission Statement
    for Pros

In [319]:
schwab_values = ['trust','innovation', 'teamwork','stewards', 'ethical', 'proactive']

In [320]:
mission_lift = [get_lift(x,y,charles_data.pros_replace) for x in schwab_values for y in pros_list]
formatted_lift_score = [ round(elem,2) for elem in mission_lift ]

In [321]:
pd.DataFrame(reshape(formatted_lift_score,(len(schwab_values),len(pros_list))),index =schwab_values , columns = pros_list )

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
trust,1.59,1.62,0.93,1.32,1.22
innovation,1.19,2.43,1.39,0.0,1.83
teamwork,0.9,0.91,0.0,1.97,1.37
stewards,2.39,2.43,2.79,0.0,3.66
ethical,0.89,1.63,0.87,0.77,1.15
proactive,1.59,0.81,2.79,1.32,2.44


In [322]:
mission_lift = [get_lift(x,y,charles_data.cons_replace) for x in schwab_values for y in cons_list]
formatted_lift_score = [ round(elem,2) for elem in mission_lift ]

In [323]:
pd.DataFrame(reshape(formatted_lift_score,(len(schwab_values),len(cons_list))),index =schwab_values , columns = cons_list )

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
trust,1.62,0.42,2.0,1.87,0.6
innovation,1.94,1.02,2.0,1.12,1.69
teamwork,2.42,2.54,1.5,0.0,1.81
stewards,0.0,0.0,0.0,0.0,0.0
ethical,0.0,0.0,0.0,0.0,0.0
proactive,0.0,0.0,0.0,0.0,0.0
