In [1]:
import pandas as pd
import re,string
import nltk
from patsy import dmatrices
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
import matplotlib.pyplot as plt
import warnings
%pylab inline
import numpy as np
from sklearn.manifold import MDS
from sklearn.metrics import euclidean_distances

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the Dell reviews CSV (its Pros and Cons columns are processed below).
# The path has no directory part, so it is resolved against the current working directory.
dell_data = pd.read_csv('Dell_csv.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
dell_data.head()

Unnamed: 0,Title,Date Written,Rating,Current/Former,Job Title,Location,Recommendation?,Outlook,Main Text,Pros,Cons,Advice to management
0,"""Financial Analyst""","Sep 24, 2018",5.0,Current Employee,Anonymous Employee,,,,I have been working at Dell full-time,"Company culture, work life balance",High achieving individuals may feel underemployed,
1,"""Sr Principal Engineer""","Sep 27, 2018",4.0,Current Employee,Architect,"Austin, TX",Recommends,Positive Outlook,"Oct 2, 2018 – Candidate and Employee Experienc...",Good work environment,Slow career growth,
2,"""Good Place to work""","Sep 28, 2018",4.0,Current Employee,Anonymous Employee,,,,,"Flexibility in work, Friendly environment,scop...",Lack of communication gap between team members...,
3,"""I enjoy working here, but my compensation is ...","Sep 25, 2018",4.0,Current Employee,Senior Representative,"Franklin, MA",Recommends,Neutral Outlook,I have been working at Dell full-time (More th...,"I like my job, I have a lot of responsibility,...",My pay does not reflect the level of difficult...,I would like more opportunities for growth in ...
4,"""Technical Support Manager""","Sep 26, 2018",4.0,Former Employee,Anonymous Employee,,Recommends,Positive Outlook,I worked at Dell part-time,"Competitive salary, great culture",Career growth limited at the management level ...,promote more african Americans to senior manag...


### Step 1: pros/cons data pre-processing

In [4]:
# Clean punctuation and white space, then tokenize and drop stop words,
# for both the Pros and the Cons review text.

def _clean_review_text(text):
    """Normalise one raw review string before tokenization.

    Steps, in order:
      1. lower-case the text;
      2. turn literal backslash-n sequences into a space;
      3. replace every character that is neither a word character nor
         whitespace with a space;
      4. drop a leading ``b`` that is followed by a quote or whitespace;
      5. collapse runs of whitespace into a single space.
    """
    text = text.lower()
    # Must run BEFORE the punctuation step: that step replaces every backslash
    # with a space, after which this pattern could never match and a literal
    # "\n" would leave a stray "n" attached to the following word.
    text = re.sub(r'(\\n+)', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    # Same character class as before (quote, double quote, whitespace), written
    # as a single raw string so it no longer depends on an implicitly
    # concatenated non-raw '\s' literal (an invalid escape sequence).
    text = re.sub(r'^b[\'"\s]', '', text)
    return re.sub(r'\s+', ' ', text)


stop = set(stopwords.words('english'))
punc = string.punctuation
# Exact single-character membership; `word in punc` on the string itself is a
# substring test.
_punc_chars = set(punc)


def _tokenize_review(text):
    """Tokenize cleaned text, dropping English stop words and punctuation tokens."""
    return [
        word
        for word in word_tokenize(text)
        if word not in stop and word not in _punc_chars
    ]


# One shared pipeline for both columns instead of two copy-pasted ones.
# Missing reviews (NaN) become empty strings, hence empty token lists,
# rather than crashing on `.lower()`.
for _raw_col, _clean_col in (("Pros", "pros_clean"), ("Cons", "cons_clean")):
    dell_data[_clean_col] = (
        dell_data[_raw_col]
        .fillna('')
        .astype(str)
        .apply(_clean_review_text)
        .apply(_tokenize_review)
    )

dell_data.head()

Unnamed: 0,Title,Date Written,Rating,Current/Former,Job Title,Location,Recommendation?,Outlook,Main Text,Pros,Cons,Advice to management,pros_clean,cons_clean
0,"""Financial Analyst""","Sep 24, 2018",5.0,Current Employee,Anonymous Employee,,,,I have been working at Dell full-time,"Company culture, work life balance",High achieving individuals may feel underemployed,,"[company, culture, work, life, balance]","[high, achieving, individuals, may, feel, unde..."
1,"""Sr Principal Engineer""","Sep 27, 2018",4.0,Current Employee,Architect,"Austin, TX",Recommends,Positive Outlook,"Oct 2, 2018 – Candidate and Employee Experienc...",Good work environment,Slow career growth,,"[good, work, environment]","[slow, career, growth]"
2,"""Good Place to work""","Sep 28, 2018",4.0,Current Employee,Anonymous Employee,,,,,"Flexibility in work, Friendly environment,scop...",Lack of communication gap between team members...,,"[flexibility, work, friendly, environment, sco...","[lack, communication, gap, team, members, mana..."
3,"""I enjoy working here, but my compensation is ...","Sep 25, 2018",4.0,Current Employee,Senior Representative,"Franklin, MA",Recommends,Neutral Outlook,I have been working at Dell full-time (More th...,"I like my job, I have a lot of responsibility,...",My pay does not reflect the level of difficult...,I would like more opportunities for growth in ...,"[like, job, lot, responsibility, work, good, p...","[pay, reflect, level, difficulty, responsibili..."
4,"""Technical Support Manager""","Sep 26, 2018",4.0,Former Employee,Anonymous Employee,,Recommends,Positive Outlook,I worked at Dell part-time,"Competitive salary, great culture",Career growth limited at the management level ...,promote more african Americans to senior manag...,"[competitive, salary, great, culture]","[career, growth, limited, management, level, u..."


### Step 2: get pros/cons attributes

In [5]:
# Attribute words hand-picked from the most frequent words in the pros text.
# NOTE(review): 'flexible' is also listed under replacement['work_life_balance'],
# so replace_attributes rewrites it to the category name before lift is computed
# and its lift row comes out as all zeros.
pros_attribute = [
    'great', 'good', 'happy', 'nice', 'decent', 'excellent',
    'best', 'ethical', 'strong', 'flexible', 'new', 'easy',
    'friendly', 'positive', 'different', 'professional', 'high', 'solid',
    'corporate', 'smart', 'stable', 'large',
]

In [6]:
# Attribute words hand-picked from the most frequent words in the cons text.
# NOTE(review): 'senior' is also listed under replacement['senior_management'],
# so replace_attributes rewrites it to the category name before lift is computed;
# its lift row is therefore expected to be all zeros.
cons_attribute = [
    'low', 'little', 'hard', 'difficult', 'long', 'poor',
    'limited', 'bad', 'slow', 'terrible', 'conservative', 'horrible',
    'different', 'less', 'senior', 'political', 'bureaucratic',
]

### Step 3: lemmatize -> get replacement

In [7]:
# Category name -> member words. getKeysByValue scans the categories in this
# order and returns the first one whose list contains the token, so both the
# key order and the list contents matter.
replacement = {
    'work_life_balance': [
        'time', 'life', 'balance', 'sabbatical', 'sabbaticals', 'focus', 'hour',
        'day', 'health', 'flexible', 'week', 'vacation', 'schedule', 'overtime',
    ],
    'culture_value': [
        'people', 'culture', 'team', 'care', 'value', 'product', 'coworkers',
        'atmosphere', 'competitive', 'family', 'collaboration', 'respect',
        'community', 'colleague', 'supportive', 'vision', 'diversity',
    ],
    'career_oppotunity': [
        'opportunity', 'learn', 'industry', 'career', 'license', 'training',
        'train', 'growth', 'grow', 'level', 'position', 'development',
        'advancement', 'advance', 'study', 'build', 'skill', 'resource',
        'education', 'potential',
    ],
    'company_benefit': [
        'company', 'benefit', 'pay', 'financial', 'financially', 'provide',
        'salary', 'bonus', 'offer', '401k', 'package', 'stock', 'compensation',
        'invest', 'investment', 'money', 'performance', 'reward', 'retirement',
        'promote', 'insurance',
    ],
    'senior_management': [
        'place', 'environment', 'management', 'help', 'manager', 'experience',
        'match', 'plan', 'office', 'support', 'location', 'leadership', 'treat',
        'helpful', 'senior', 'manage', 'leader', 'communication',
    ],
}

In [8]:
def getKeysByValue(dictOfElements, valueToFind):
    """Return the first key whose value contains ``valueToFind``.

    Keys are checked in the dictionary's iteration order using the ``in``
    operator on each value. When no value contains it, ``valueToFind`` itself
    is returned unchanged.
    """
    matches = (key for key, members in dictOfElements.items() if valueToFind in members)
    return next(matches, valueToFind)

def replace_attributes(s):
    """Map each element of ``s`` through ``getKeysByValue(replacement, ...)``.

    Returns a new list of the same length: an element found in one of the
    ``replacement`` lists becomes that list's key, any other element is kept.
    """
    mapped = []
    for token in s:
        mapped.append(getKeysByValue(replacement, token))
    return mapped

In [9]:
# Rewrite every token list so that category member words become the category name.
dell_data['pros_replace'] = dell_data.pros_clean.apply(replace_attributes)
dell_data['cons_replace'] = dell_data.cons_clean.apply(replace_attributes)

### Step 4: Lift Score

In [10]:
def ratio(x,y):
    """Return ``x * y`` as a float, except that a zero operand is skipped.

    If ``x`` is 0 the result is ``float(y)``; otherwise if ``y`` is 0 the
    result is ``float(x)``. Two zeros therefore give 0.0.
    """
    if x == 0 or y == 0:
        return float(y) if x == 0 else float(x)
    return float(x) * float(y)

def get_lift(a,b,tokenized_data):
    '''Calculate the lift score of two words over a collection of token lists.

    lift(a, b) = n * count(a and b) / (count(a) * count(b)), where each count
    is the number of entries of ``tokenized_data`` containing the word(s) and
    ``n`` is the number of entries.

    Parameters
    ----------
    a, b : words to look up with the ``in`` operator in each entry.
    tokenized_data : iterable of token containers (e.g. a list or Series of lists).

    Returns
    -------
    float
        1.0 when ``a == b``; 0.0 when the two words never occur together
        (including when either word, or the data itself, is absent) instead of
        raising ZeroDivisionError; otherwise the lift value.
    '''
    if a == b:
        return 1.0

    n = 0
    p_a = 0
    p_b = 0
    p_a_b = 0
    # Single pass: count entries containing a, b, and both.
    for tokens in tokenized_data:
        n += 1
        has_a = a in tokens
        has_b = b in tokens
        if has_a:
            p_a += 1
        if has_b:
            p_b += 1
        if has_a and has_b:
            p_a_b += 1

    # No co-occurrence means a zero numerator; returning early also avoids the
    # 0/0 case where neither word occurs at all.
    if p_a_b == 0:
        return 0.0
    return float(n) * float(p_a_b) / (float(p_a) * float(p_b))

In [11]:
# Lift of every pros attribute against every replacement category, computed on
# the category-replaced pros tokens. Scores are generated attribute by attribute
# (all categories for the first attribute, then the next), then rounded to 2 decimals.
pros_list = replacement.keys()
lift_score = [
    get_lift(attribute, category, dell_data.pros_replace)
    for attribute in pros_attribute
    for category in pros_list
]
formatted_lift_score = list(map(lambda score: round(score, 2), lift_score))

In [12]:
# Lay the flat list of rounded lift scores out as an attribute x category table.
# np.reshape is called explicitly rather than through the bare `reshape` name that
# only exists because of the `%pylab inline` star import, and the keys view is
# materialised so pandas receives a concrete list of column labels.
pd.DataFrame(
    np.reshape(formatted_lift_score, (len(pros_attribute), len(pros_list))),
    index=pros_attribute,
    columns=list(pros_list),
)

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
great,1.45,1.15,1.38,1.1,1.18
good,1.09,1.29,1.14,1.18,0.99
happy,1.72,1.42,1.38,1.02,0.53
nice,1.56,1.01,1.57,0.95,0.72
decent,1.02,2.32,0.99,1.49,1.32
excellent,1.33,1.35,1.3,1.16,1.42
best,1.17,1.29,1.48,0.95,1.27
ethical,1.07,1.82,1.1,0.81,1.05
strong,1.41,1.35,1.23,1.0,1.51
flexible,0.0,0.0,0.0,0.0,0.0


In [13]:
# Lift of every cons attribute against every replacement category, computed on
# the category-replaced cons tokens. Scores are generated attribute by attribute
# (all categories for the first attribute, then the next), then rounded to 2 decimals.
cons_list = replacement.keys()
lift_score = [
    get_lift(attribute, category, dell_data.cons_replace)
    for attribute in cons_attribute
    for category in cons_list
]
formatted_lift_score = list(map(lambda score: round(score, 2), lift_score))

In [14]:
# Lay the flat list of rounded lift scores out as an attribute x category table.
# np.reshape is called explicitly rather than through the bare `reshape` name that
# only exists because of the `%pylab inline` star import, and the keys view is
# materialised so pandas receives a concrete list of column labels.
pd.DataFrame(
    np.reshape(formatted_lift_score, (len(cons_attribute), len(cons_list))),
    index=cons_attribute,
    columns=list(cons_list),
)

Unnamed: 0,culture_value,company_benefit,senior_management,work_life_balance,career_oppotunity
low,1.28,2.44,1.12,1.19,1.47
little,1.46,1.03,1.42,1.29,1.55
hard,1.19,1.24,1.18,1.25,1.42
difficult,1.24,1.2,1.33,1.48,1.69
long,1.42,1.05,1.26,2.4,1.11
poor,1.39,1.24,2.21,1.31,1.23
limited,1.16,1.09,1.01,0.92,3.35
bad,1.56,1.18,1.78,1.03,1.03
slow,0.66,1.08,0.72,0.77,2.48
terrible,1.92,1.65,2.5,0.9,1.53
