In [1]:
from selenium import webdriver
import time
import datetime
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

In [None]:
browser = webdriver.Chrome('C:\\Users\\an-user\\Downloads\\chromedriver.exe')
browser.implicitly_wait(5)
browser.get('https://www.jobplanet.co.kr/')

# Collecting Raw Text Data

In [27]:
##list for raw data
title_list = []
advantage_list = []
drawback_list = []
opinion_list = []
personal_info = []
promotion, welfareandsalary, worklifebalance, corporateculture, executives = [], [],[],[],[]

##crawling data from Jobplanet
for board_number in range(1, 25):
    browser.get("https://www.jobplanet.co.kr/companies/60632/reviews/%EC%9C%84%EB%A9%94%ED%94%84?page=&page=" + str(board_number))
    contents = browser.find_elements_by_css_selector('div.content_wrap')
    clean_contents = contents[:2] + contents[4:]

    for i in range(len(clean_contents)):
        ##personal_info and text info
        text = clean_contents[i].text.split('\n')
        personal_info.append(text[:text.index('평점')])
        title_list.append(' '.join(str(text[12:text.index('장점')])))
        advantage_list.append(' '.join(str(text[text.index('장점') + 1 : text.index('단점')])))
        drawback_list.append(' '.join(str(text[text.index('단점') + 1 : text.index('경영진에 바라는 점')])))
        opinion_list.append(' '.join(str(text[text.index('경영진에 바라는 점') + 1 : -5])))
    
        ##score info
        html = clean_contents[i].get_attribute('outerHTML')
        attributes = str(BeautifulSoup(html, 'lxml')).split('\n')
        promotion.append(re.findall(r'\d+', attributes[21]))
        welfareandsalary.append(re.findall(r'\d+', attributes[27]))
        worklifebalance.append(re.findall(r'\d+', attributes[33]))
        corporateculture.append(re.findall(r'\d+', attributes[39]))
        executives.append(re.findall(r'\d+', attributes[45]))

        
##Converting list into dataframe
personal_info_df = pd.DataFrame(personal_info, columns = ['info'])
title_df = pd.DataFrame(title_list, columns = ['title'])
advantage_df = pd.DataFrame(advantage_list, columns = ['advantage'])
drawback_df = pd.DataFrame(drawback_list, columns = ['drawback'])
opinion_df = pd.DataFrame(opinion_list, columns = ['opinion'])
promotion_score = pd.DataFrame(promotion, columns = ['promotion'])
welfareandsalary_score = pd.DataFrame(welfareandsalary, columns = ['welfaresalary'])
worklifebalance_score = pd.DataFrame(worklifebalance, columns = ['worklifebalance'])
corporateculture_score = pd.DataFrame(corporateculture, columns = ['culture'])
executives_score = pd.DataFrame(executives, columns = ['executives'])

## Merging Data
df = pd.concat([personal_info_df,
                     title_df,
                     advantage_df,
                     drawback_df,
                     opinion_df,
                     promotion_score,
                     welfareandsalary_score,
                     worklifebalance_score,
                     corporateculture_score,
                     executives_score], axis=1)


# Cleaning DataFrame

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 10 columns):
info               120 non-null object
title              120 non-null object
advantage          120 non-null object
drawback           120 non-null object
opinion            120 non-null object
promotion          120 non-null object
welfaresalary      120 non-null object
worklifebalance    120 non-null object
culture            120 non-null object
executives         120 non-null object
dtypes: object(10)
memory usage: 9.5+ KB


In [30]:
df['total'] = ""
df['date'] = ''
df['region'] = ''
df['workingstatus'] = ''
df['department'] = ''
for i in range(len(df)):
    ##converting 0 to 100 score scale into 1 to 5 score scale. 
    df['promotion'][i] = int(df['promotion'][i])/20
    df['welfaresalary'][i] = int(df['welfaresalary'][i])/20
    df['worklifebalance'][i] = int(df['worklifebalance'][i])/20
    df['culture'][i] = int(df['culture'][i])/20
    df['executives'][i] = int(df['executives'][i])/20
    df['total'][i] = (df['executives'][i] + df['culture'][i] + df['worklifebalance'][i] + df['welfaresalary'][i] + df['promotion'][i])/5
    
    
    ##spliting personal information of each observation
    information = df['info'][i].split('|')
    df['date'][i] = pd.to_datetime(information[3])
    df['region'][i] = information[2]
    df['workingstatus'][i] = information[1]
    df['department'][i] = information[0]
    
    ##removing characters from string of each text information
    df['title'][i] = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', df['title'][i])
    df['advantage'][i] = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', df['advantage'][i])
    df['drawback'][i] = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', df['drawback'][i])
    df['opinion'][i] = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', df['opinion'][i])

    

In [31]:
df.sample(4)

Unnamed: 0,info,title,advantage,drawback,opinion,promotion,welfaresalary,worklifebalance,culture,executives,total,date,region,workingstatus,department
95,IT/인터넷 | 현직원 | 서울 | 2019/4/21,복지도 잘 되어있고 같이 일하는 분위기가 자유로움,연차 자유롭고 분위기 자유로움 복지도 잘 되어 있음 지하철역과 가까움,영업손실이 나서 불안함 투자가 많아짐 업계 2워 자리 내어줌,연봉 많이 올려주세요 연봉 많이 올려주세요,3,3,3,3,3,3.0,2019-04-21 00:00:00,서울,현직원,IT/인터넷
111,IT/인터넷 | 현직원 | 서울 | 2019/4/3,경력 쌓고 싶으면 오세요 열려있습니다,경력 쌓기는 괜찮고 존중하는 업무분위기 연차 자유 지하철역 근처,오래다니기엔 비전이 없고 동종 업계에서 하락하는 추세 10년 후에 내 자리가 없을지도,회사에 신경 좀 써주세요,3,4,3,4,2,3.2,2019-04-03 00:00:00,서울,현직원,IT/인터넷
91,영업/제휴 | 전직원 | 서울 | 2019/4/26,분위기 매우 좋고 승진 기회가 많은 회사,젊은 사람들 위주라 분위기가 매우 화기애애하고 재밌고 자유로움,부서마다 분위기가 매우 다르고 가끔 빡세게 야근하는 경우가 발생 경영의 줏대가 없는 느낌,,4,2,4,4,3,3.4,2019-04-26 00:00:00,서울,전직원,영업/제휴
30,마케팅/시장조사 | 전직원 | 서울 | 2019/7/15,시대에 맞춰 복지를 다양하게 개선하려고 하지만 아직 옛 사고방식을 버리지 못한 기업,팀마다 분위기가 매우 다름 구내식당 아침1000원 점심2500원 저녁1000원 퀄리...,쓸데없는 일을 만들어서 사서 고생하는 스타일 다른 직원들에게 올 파장은 생각 안하고...,작은 계획의 변경이 모든 직원들에게 올 큰 파장과 리스크를 생각했으면 한다,4,4,4,2,2,3.2,2019-07-15 00:00:00,서울,전직원,마케팅/시장조사


In [32]:
df.to_csv('wemakeprice.csv', sep = ',')

## categorizing datasets into categories

(1) High Job Satisfaction Group - Hyundai, SK Telecome, Naver

(2) Low Job Satisfaction Group - Eland, WeMakePrice, Samsung Automobile

In [129]:
hdf1 = pd.read_csv('hyundai_df.csv')
hdf1 = hdf1.fillna('NaN')


hdf2 = pd.read_csv('naver_df.csv')
hdf2 = hdf2.fillna('NaN')


hdf3 = pd.read_csv('skt_df.csv')
hdf3 = hdf3.fillna('NaN')


ldf1 = pd.read_csv('elandworld.csv')
ldf1 = ldf1.fillna('NaN')



ldf2 = pd.read_csv('renaultsamsung_df.csv')
ldf2 = ldf2.fillna('NaN')


ldf3 = pd.read_csv('wemakeprice.csv')
ldf3 = ldf3.fillna('NaN')

In [130]:
hdf1['company'] = '현대자동차'
hdf2['company'] = '네이버'
hdf3['company'] = 'sk텔레콤'

ldf1['company'] = '이랜드'
ldf2['company'] = '르노삼성'
ldf3['company'] = '위메프'

In [131]:
high_df = pd.concat([hdf1, hdf2, hdf3], axis = 0)
high_df.drop(['Unnamed: 0'], axis=1, inplace = True)
low_df = pd.concat([ldf1, ldf2, ldf3], axis = 0)
low_df.drop(['Unnamed: 0'], axis=1, inplace = True)

In [132]:
## Resetting row numbers for clean dataframe.
high_df = high_df.reset_index(drop=True)
low_df = low_df.reset_index(drop=True)

In [8]:
print(high_df.info(),
    low_df.info(), sep = '\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 16 columns):
info               294 non-null object
title              294 non-null object
advantage          294 non-null object
drawback           294 non-null object
opinion            294 non-null object
promotion          294 non-null float64
welfaresalary      294 non-null float64
worklifebalance    294 non-null float64
culture            294 non-null float64
executives         294 non-null float64
total              294 non-null float64
date               294 non-null object
region             294 non-null object
workingstatus      294 non-null object
department         294 non-null object
company            294 non-null object
dtypes: float64(6), object(10)
memory usage: 36.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 16 columns):
info               264 non-null object
title              264 non-null object
advantage          264 non-n

## Creating Text File for Frequency Analysis for Each Group

In [6]:
high_df['total_text'] = high_df['title'] + ' ' + high_df['advantage'] + ' '+high_df['drawback'] + ' '+high_df['opinion']
low_df['total_text'] = low_df['title'] + ' ' + low_df['advantage'] + ' '+ low_df['drawback'] + ' '+ low_df['opinion']

In [18]:
##Saving Dataframe as Text File
f = open("high_total_text.txt", 'w')
for i in range(high_df.shape[0]):
    data = high_df.total_text[i]
    f.write(data)
f.close()

g = open("low_total_text.txt", 'w')
for i in range(low_df.shape[0]):
    data = low_df.total_text[i]
    g.write(data)
g.close()

In [2]:
##packages for frequency analysis
from konlpy.tag import Kkma
from konlpy.tag import Okt
from konlpy.tag import Twitter
from collections import Counter

In [8]:
##Uploading stop words list file
stop_words = open('stop_words.txt', 'r').read()
stop_words = stop_words.split(' ')

In [9]:
##tokenizing words and country frequency 

from nltk.tokenize import word_tokenize 

high_text_file = open('high_total_text.txt', 'r').read()
word_tokens = word_tokenize(high_text_file)

high_result = []
for w in word_tokens: 
    if w not in stop_words: 
        high_result.append(w) 


for w, c in Counter(high_result).most_common(10):
    print(w, c)

좋은 98
많이 90
회사 88
복지 58
많음 56
너무 55
많은 47
좀 44
대한 41
업무 39


In [10]:
##tokenizing words and country frequency 

from nltk.tokenize import word_tokenize 

low_text_file = open('low_total_text.txt', 'r').read()
word_tokens = word_tokenize(low_text_file)

low_result = []
for w in word_tokens: 
    if w not in stop_words: 
        low_result.append(w) 


for w, c in Counter(low_result).most_common(10):
    print(w, c)

회사 127
너무 73
없음 71
많이 57
많음 56
좋은 52
연차 48
분위기 44
좋음 40
일을 39


In [11]:
##counting words by  morpheme
##Counting Frequent Nouns for High Group

twitter = Twitter()
high_nouns = twitter.nouns(high_text_file)
high_noun_result = []
for w in high_nouns: 
    if w not in stop_words: 
        high_noun_result.append(w) 


for w, c in Counter(high_noun_result).most_common(10):
    print(w, c)

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')
-------------------------------------------------------------------------------
Deprecated: convertStrings was not specified when starting the JVM. The default
behavior in JPype will be False starting in JPype 0.8. The recommended setting
for new code is convertStrings=False.  The legacy value of True was assumed for
please file a ticket with the developer.
-------------------------------------------------------------------------------

  """)


회사 232
복지 166
일 143
업무 134
기업 133
문화 116
사람 116
분위기 108
직 101
연봉 99


In [12]:
##counting words by  morpheme
##Counting Frequent Nouns for Low Group
low_nouns = twitter.nouns(low_text_file)
low_noun_result = []
for w in low_nouns: 
    if w not in stop_words: 
        low_noun_result.append(w) 


for w, c in Counter(low_noun_result).most_common(10):
    print(w, c)

회사 274
사람 178
일 167
업무 127
직원 111
연봉 99
복지 99
함 95
분위기 94
연차 84


In [13]:
##Counting Verbs or Adjectives 
##Counts for High Group

high_pos = twitter.pos(high_text_file)

high_verb_adj = []

for i in range(len(high_pos)):
    if high_pos[i][1] == 'Verb' or high_pos[i][1] == 'Adjective':
        high_verb_adj.append(high_pos[i])
    else:
        pass
    
high_result = []
for w in high_verb_adj: 
    if w[0] not in stop_words:
        high_result.append(w)
    else:
        pass
    
for w, c in Counter(high_result).most_common(10):
    print(w, c)

('하는', 'Verb') 139
('좋은', 'Adjective') 131
('많음', 'Adjective') 59
('많은', 'Adjective') 52
('높은', 'Adjective') 48
('없음', 'Adjective') 47
('좋음', 'Adjective') 38
('있고', 'Adjective') 33
('자유로운', 'Adjective') 31
('하면', 'Verb') 30


In [14]:
low_pos = twitter.pos(high_text_file)

low_verb_adj = []

for i in range(len(low_pos)):
    if low_pos[i][1] == 'Verb' or low_pos[i][1] == 'Adjective':
        low_verb_adj.append(low_pos[i])
    else:
        pass
    
low_result = []
for w in low_verb_adj: 
    if w[0] not in stop_words:
        low_result.append(w)
    else:
        pass
    
for w, c in Counter(low_result).most_common(10):
    print(w, c)

('하는', 'Verb') 139
('좋은', 'Adjective') 131
('많음', 'Adjective') 59
('많은', 'Adjective') 52
('높은', 'Adjective') 48
('없음', 'Adjective') 47
('좋음', 'Adjective') 38
('있고', 'Adjective') 33
('자유로운', 'Adjective') 31
('하면', 'Verb') 30


## Analysis plan

Purpose: Converting Text Data to Keyword Format

1. By reading each observation, each text data is going to be converted into keywords
- keywords are going to be arranged by each column. For example, advantage keyword and drawback keyword are going to be seperated.
 
2. Frequency Analysis for Keyword 

3. Categorizing Keywords based on Attractiveness/Avoidance Theory

## Converting high group's text into keyword data

In [None]:
high_keyword_df = high_df[['title', 'advantage', 'drawback', 'opinion', 'total_text', 'company']]
high_keyword_df['title_key'] = ''
high_keyword_df['advantage_key'] = ''
high_keyword_df['drawback_key'] = ''
high_keyword_df['opinion_key'] = ''

In [2]:
high_keyword_df = pd.read_csv('high_keyword_df.csv')
high_keyword_df.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [3]:
high_keyword_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 10 columns):
title            294 non-null object
advantage        294 non-null object
drawback         294 non-null object
opinion          262 non-null object
total_text       294 non-null object
company          294 non-null object
title_key        294 non-null object
advantage_key    294 non-null object
drawback_key     294 non-null object
opinion_key      294 non-null object
dtypes: object(10)
memory usage: 23.0+ KB


## Converting low group's text into keyword data

In [137]:
low_keyword_df = low_df[['title', 'advantage', 'drawback', 'opinion', 'total', 'company']]
low_keyword_df['title_key'] = ''
low_keyword_df['advantage_key'] = ''
low_keyword_df['drawback_key'] = ''
low_keyword_df['opinion_key'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [4]:
low_keyword_df = pd.read_csv('low_keyword_df.csv')
low_keyword_df.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [5]:
low_keyword_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 10 columns):
title            264 non-null object
advantage        264 non-null object
drawback         264 non-null object
opinion          179 non-null object
total            264 non-null float64
company          264 non-null object
title_key        264 non-null object
advantage_key    264 non-null object
drawback_key     264 non-null object
opinion_key      264 non-null object
dtypes: float64(1), object(9)
memory usage: 20.7+ KB
