In [1]:
# This part of the analysis is a description of the dataset following the EDA (Exploratory Data Analysis)
# approach, showing the columns (variables), the dimensions of the dataset in rows and columns, and the
# first five rows. I have also checked whether there are null values, which I
# decided to delete. The reason why I decided to delete them is that it is impossible to substitute them
# with the "mean", given that I'm analyzing text data.

# here import the necessary libraries 
# io.StringIO replaces pandas.compat.StringIO (a Python 2/3 shim removed in pandas 0.25);
# on Python 3 the two were the same class.
from io import StringIO

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.tokenize import RegexpTokenizer

In [2]:
# Load the entire CSV file into memory as one string.
# The raw text is kept in `jobs` because it is reused later for tokenization.
with open('job_skills.csv', 'r') as csv_handle:
    jobs = csv_handle.read()

In [3]:
# Parse the CSV text into a DataFrame, the most convenient structure for
# exploratory data analysis (EDA). The text is wrapped in a file-like buffer
# because read_csv expects a path or a file object rather than raw content.
csv_buffer = StringIO(jobs)
data_jobs = pd.read_csv(csv_buffer)

In [4]:
# Step 1: list the column labels of the DataFrame (heading and labels on separate lines).
print('Step 1 - the columns of the dataset ', data_jobs.columns, sep='\n')

Step 1 - the columns of the dataset 
Index(['Company', 'Title', 'Category', 'Location', 'Responsibilities',
       'Minimum Qualifications', 'Preferred Qualifications'],
      dtype='object')


In [5]:
# Step 2: report the dataset dimensions as a (rows, columns) tuple.
dataset_shape = data_jobs.shape
print('Step 2 - the shape of the data in rows and columns')
print('shape of the datasets = ', dataset_shape)

Step 2 - the shape of the data in rows and columns
shape of the datasets =  (1250, 7)


In [6]:
# Step 3: preview the first five rows as plain text.
first_rows = data_jobs.head(5)
print('Step 3 - the first five rows of the data', first_rows, sep='\n')

Step 3 - the first five rows of the data
  Company                                              Title  \
0  Google                       Google Cloud Program Manager   
1  Google  Supplier Development Engineer (SDE), Cable/Con...   
2  Google  Data Analyst, Product and Tools Operations, Go...   
3  Google            Developer Advocate, Partner Engineering   
4  Google     Program Manager, Audio Visual (AV) Deployments   

                       Category                          Location  \
0            Program Management                         Singapore   
1  Manufacturing & Supply Chain                   Shanghai, China   
2           Technical Solutions       New York, NY, United States   
3           Developer Relations  Mountain View, CA, United States   
4            Program Management      Sunnyvale, CA, United States   

                                    Responsibilities  \
0  Shape, shepherd, ship, and show technical prog...   
1  Drive cross-functional activities in the sup

In [7]:
# Step 4: count the missing entries in each column.
null_counts = data_jobs.isnull().sum()
print('Step 4 - Check if there are null values')
print('There are null variables = ', null_counts)

Step 4 - Check if there are null values
There are null variables =  Company                      0
Title                        0
Category                     0
Location                     0
Responsibilities            15
Minimum Qualifications      14
Preferred Qualifications    14
dtype: int64


In [8]:
# Step 5: drop every row that has a missing value in any column.
print('Step 5 - Delete eventual null values')
rows_before = len(data_jobs)
data_jobs = data_jobs.dropna()
# Measure how many rows were actually removed instead of hard-coding the figure,
# so the message stays correct if the input file changes. Re-running this cell
# on the already-cleaned frame reports 0.
rows_dropped = rows_before - len(data_jobs)
print('Final shape of the dataset after deleting rows with null values ')
print('After deleting', rows_dropped, 'rows containing null values the new dimension of the dataset is as follows : ', data_jobs.shape)


Step 5 - Delete eventual null values
Final shape of the dataset after deleting rows with null values 
After deleting 15 rows containing null values the new dimension of the dataset is as follows :  (1235, 7)


In [9]:
# the process of text analysis goes through the following steps:
# 1 - tokenize the text into single words
# 2 - put all the words in lower case
# 3 - remove stop words such as 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'you', "you're"


In [10]:
# Instantiate a tokenizer that extracts runs of word characters and use it to
# split the text into tokens (words).
# The pattern is a raw string: in a normal string literal "\w" is an invalid
# escape sequence, which newer Python versions warn about. The regex is unchanged.
tokenizer = RegexpTokenizer(r'\w+')
# NOTE(review): `jobs` is the raw CSV text, so the tokens also include the header
# names and the content of rows that were dropped from `data_jobs`.
tokens = tokenizer.tokenize(jobs)


In [11]:
# Normalise case: build the list of lower-cased tokens in a single pass.
words = [token.lower() for token in tokens]


In [14]:
# Load NLTK's English stop-word list (e.g. 'i', 'me', 'my', 'myself', 'we', 'our',
# 'ours', 'ourselves', 'you', "you're") and print the first ten entries.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded
# beforehand on a fresh environment; confirm.
sw = nltk.corpus.stopwords.words('english')
first_ten_stopwords = sw[:10]
print(first_ten_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [15]:
# Build the list of tokens with the stop words removed.
# `sw` is a list, so testing membership against it scans the whole list for every
# token. A set gives the same answer with a constant-time lookup.
stop_set = set(sw)

# Keep every word from `words`, in order, that is not a stop word.
words_ns = [word for word in words if word not in stop_set]

In [16]:
# Sanity check: show the first 20 tokens that survived stop-word removal.
sample_tokens = words_ns[:20]
print(sample_tokens)

['company', 'title', 'category', 'location', 'responsibilities', 'minimum', 'qualifications', 'preferred', 'qualifications', 'google', 'google', 'cloud', 'program', 'manager', 'program', 'management', 'singapore', 'shape', 'shepherd', 'ship']


In [None]:
# Plot the frequency of the 25 most common words.

# Apply the seaborn theme first, so it is already active when the figure is created.
# Previously the figure was created implicitly by plt.subplots_adjust before the
# theme was set.
# (To render figures inline in a notebook, the `%matplotlib inline` magic can be used.)
sns.set()

# Create the figure explicitly and adjust its margins.
fig = plt.figure()
fig.subplots_adjust(left=0.15, bottom=0.25, right=0.9, top=0.8)

# Create the frequency distribution and plot it; the plot is drawn on the current
# pyplot figure, i.e. the one created above.
freqdist1 = nltk.FreqDist(words_ns)
freqdist1.plot(25)