# String Variable Cleaning 

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [2]:
# Read the CSV at datasets/train_first_clean.csv into the working DataFrame.
train = pd.read_csv(filepath_or_buffer='datasets/train_first_clean.csv')

In [3]:
# Show the first five rows as loaded, before any text processing.
train.head(n=5)

Unnamed: 0,job_description,job_desig,key_skills,location,min_experience,max_experience,average_salary
0,Exp: Minimum 5 years;Good understanding of IOC...,Senior Exploit and Vulnerability Researcher,"team skills, communication skills, analytical ...",Delhi NCR(Vikas Puri),5,7,11.0
1,He should have handled a team of atleast 5-6 d...,Head SCM,"ppc, logistics, inventory management, supply c...",Sonepat,10,17,17.5
2,Must be an effective communicator (written & s...,Deputy Manager - Talent Management & Leadershi...,"HR Analytics, Employee Engagement, Training, S...",Delhi NCR,5,9,27.5
3,7 - 10 years of overall experience in data e...,Associate Manager Data Engineering,"SQL, Javascript, Automation, Python, Ruby, Ana...",Bengaluru,7,10,17.5
4,Chartered Accountancy degree or MBA in Finance...,TS- GSA- Senior Analyst,"accounting, finance, cash flow, financial plan...",Gurgaon,1,3,6.0


### I won't touch the key_skills column: it contains only keywords, and the symbols in it carry meaning

In [4]:
# Display the key_skills column unchanged for inspection.
train.loc[:, 'key_skills']

0        team skills, communication skills, analytical ...
1        ppc, logistics, inventory management, supply c...
2        HR Analytics, Employee Engagement, Training, S...
3        SQL, Javascript, Automation, Python, Ruby, Ana...
4        accounting, finance, cash flow, financial plan...
                               ...                        
19796    Medical Coding, ICD - 10, US Healthcare, RCM, ...
19797    offline, online, part time, home base, work fr...
19798    SQL Server, VB.NET, C#, .Net, C#.Net, Oracle S...
19799    accounting, internal audit, auditing, risk adv...
19800    IOS, XCode, Apple, Version Control, Perforce, ...
Name: key_skills, Length: 19801, dtype: object

### Same with the job_desig variable 

In [5]:
# Display the job_desig column unchanged for inspection.
train.loc[:, 'job_desig']

0              Senior Exploit and Vulnerability Researcher
1                                                 Head SCM
2        Deputy Manager - Talent Management & Leadershi...
3                       Associate Manager Data Engineering
4                                  TS- GSA- Senior Analyst
                               ...                        
19796                    Director, Medical Coding Training
19797        Looking For Freshers WHO WANT To Work WITH US
19798            PM- C#/ .Net ( Annuity/ Insurance Domain)
19799                          Consultant - Internal Audit
19800        Unity 3D Developer - Mobile Games Development
Name: job_desig, Length: 19801, dtype: object

### I will manipulate the job_description variable with RegexpTokenizer, WordNetLemmatizer, and PorterStemmer

In [6]:
# Tokenizer driven by the regex \w+ (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')

In [7]:
# Lowercase every description, then tokenize it; the resulting token lists
# are stored in a new job_description_tokens column.
train['job_description_tokens'] = train['job_description'].map(
    lambda description: tokenizer.tokenize(description.lower())
)

In [8]:
# Single WordNetLemmatizer instance, reused for every token in the next cell.
lemmatizer = WordNetLemmatizer()

In [9]:
# Replace each token list with its lemmatized counterpart. lemmatize() is
# called with the token only, so its default part-of-speech setting applies.
train['job_description_tokens'] = train['job_description_tokens'].map(
    lambda words: list(map(lemmatizer.lemmatize, words))
)

### PorterStemmer alters the data too aggressively: it truncates words into non-word stems (e.g. "handl", "commun", "degre"), so I will not save its output

In [10]:
# PorterStemmer instance used only for the preview in the next cell.
p_stemmer = PorterStemmer()

In [11]:
# Preview only: stem every token and display the result without assigning
# it back to the DataFrame.
train['job_description_tokens'].map(
    lambda words: [p_stemmer.stem(word) for word in words]
)

0        [exp, minimum, 5, year, good, understand, of, ...
1        [he, should, have, handl, a, team, of, atleast...
2        [must, be, an, effect, commun, written, spoken...
3        [7, 10, year, of, overal, experi, in, data, en...
4        [charter, account, degre, or, mba, in, financ,...
                               ...                        
19796                                        [not_specifi]
19797                                        [not_specifi]
19798    [annuiti, domain, experi, is, mandatori, work,...
19799    [by, plug, into, our, market, lead, global, ne...
19800    [3, 5, year, experi, with, the, end, to, end, ...
Name: job_description_tokens, Length: 19801, dtype: object

### Let's see the data without stopwords 

In [12]:
# Preview only: the filtered token lists are displayed, not stored on `train`.
# The stopword list is converted to a set so each membership check is a
# constant-time hash lookup instead of a linear scan of the list for every token.
eng_stopwords = set(stopwords.words('english'))
train['job_description_tokens'].apply(lambda tokens: [token for token in tokens if token not in eng_stopwords])

0        [exp, minimum, 5, year, good, understanding, i...
1        [handled, team, atleast, 5, 6, direct, reporte...
2        [must, effective, communicator, written, spoke...
3        [7, 10, year, overall, experience, data, engin...
4        [chartered, accountancy, degree, mba, finance,...
                               ...                        
19796                                      [not_specified]
19797                                      [not_specified]
19798    [annuity, domain, experience, mandatory, work,...
19799    [plugging, market, leading, global, network, y...
19800    [3, 5, year, experience, end, end, process, cr...
Name: job_description_tokens, Length: 19801, dtype: object

In [13]:
# Overwrite job_description with its token list joined back into a single
# space-separated string.
train['job_description'] = train['job_description_tokens'].map(' '.join)

In [14]:
# Show the first five rows with the rewritten job_description next to the
# token column it was built from.
train.head(n=5)

Unnamed: 0,job_description,job_desig,key_skills,location,min_experience,max_experience,average_salary,job_description_tokens
0,exp minimum 5 year good understanding of ioc r...,Senior Exploit and Vulnerability Researcher,"team skills, communication skills, analytical ...",Delhi NCR(Vikas Puri),5,7,11.0,"[exp, minimum, 5, year, good, understanding, o..."
1,he should have handled a team of atleast 5 6 d...,Head SCM,"ppc, logistics, inventory management, supply c...",Sonepat,10,17,17.5,"[he, should, have, handled, a, team, of, atlea..."
2,must be an effective communicator written spok...,Deputy Manager - Talent Management & Leadershi...,"HR Analytics, Employee Engagement, Training, S...",Delhi NCR,5,9,27.5,"[must, be, an, effective, communicator, writte..."
3,7 10 year of overall experience in data engine...,Associate Manager Data Engineering,"SQL, Javascript, Automation, Python, Ruby, Ana...",Bengaluru,7,10,17.5,"[7, 10, year, of, overall, experience, in, dat..."
4,chartered accountancy degree or mba in finance...,TS- GSA- Senior Analyst,"accounting, finance, cash flow, financial plan...",Gurgaon,1,3,6.0,"[chartered, accountancy, degree, or, mba, in, ..."


In [15]:
# Remove the helper token column now that job_description holds the joined text.
# The column name is passed as a list (not a set literal), and the result is
# reassigned with errors='ignore' so that re-running this cell is a no-op
# rather than a KeyError on the already-dropped column.
train = train.drop(columns=['job_description_tokens'], errors='ignore')

In [16]:
# Show the first five rows of the frame after the token column was dropped.
train.head(n=5)

Unnamed: 0,job_description,job_desig,key_skills,location,min_experience,max_experience,average_salary
0,exp minimum 5 year good understanding of ioc r...,Senior Exploit and Vulnerability Researcher,"team skills, communication skills, analytical ...",Delhi NCR(Vikas Puri),5,7,11.0
1,he should have handled a team of atleast 5 6 d...,Head SCM,"ppc, logistics, inventory management, supply c...",Sonepat,10,17,17.5
2,must be an effective communicator written spok...,Deputy Manager - Talent Management & Leadershi...,"HR Analytics, Employee Engagement, Training, S...",Delhi NCR,5,9,27.5
3,7 10 year of overall experience in data engine...,Associate Manager Data Engineering,"SQL, Javascript, Automation, Python, Ruby, Ana...",Bengaluru,7,10,17.5
4,chartered accountancy degree or mba in finance...,TS- GSA- Senior Analyst,"accounting, finance, cash flow, financial plan...",Gurgaon,1,3,6.0


In [17]:
# Write the frame to datasets/train_cleaned.csv, leaving the index out of the file.
train.to_csv(path_or_buf='datasets/train_cleaned.csv', index=False)