In [2]:
# Importing all the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [227]:
# Now read the data 
df=pd.read_csv(r'D:\Akshay\Python Projects\Naukri.com Analysis\Datasets\naukri_com-job_sample.csv')

In [231]:
# Lets take a lookat the data columns available
df.columns

Index(['company', 'education', 'experience', 'industry', 'jobdescription',
       'jobid', 'joblocation_address', 'jobtitle', 'numberofpositions',
       'payrate', 'postdate', 'site_name', 'skills', 'uniq_id'],
      dtype='object')

In [232]:
df.shape

(22000, 14)

In [233]:
# Lets see how many records are missing for each column
count_missing=df.isnull().sum()
count_missing

company                    4
education               1996
experience                 4
industry                   5
jobdescription             4
jobid                      0
joblocation_address      501
jobtitle                   0
numberofpositions      17536
payrate                   97
postdate                  23
site_name              18013
skills                   528
uniq_id                    0
dtype: int64

In [236]:
# Now we will identify what percent of records are blank for each column
percent_missing=count_missing*100/len(df)
percent_missing

company                 0.018182
education               9.072727
experience              0.018182
industry                0.022727
jobdescription          0.018182
jobid                   0.000000
joblocation_address     2.277273
jobtitle                0.000000
numberofpositions      79.709091
payrate                 0.440909
postdate                0.104545
site_name              81.877273
skills                  2.400000
uniq_id                 0.000000
dtype: float64

In [237]:
# We can show both number of missing values and percent as part of dataframe

missing_values_df=pd.DataFrame({'count_missing':count_missing,
                           'percent_missing':percent_missing})

In [238]:
missing_values_df.style.background_gradient()

Unnamed: 0,count_missing,percent_missing
company,4,0.018182
education,1996,9.072727
experience,4,0.018182
industry,5,0.022727
jobdescription,4,0.018182
jobid,0,0.0
joblocation_address,501,2.277273
jobtitle,0,0.0
numberofpositions,17536,79.709091
payrate,97,0.440909


 #### Unique Values
 - Now we will try to find the unique values in each column
 - We can do this either by using for loop or by using List Comprehension

In [10]:
for col in df.columns:
    print('{} has {} unique values'.format(col,df[col].nunique()))

company has 8469 unique values
education has 2355 unique values
experience has 147 unique values
industry has 63 unique values
jobdescription has 21063 unique values
jobid has 21910 unique values
joblocation_address has 2329 unique values
jobtitle has 17564 unique values
numberofpositions has 84 unique values
payrate has 1848 unique values
postdate has 4203 unique values
site_name has 1 unique values
skills has 45 unique values
uniq_id has 22000 unique values


In [11]:
unique=[]
for col in df.columns:
    unique.append([col,df[col].nunique(),df[col].unique()])

In [12]:
unique

[['company',
  8469,
  array(['MM Media Pvt Ltd', 'find live infotech',
         'Softtech Career Infosystem Pvt. Ltd', ...,
         'TeamLease Services Limited hiring for BPA Solution Architect',
         'Austere Technology Solutions', 'Musk - Startup'], dtype=object)],
 ['education',
  2355,
  array(['UG: B.Tech/B.E. - Any Specialization PG:Any Postgraduate - Any Specialization, Post Graduation Not Required',
         'UG: B.Tech/B.E. - Any Specialization PG:MBA/PGDM - Any Specialization',
         'UG: Any Graduate - Any Specialization PG:Any Postgraduate Doctorate:Doctorate Not Required',
         ..., 'PG:MS/M.Sc(Science) - Any Specialization, Statistics',
         'UG: B.Tech/B.E. - Any Specialization PG:MCA - Computers, M.Tech - Any Specialization, MS/M.Sc(Science) - Any Specialization Doctorate:Doctorate Not Required',
         'UG: B.Tech/B.E. - Computers PG:MS/M.Sc(Science) - Computers Doctorate:Doctorate Not Required'],
        dtype=object)],
 ['experience',
  147,
  arra

- Finding unique values using List Comprehension

In [13]:
unique_df=[[col,df[col].nunique(),df[col].unique()] for col in df.columns]
unique_df

[['company',
  8469,
  array(['MM Media Pvt Ltd', 'find live infotech',
         'Softtech Career Infosystem Pvt. Ltd', ...,
         'TeamLease Services Limited hiring for BPA Solution Architect',
         'Austere Technology Solutions', 'Musk - Startup'], dtype=object)],
 ['education',
  2355,
  array(['UG: B.Tech/B.E. - Any Specialization PG:Any Postgraduate - Any Specialization, Post Graduation Not Required',
         'UG: B.Tech/B.E. - Any Specialization PG:MBA/PGDM - Any Specialization',
         'UG: Any Graduate - Any Specialization PG:Any Postgraduate Doctorate:Doctorate Not Required',
         ..., 'PG:MS/M.Sc(Science) - Any Specialization, Statistics',
         'UG: B.Tech/B.E. - Any Specialization PG:MCA - Computers, M.Tech - Any Specialization, MS/M.Sc(Science) - Any Specialization Doctorate:Doctorate Not Required',
         'UG: B.Tech/B.E. - Computers PG:MS/M.Sc(Science) - Computers Doctorate:Doctorate Not Required'],
        dtype=object)],
 ['experience',
  147,
  arra

In [14]:
count_df=pd.DataFrame(unique_df, columns=['col_name','count','unique'])
count_df

Unnamed: 0,col_name,count,unique
0,company,8469,"[MM Media Pvt Ltd, find live infotech, Softtec..."
1,education,2355,[UG: B.Tech/B.E. - Any Specialization PG:Any P...
2,experience,147,"[0 - 1 yrs, 0 - 0 yrs, 4 - 8 yrs, 11 - 15 yrs,..."
3,industry,63,"[Media / Entertainment / Internet, Advertising..."
4,jobdescription,21063,[Job Description Send me Jobs like this Qual...
5,jobid,21910,"[210516002263, 210516002391, 101016900534, 810..."
6,joblocation_address,2329,"[Chennai, Bengaluru, Mumbai, Bengaluru, Kolkat..."
7,jobtitle,17564,"[Walkin Data Entry Operator (night Shift), Wor..."
8,numberofpositions,84,"[nan, 60.0, 4.0, 2.0, 20.0, 3.0, 1.0, 700.0, 8..."
9,payrate,1848,"[1,50,000 - 2,25,000 P.A, 1,50,000 - 2,50,000 ..."


In [15]:
count_df.style.background_gradient()

Unnamed: 0,col_name,count,unique
0,company,8469,['MM Media Pvt Ltd' 'find live infotech'  'Softtech Career Infosystem Pvt. Ltd' ...  'TeamLease Services Limited hiring for BPA Solution Architect'  'Austere Technology Solutions' 'Musk - Startup']
1,education,2355,"['UG: B.Tech/B.E. - Any Specialization PG:Any Postgraduate - Any Specialization, Post Graduation Not Required'  'UG: B.Tech/B.E. - Any Specialization PG:MBA/PGDM - Any Specialization'  'UG: Any Graduate - Any Specialization PG:Any Postgraduate Doctorate:Doctorate Not Required'  ... 'PG:MS/M.Sc(Science) - Any Specialization, Statistics'  'UG: B.Tech/B.E. - Any Specialization PG:MCA - Computers, M.Tech - Any Specialization, MS/M.Sc(Science) - Any Specialization Doctorate:Doctorate Not Required'  'UG: B.Tech/B.E. - Computers PG:MS/M.Sc(Science) - Computers Doctorate:Doctorate Not Required']"
2,experience,147,['0 - 1 yrs' '0 - 0 yrs' '4 - 8 yrs' '11 - 15 yrs' '6 - 8 yrs' '2 - 5 yrs'  '1 - 3 yrs' '2 - 7 yrs' '1 - 5 yrs' '2 - 4 yrs' '3 - 8 yrs' '5 - 7 yrs'  '1 - 2 yrs' '5 - 10 yrs' '6 - 10 yrs' '10 - 12 yrs' '4 - 6 yrs'  '1 - 6 yrs' '3 - 6 yrs' '3 - 7 yrs' '4 - 9 yrs' '3 - 5 yrs' '0 - 5 yrs'  '5 - 8 yrs' '9 - 12 yrs' '7 - 12 yrs' '10 - 15 yrs' '4 - 7 yrs'  '8 - 12 yrs' '10 - 16 yrs' '5 - 9 yrs' '0 - 4 yrs' '10 - 20 yrs'  '10 - 18 yrs' '0 - 2 yrs' '7 - 10 yrs' '6 - 11 yrs' '2 - 3 yrs'  '1 - 4 yrs' '7 - 9 yrs' '8 - 13 yrs' '2 - 6 yrs' '8 - 10 yrs' '0 - 3 yrs'  '6 - 9 yrs' '15 - 20 yrs' '12 - 16 yrs' '1 - 1 yrs' '3 - 4 yrs'  '17 - 20 yrs' '10 - 17 yrs' '9 - 14 yrs' '13 - 15 yrs' '17 - 23 yrs'  '15 - 25 yrs' '14 - 22 yrs' '4 - 5 yrs' '7 - 11 yrs' '16 - 24 yrs'  '14 - 18 yrs' '12 - 15 yrs' '7 - 8 yrs' '5 - 6 yrs' '12 - 19 yrs'  '20 - 25 yrs' '10 - 13 yrs' '12 - 18 yrs' '6 - 7 yrs' '10 - 14 yrs'  '12 - 20 yrs' 'Not Mentioned' '11 - 16 yrs' '15 - 17 yrs' '20 - 30 yrs'  '15 - 21 yrs' '8 - 11 yrs' '8 - 9 yrs' '9 - 13 yrs' '15 - 22 yrs'  '14 - 19 yrs' '13 - 19 yrs' '15 - 16 yrs' '14 - 20 yrs' '11 - 12 yrs'  '1 - 8 yrs' '9 - 11 yrs' '15 - 18 yrs' '4 - 10 yrs' '10 - 11 yrs'  '12 - 22 yrs' '7 - 16 yrs' '16 - 18 yrs' '13 - 17 yrs' '13 - 20 yrs'  '5 - 5 yrs' '1 - 10 yrs' '12 - 14 yrs' '5 - 15 yrs' '11 - 17 yrs' nan  '16 - 20 yrs' '13 - 23 yrs' '3 - 3 yrs' '18 - 20 yrs' '23 - 30 yrs'  '14 - 24 yrs' '18 - 28 yrs' '11 - 21 yrs' '15 - 19 yrs' '18 - 22 yrs'  '6 - 12 yrs' '14 - 17 yrs' '12 - 17 yrs' '13 - 18 yrs' '25 - 30 yrs'  '11 - 14 yrs' '2 - 8 yrs' '2 - 10 yrs' '4 - 11 yrs' '11 - 13 yrs'  '14 - 16 yrs' '2 - 2 yrs' '10 - 19 yrs' '14 - 15 yrs' '2 - 9 yrs'  '13 - 16 yrs' '7 - 7 yrs' '9 - 10 yrs' '11 - 20 yrs' '12 - 13 yrs'  '15 - 24 yrs' '3 - -1 yrs' '10 - 10 yrs' '26 - 30 yrs' '7 - 14 yrs'  '11 - 19 yrs' '16 - 25 yrs' '11 - 18 yrs' '18 - 25 yrs' '17 - 25 yrs'  '18 - 21 yrs' '18 - 23 yrs' '20 - 24 yrs' '20 - 23 yrs' '3 - 10 yrs'  '17 - 27 yrs' '16 - 26 yrs' '13 - 22 yrs']
3,industry,63,['Media / Entertainment / Internet'  'Advertising / PR / MR / Event Management'  'IT-Software / Software Services'  'Banking / Financial Services / Broking' 'Aviation / Aerospace Firms'  'Industrial Products / Heavy Machinery' 'FMCG / Foods / Beverage'  'Recruitment / Staffing' 'Internet / Ecommerce'  'Travel / Hotels / Restaurants / Airlines / Railways'  'BPO / Call Centre / ITES' 'Pharma / Biotech / Clinical Research'  'Real Estate / Property' 'Insurance' 'Facility Management' 'Publishing'  'Education / Teaching / Training' 'Retail / Wholesale'  'Automobile / Auto Anciliary / Auto Components'  'Chemicals / PetroChemical / Plastic / Rubber' 'Government / Defence'  'Accounting / Finance' 'Textiles / Garments / Accessories'  'Semiconductors / Electronics' 'Telecom/ISP'  'Medical / Healthcare / Hospitals' 'Legal'  'Courier / Transportation / Freight / Warehousing'  'NGO / Social Services / Regulators / Industry Associations' 'Other'  'Architecture / Interior Design' 'KPO / Research / Analytics'  'Construction / Engineering / Cement / Metals'  'Office Equipment / Automation'  'Consumer Electronics / Appliances / Durables' 'Iron and Steel'  'Strategy / Management Consulting Firms'  'Oil and Gas / Energy / Power / Infrastructure'  'IT-Hardware & Networking' 'Wellness / Fitness / Sports / Beauty'  'Agriculture / Dairy' 'Electricals / Switchgears'  'Security / Law Enforcement' 'Gems / Jewellery'  'Ceramics / Sanitary ware' 'Fresher / Trainee / Entry Level'  'Food Processing' 'Printing / Packaging' 'Brewery / Distillery'  'Telecom/ISP /' 'Export / Import' 'Heat Ventilation / Air Conditioning'  'Wellness / Fitness / Sports' 'Water Treatment / Waste Management' nan  'Shipping / Marine' 'Glass / Glassware' 'Animation / Gaming'  'Fertilizers / Pesticides' 'Pulp and Paper' 'Tyres' 'Leather'  'Broadcasting' 'Medical Devices / Equipments']
4,jobdescription,21063,"['Job Description \xa0 Send me Jobs like this Qualifications: - == > 10th To Graduation & Any Skill: - == > Basic Computer Knowledge Job Requirement : - == > System or Laptop Type of job: - == > Full Time or Part time Languages : - == > Tamil & English. Experience : - == > Freshers & Experience payment details: - 1 form per day 5/- 10 form per day 50/- 100 form per day 500/- monthly you can earn 15000/- per month Selection Process: - == > Easy Selection Process,So What Are You Waiting For? Apply Now & Grab Best Opportunity To Make Your Carrier & To Improve Your Earing Skills. More detail contact Mr Hari 8678902528 9003010282 Salary:INR 1,50,000 - 2,25,000 P.A Industry: Media / Entertainment / Internet Functional Area: ITES , BPO , KPO , LPO , Customer Service , Operations Role Category:Other Role:Fresher Keyskills English Typing Part Time Data Entry Selection Process Desired Candidate Profile Education- UG: B.Tech/B.E. - Any Specialization PG:Any Postgraduate - Any Specialization, Post Graduation Not Required Please refer to the Job description above Company Profile: MM Media Pvt Ltd Mass Media International Find Live Info Pvt Ltd DATA SERVICES is established in the year 2012 with a vision of transforming an ordinary to extra ordinary thing. Find live info is one of our main project, it is a developing search engine site based at Chennai, India. With years of experience, we have crafted massive data collection job opportunities for public. FIND LIVE INFO a blend of experience, imagination, strategy and action to create digital experience that excite, challenge, inspire and engage users. FIND LIVE INFO provides other business opportunities which will be handled by experts of various fields. We have interesting combination of young and experienced Specialists Under the guidance of our Top Management we bloom towards success More detail contect 8678902528 Download PPT Photo 1 \xa0 View Contact Details'  'Job Description \xa0 Send me Jobs like this Qualifications: - == > 10th To Graduation & Any Skill: - == > Basic Computer Knowledge Job Requirement : - == > System or Laptop Type of job: - == > Full Time or Part time Languages : - == > Tamil & English. Experience : - == > Freshers & Experience payment details: - 1 form per day 5/- 10 form per day 50/- 100 form per day 500/- monthly you can earn 15000/- per month Selection Process: - == > Easy Selection Process,So What Are You Waiting For? Apply Now & Grab Best Opportunity To Make Your Carrier & To Improve Your Earing Skills. Chennai Peoples Only Contact Deepika Team Leader 9087728815 / 9087726713 Salary:INR 1,50,000 - 2,50,000 P.A. 20000 Industry: Advertising / PR / MR / Event Management Functional Area: Marketing , Advertising , MR , PR , Media Planning Role Category:Online/Digital Marketing Role:Search Engine Marketing/SEM Specialist Keyskills Data Entry data processing Desired Candidate Profile Education- UG: B.Tech/B.E. - Any Specialization PG:MBA/PGDM - Any Specialization Please refer to the Job description above Company Profile: find live infotech Find Live Infotech is one of our main projects, it is a developing search engine site based at chennai , India. With years of experience , we have crafted massive data collection job oppurtunties for public. Find live infotech provides other business opportunities which will be handled by experts of various fields Download PPT Photo 1 \xa0 View Contact Details'  'Job Description \xa0 Send me Jobs like this - as a developer in providing application design guidance and consultation, utilizing a thorough understanding of applicable technology, tools and existing designs. - Analyzes highly complex business requirements, designs and writes technical specifications to design or redesign complex computer platforms and applications. - Provides coding direction to less experienced staff or develops highly complex original code. - Acts as an expert technical resource for modeling, simulation and analysis efforts. - Verifies program logic by overseeing the preparation of test data, testing and debugging of programs. - Oversees overall systems testing and the migration of platforms and applications to production. - Develops new documentation, departmental technical procedures and user guides. - Leads projects, allocates and manages resources and manages the work of less experienced staff. - Assures quality, security and compliance requirements are met for supported area and oversees creation of or updates to and testing of the business continuation plan.: 7+ years application development and implementation experience. Additional Job Details: - Looking for PL/SQL developer, who has experience loading data using SQL loader. Salary: Not Disclosed by Recruiter Industry: IT-Software / Software Services Functional Area: IT Software - Application Programming , Maintenance Role Category:Programming & Design Role:Software Developer Keyskills SQL Loader PL SQL Development PLSQL Procedures Application Design Test Data Application Development Desired Candidate Profile \xa0 Education- UG: Any Graduate - Any Specialization PG:Any Postgraduate Doctorate:Doctorate Not Required Company Profile: Softtech Career Infosystem Pvt. Ltd Leading client of Softtech Career Infosystem Pvt. Ltd Download PPT Photo 1 \xa0 View Contact Details'  ...  'Job Description \xa0 Send me Jobs like this Work with tech lead to architect and develop a full stack of functionalities, from scratch Design and own product components, from initial technology to scale Write high performing code that deploys to production multiple times a day Make key architectural decisions as well as solve scaling challenges Collaborate with the team and communicate effectively, to solve problems and build, extend, optimize and refactor the back-end architecture Salary: Not Disclosed by Recruiter Industry: Internet / Ecommerce Functional Area: IT Software - Application Programming , Maintenance Role Category:Programming & Design Role:Software Developer Keyskills Design Solutions Python Java Javascript HTML CSS Node.js PostgreSQL jQuery EC2 DynamoDB RDS SNS SQS S3 EMR Desired Candidate Profile Strong understanding and experience in any one of core back-end programming language (Java, C++, Python etc.) Strong understanding of Object-Oriented Programming. Strong understanding of algorithms and data structures. Understanding and experience of working with mid to large scale SQL database systems (MySQL, PostgreSQL, Microsoft-SQL etc). User authentication and authorization between multiple systems, servers, and environments. Familiarity with framework tools: spring, REST services. Knowledge in HTML, CSS, JavaScript, jQuery. Understanding of Node.js and web frameworks for Node.js (expressjs etc.) Understanding of code versioning tools, such as Git. Comfortable working within non-GUI linux shell environments. Familiarity with AWS products - EC2, DynamoDB, RDS, SNS, SQS, S3, EMR etc. The right candidate loves programming, ships code to production quickly and is passionate about building products for web & mobile.Skills: Python, Java, Javascript, HTML, CSS, Node.js, PostgreSQL, AWS/EC2/ELB/S3/DynamoDB Education- UG: B.Tech/B.E. - Any Specialization PG:Any Postgraduate Doctorate:Doctorate Not Required Company Profile: Musk - Startup Musk is an invoice discounting platform that partners with medium and large corporates to finance their SME vendors against approved invoices. The platform manages entire life cycle of loan- from sourcing to servicing. Data-driven intelligent on-boarding, diligence, and process automation renders invoice discounting programs scalable and safe for lending partners. Download PPT Photo 1 \xa0 View Contact Details'  'Job Description \xa0 Send me Jobs like this We are looking for a Senior UI Developers and Technical leads for a leading IT MNC at Delhi/NCR. Role : Senior Developer / Technical Lead / Manager Location: Delhi/ NCR SKILL REQUIREMENTS : - Expertise in HTML5 / CSS, JQuery, Javascript. - Hands on experience in Object Oriented Java script, AngularJS, BackboneJS. - The candidate should be willing to work as an individual contributor. - The candidate will be mainly involved in Design and development of Web/UI content. He should be dynamic and flexible. - Exposure in basic UI tools like HTML/CSS is must. Salary: Not Disclosed by Recruiter Industry: IT-Software / Software Services Functional Area: IT Software - Application Programming , Maintenance Role Category:Programming & Design Role:Team Lead/Technical Lead Keyskills Javascript HTML CSS UI Development JQuery Java Web Technologies Web UI Design Development Desired Candidate Profile EXPERIENCE : 7-13 yrsEducation : Btech / MCA / ME Education- UG: B.Tech/B.E. - Any Specialization PG:MCA - Computers, M.Tech - Any Specialization Doctorate:Doctorate Not Required Company Profile: Confidential Leading client Download PPT Photo 1 \xa0 View Contact Details'  'Job Description \xa0 Send me Jobs like this Job description : Experience of 5-10 years with most of this experience in Wireless (LTE/3G/2G) RF firmware development and will be an individual contributor. He/She should have the ability to Develop, enhance, troubleshoot of RF control firmware Participate in Baseband-RFIC interface bring-up of radio boards Develop manufacturing test firmware for RF boards Creation/customization of calibration and verification routines. Work closely with LTE baseband and RF hardware teams for RF firmware integration and testing Coach and mentor team mates and groom them to reach their goals. Support pre and post sales activities. Specific Knowledge/Skills Strong experience in development, customization of RF driver firmware Strong understanding of integrated RFIC based RF hardware Strong experience with developing and optimizing RF calibration and verification routines Proficient in C. Experience with configuration management and bug tracking systems Excellent communication skills Experience working both independently and in a team-oriented, collaborative environment is essential. Salary:INR 12,00,000 - 22,00,000 P.A Industry: IT-Software / Software Services Functional Area: IT Software - System Programming Role Category:Programming & Design Role:Software Developer Keyskills 2G 3G LTE C Optimization RF Post Sales Communication Skills Bug Tracking Firmware Development Desired Candidate Profile Please refer to the Job description above Education- UG: B.Tech/B.E. - Any Specialization PG:MCA - Computers, M.Tech - Any Specialization Company Profile: Cambio Consulting India Pvt Ltd Product MNC Download PPT Photo 1 \xa0 View Contact Details']"
5,jobid,21910,[210516002263 210516002391 101016900534 ... 251116900644 231116901329  251116003840]
6,joblocation_address,2329,"['Chennai' 'Bengaluru'  'Mumbai, Bengaluru, Kolkata, Chennai, Coimbatore, Tamilnadu' ...  'Mumbai, Bengaluru, Hyderabad, Kolkata'  'Mumbai, Chennai, Delhi, Bengaluru, Hyderabad, Pune'  'Bengaluru, Malaysia']"
7,jobtitle,17564,['Walkin Data Entry Operator (night Shift)'  'Work Based Onhome Based Part Time.' 'Pl/sql Developer - SQL' ...  'Full Stack Development Engineer'  'Sr UI Developer/ Technical Lead - Html/ CSS/ Javascript/ Angularjs'  'RF Firmware Engineers']
8,numberofpositions,84,[ nan 6.00e+01 4.00e+00 2.00e+00 2.00e+01 3.00e+00 1.00e+00 7.00e+02  8.00e+00 5.00e+00 7.00e+00 2.50e+01 1.00e+01 6.50e+02 1.50e+01 5.00e+01  3.00e+01 6.00e+00 4.50e+01 1.30e+01 1.00e+02 2.70e+01 2.20e+01 1.90e+01  1.10e+01 9.00e+00 3.60e+01 9.90e+01 1.80e+01 7.00e+01 4.90e+01 3.50e+01  5.20e+01 1.25e+02 2.00e+02 1.20e+03 1.40e+01 1.20e+01 3.80e+01 5.00e+02  4.00e+01 1.00e+03 4.00e+02 1.50e+02 4.30e+01 7.50e+01 5.50e+01 2.50e+02  3.00e+02 5.70e+01 6.00e+02 8.60e+01 3.40e+01 8.80e+01 8.90e+01 9.50e+01  3.50e+02 3.20e+01 7.40e+01 8.70e+01 2.40e+01 7.80e+01 6.80e+01 6.50e+01  9.80e+01 7.60e+01 9.20e+01 2.05e+02 8.00e+01 5.60e+01 6.40e+01 7.30e+01  3.10e+01 9.60e+01 1.70e+01 4.70e+01 9.00e+01 2.30e+01 5.50e+02 9.00e+02  1.60e+01 2.00e+03 2.90e+01 4.10e+01 1.26e+02]
9,payrate,1848,"['1,50,000 - 2,25,000 P.A' '1,50,000 - 2,50,000 P.A. 20000'  'Not Disclosed by Recruiter' ...  '35,00,000 - 50,00,000 P.A. Compensation: Best in Industry'  '15,00,000 - 30,00,000 P.A. Salary is open' '8,50,000 - 14,00,000 P.A']"


## Now we will do feature engineering on Payrate column to get minimum and maximum numeric values

- Currently PayRate feature has a single text value which consits of both minimum and maximum value
- Our aim is to create 2 numeric columns using this column which would contain miminum value and maximum value

In [268]:
# We will first look into the type of values flowing in the column

df['payrate'].head(20)

0                               1,50,000 - 2,25,000 P.A
1                        1,50,000 - 2,50,000 P.A. 20000
2                            Not Disclosed by Recruiter
3                            Not Disclosed by Recruiter
4                            Not Disclosed by Recruiter
5                            Not Disclosed by Recruiter
6                               3,00,000 - 6,50,000 P.A
7                            Not Disclosed by Recruiter
8                               1,00,000 - 2,00,000 P.A
9                            Not Disclosed by Recruiter
10                           Not Disclosed by Recruiter
11                           Not Disclosed by Recruiter
12                              1,75,000 - 2,50,000 P.A
13                           Not Disclosed by Recruiter
14                           Not Disclosed by Recruiter
15                           Not Disclosed by Recruiter
16    1,00,000 - 2,00,000 P.A. incentives for achiev...
17                              1,00,000 - 5,00,

- Looking at above our first approach would be to split on basis of "-" and get both the values

In [269]:
df['payrate'][0].split('-')

['1,50,000 ', ' 2,25,000 P.A']

In [270]:
df['payrate'][0].split('-')[0]

'1,50,000 '

In [271]:
df['payrate'][0].split('-')[1]

' 2,25,000 P.A'

In [272]:
# If we split the values based on "-" then we see some records with more than 2 partitions
# So, ths=is approach would not work for all the records

len_pay=[]
for pay in df['payrate']:
    len_pay.append(len(str(pay).split('-')))

pd.Series(len_pay).value_counts()

1    17028
2     4682
4      126
3       83
5       54
6       24
7        3
dtype: int64

In [274]:
# Our next approach is to convert the values in a dataframe 

payrate_split=df['payrate'].str.split('-')
payrate_split

0               [1,50,000 ,  2,25,000 P.A]
1        [1,50,000 ,  2,50,000 P.A. 20000]
2             [Not Disclosed by Recruiter]
3             [Not Disclosed by Recruiter]
4             [Not Disclosed by Recruiter]
                       ...                
21995         [Not Disclosed by Recruiter]
21996          [8,50,000 ,  14,00,000 P.A]
21997         [Not Disclosed by Recruiter]
21998         [Not Disclosed by Recruiter]
21999         [12,00,000 ,  22,00,000 P.A]
Name: payrate, Length: 22000, dtype: object

In [275]:
# Using expand we have have each value in a specific columns
# Now we can only use column 0 and 1 and disregard others

payrate_split=df['payrate'].str.split('-', expand=True)
payrate_split

Unnamed: 0,0,1,2,3,4,5,6
0,150000,"2,25,000 P.A",,,,,
1,150000,"2,50,000 P.A. 20000",,,,,
2,Not Disclosed by Recruiter,,,,,,
3,Not Disclosed by Recruiter,,,,,,
4,Not Disclosed by Recruiter,,,,,,
...,...,...,...,...,...,...,...
21995,Not Disclosed by Recruiter,,,,,,
21996,850000,"14,00,000 P.A",,,,,
21997,Not Disclosed by Recruiter,,,,,,
21998,Not Disclosed by Recruiter,,,,,,


#### Now we will clean up the minimum value column of Pay Rate
##### To perform this we need to do 4 basic steps:
- 1. Remove any extra spaces from the value
- 2. Remove commas from the values
- 3. Remove any set of characters available
- 4. Convert object type to float type/ int type

In [276]:
# to remove extra spaces from the values we will use strip() function

payrate_split[0]=payrate_split[0].str.strip()
payrate_split[0]

0                          1,50,000
1                          1,50,000
2        Not Disclosed by Recruiter
3        Not Disclosed by Recruiter
4        Not Disclosed by Recruiter
                    ...            
21995    Not Disclosed by Recruiter
21996                      8,50,000
21997    Not Disclosed by Recruiter
21998    Not Disclosed by Recruiter
21999                     12,00,000
Name: 0, Length: 22000, dtype: object

In [277]:
# Now we will replace all commas using replace() function

payrate_split[0].str.replace(',','')

0                            150000
1                            150000
2        Not Disclosed by Recruiter
3        Not Disclosed by Recruiter
4        Not Disclosed by Recruiter
                    ...            
21995    Not Disclosed by Recruiter
21996                        850000
21997    Not Disclosed by Recruiter
21998    Not Disclosed by Recruiter
21999                       1200000
Name: 0, Length: 22000, dtype: object

In [278]:
# We can also use lambda function to replace all the commas

payrate_split[0]=payrate_split[0].apply(lambda x:str(x).replace(',',''))
payrate_split[0]

0                            150000
1                            150000
2        Not Disclosed by Recruiter
3        Not Disclosed by Recruiter
4        Not Disclosed by Recruiter
                    ...            
21995    Not Disclosed by Recruiter
21996                        850000
21997    Not Disclosed by Recruiter
21998    Not Disclosed by Recruiter
21999                       1200000
Name: 0, Length: 22000, dtype: object

- Now to remove any characters we have in our feature we have 3 options:
 1. We can use exceptional handling to assign any default value to all the character values
 2. We can use to_numeric function 
 3. We can use regular expressions

In [279]:
# We would first try using exceptional handling

pay=[]
for payrate in payrate_split[0]:
    try:
        if type(float(payrate))==np.float:
            pay.append(payrate)
    except:
            pay.append('missing value')

pay

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if type(float(payrate))==np.float:


['150000',
 '150000',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 '300000',
 'missing value',
 '100000',
 'missing value',
 'missing value',
 'missing value',
 '175000',
 'missing value',
 'missing value',
 'missing value',
 '100000',
 '100000',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 '150000',
 'missing value',
 'missing value',
 'missing value',
 '175000',
 '50000',
 '900000',
 'missing value',
 'missing value',
 '500000',
 'missing value',
 'missing value',
 '400000',
 'missing value',
 'missing value',
 'missing value',
 '200000',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 'missing value',
 '400000',
 '250000',
 'missing value',
 '300000',
 'missing value',
 'missing value',
 'missing value',
 '50000',
 'missing va

In [280]:
# Here we are using to_numeric function
# errors='coerce' will set invalid parsing as NaN
# errors='raise' will raise an exception
# errors='ignore' will return the input

pd.to_numeric(payrate_split[0],errors='coerce')

0         150000.0
1         150000.0
2              NaN
3              NaN
4              NaN
           ...    
21995          NaN
21996     850000.0
21997          NaN
21998          NaN
21999    1200000.0
Name: 0, Length: 22000, dtype: float64

In [281]:
# Here we are using regular expression to find the character pattern and replace it

pattern='\D.*'
payrate_split[0].str.replace(pattern,'')

  payrate_split[0].str.replace(pattern,'')


0         150000
1         150000
2               
3               
4               
          ...   
21995           
21996     850000
21997           
21998           
21999    1200000
Name: 0, Length: 22000, dtype: object

- Any Map function can be used to check the data type

In [282]:
any(map(str.isdigit,payrate_split[0]))

True

In [283]:
any(map(str.isnumeric,payrate_split[0]))

True

#### In Previous steps we tried to clean up the minimum value of Pay Rate, now lets clean maximum value of Pay Rate.
##### To perform this we need to do 4 basic steps:
- 1. Remove any extra spaces from the value
- 2. Remove commas from the values
- 3. Remove any set of characters available
- 4. Convert object type to float type/ int type

In [284]:
payrate_split[1][0]

' 2,25,000 P.A'

In [285]:
# Remove extra spaces using strip function

payrate_split[1]=payrate_split[1].str.strip()
payrate_split[1]

0               2,25,000 P.A
1        2,50,000 P.A. 20000
2                       None
3                       None
4                       None
                ...         
21995                   None
21996          14,00,000 P.A
21997                   None
21998                   None
21999          22,00,000 P.A
Name: 1, Length: 22000, dtype: object

In [286]:
# Remove comma from the values using lambda replace function

payrate_split[1]=payrate_split[1].apply(lambda x:str(x).replace(',',''))
payrate_split[1]

0               225000 P.A
1        250000 P.A. 20000
2                     None
3                     None
4                     None
               ...        
21995                 None
21996          1400000 P.A
21997                 None
21998                 None
21999          2200000 P.A
Name: 1, Length: 22000, dtype: object

In [287]:
# Remove set of characters using regular expression function

pattern='\D.*'
payrate_split[1]=payrate_split[1].str.replace(pattern,'')
payrate_split[1]

  payrate_split[1]=payrate_split[1].str.replace(pattern,'')


0         225000
1         250000
2               
3               
4               
          ...   
21995           
21996    1400000
21997           
21998           
21999    2200000
Name: 1, Length: 22000, dtype: object

In [288]:
# We will now check the data type of the minimum and maximum values of payrate

payrate_split.dtypes

0    object
1    object
2    object
3    object
4    object
5    object
6    object
dtype: object

In [289]:
# As we can see 0 and 1 have object data type which we need to convert to numeric data type. 
# We will use to_numeric function of pandas to perform this

payrate_split[0]=pd.to_numeric(payrate_split[0], errors='coerce')
payrate_split[1]=pd.to_numeric(payrate_split[1], errors='coerce')

payrate_split.dtypes

0    float64
1    float64
2     object
3     object
4     object
5     object
6     object
dtype: object

#### We need to now add mimimum and maximum payrate values in our dataframe. We have 2 approaches for this:
#### 1. We can use concat function of pandas to perform this
#### 2. We can create new feauture column and assign payrate_split[0] or [1] into this

##### To be noted that if we have a lot of new features in that case if would be very difficult and time taking to follow approach 2 and hence approach 1 is suggested as we do not want to write steps for all 20-30 new features 

In [290]:
# Lets try to use the 2nd approach first and see the results

df['min_pay']=payrate_split[0]
df['max_pay']=payrate_split[1]

df.head(10)

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,numberofpositions,payrate,postdate,site_name,skills,uniq_id,min_exp,max_exp,avg_exp,min_pay,max_pay
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00 +0000,,ITES,43b19632647068535437c774b6ca6cf8,0.0,1.0,0.5,150000.0,225000.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00 +0000,,Marketing,d4c72325e57f89f364812b5ed5a795f0,0.0,0.0,0.0,150000.0,250000.0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming,c47df6f4cfdf5b46f1fd713ba61b9eba,4.0,8.0,6.0,,
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,Accounts,115d28f140f694dd1cc61c53d03c66ae,11.0,15.0,13.0,,
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming,a12553fc03bc7bcced8b1bb8963f97b4,6.0,8.0,7.0,,
5,PFS Web Global Services Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:MCA - ...,2 - 5 yrs,IT-Software / Software Services,Job Description Send me Jobs like this We ar...,131016005070,Bengaluru,WALK IN - As400 Developer - Pfsweb Global Serv...,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming,8c3af9062ea835b0965779e2c7faac76,2.0,5.0,3.5,,
6,Kinesis Management Consultant Pvt. Ltd,,1 - 3 yrs,IT-Software / Software Services,Job Description Send me Jobs like this exper...,131016004451,"Delhi NCR, Mumbai, Bengaluru, Kochi, Greater N...",PHP Developer,2.0,"3,00,000 - 6,50,000 P.A",2016-10-13 16:20:55 +0000,,IT Software - Application Programming,98f84958cd6409386e7f0c9e447b8510,1.0,3.0,2.0,300000.0,650000.0
7,Agile HR consultancy Pvt. Ltd. hiring for Ross...,"UG: Diploma - Any Specialization, Electrical, ...",2 - 7 yrs,Aviation / Aerospace Firms,Job Description Send me Jobs like this Job D...,121016002995,Bengaluru,Member Technical Staff-wire Harness/cable Harn...,20.0,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,Production,c811c10568fdbfdb3d5e633c99c914b4,2.0,7.0,4.5,,
8,HANSUM INDIA ELECTRONICS PVT.LTD.,"UG: Diploma - Any Specialization, Electronics/...",1 - 3 yrs,Industrial Products / Heavy Machinery,Job Description Send me Jobs like this Indep...,131016002120,Bengaluru,Team Leader,2.0,"1,00,000 - 2,00,000 P.A",2016-10-13 16:20:55 +0000,,Production,a40cb270401af0c246dbe60dcf219262,1.0,3.0,2.0,100000.0,200000.0
9,Accenture,UG: Any Graduate - Any Specialization PG:Any P...,1 - 5 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Overa...,121016901354,Bengaluru,German Translator,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,ITES,7774df1c672c0b92486da8b36a721638,1.0,5.0,3.0,,


In [291]:
# Here we will use concat function to add the columns

pay=pd.concat([payrate_split[0],payrate_split[1]], axis=1, sort=False)
pay.columns=['min_pay','max_pay']
pay

Unnamed: 0,min_pay,max_pay
0,150000.0,225000.0
1,150000.0,250000.0
2,,
3,,
4,,
...,...,...
21995,,
21996,850000.0,1400000.0
21997,,
21998,,


In [292]:
# Let's rename min_pay and max_pay already existing in the dataframe to min_pay_2 and max_pay_2 to indicate created  
# using 2nd approach

df.rename(columns={"min_pay":"min_pay_2", "max_pay":"max_pay_2"}, inplace=True)

In [293]:
df=pd.concat([df,pay],axis=1, sort=False)

In [294]:
df.head(6)

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,numberofpositions,payrate,...,site_name,skills,uniq_id,min_exp,max_exp,avg_exp,min_pay_2,max_pay_2,min_pay,max_pay
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",...,,ITES,43b19632647068535437c774b6ca6cf8,0.0,1.0,0.5,150000.0,225000.0,150000.0,225000.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",...,,Marketing,d4c72325e57f89f364812b5ed5a795f0,0.0,0.0,0.0,150000.0,250000.0,150000.0,250000.0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,...,,IT Software - Application Programming,c47df6f4cfdf5b46f1fd713ba61b9eba,4.0,8.0,6.0,,,,
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,...,,Accounts,115d28f140f694dd1cc61c53d03c66ae,11.0,15.0,13.0,,,,
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,...,,IT Software - Application Programming,a12553fc03bc7bcced8b1bb8963f97b4,6.0,8.0,7.0,,,,
5,PFS Web Global Services Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:MCA - ...,2 - 5 yrs,IT-Software / Software Services,Job Description Send me Jobs like this We ar...,131016005070,Bengaluru,WALK IN - As400 Developer - Pfsweb Global Serv...,,Not Disclosed by Recruiter,...,,IT Software - Application Programming,8c3af9062ea835b0965779e2c7faac76,2.0,5.0,3.5,,,,


## Now we will clean and perform feature engineering on Experience column and get Minimum and Maximum experience

- Currently there is a single column experience with minimum and maximum values seperated by '-'
- We need to fetch minimum and maximum values for each record and store in 2 seperate numeric columns

In [295]:
# Lets first try to split the experience by "-"

df['experience'][0].split('-')

['0 ', ' 1 yrs']

In [296]:
# we will create a function to perform this on all rows

def split_exp(exp):
    min_exp=exp.split('-')[0]
    max_exp=exp.split('-')[1]
    return min_exp, max_exp

In [297]:
# Above we get the error because this columns is not cleaned and there might be values without '-' or with more than 2 values

df['experience'].apply(split_exp)

IndexError: list index out of range

In [298]:
# Lets check for all those cases where the above funcion fails
len1=[]
for exp in df['experience'].dropna():
    if len(exp.split('-'))!=2:
        len1.append(exp)
        
        
len1

['Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Mentioned',
 'Not Ment

- There are 2 approaches to deal with this, first we can delete all these records but this is not an optimal way. 
- Other than that we can have some ways to clean this up:
- 1. Use split function and seperate it out as we did with payrate feature
- 2. Try to enhance your function to handle these kind of exceptions

In [299]:
# we will modify our function to handle the exceptions and return the values in series format which can be easily 
# appended in df dataframe

def split_exp2(exp):
    try:
        if len(exp.split('-'))==2:
            min_exp=exp.split('-')[0]
            max_exp=exp.split('-')[1]
        return pd.Series([min_exp, max_exp])
    except:
        return pd.Series([np.nan, np.nan])

In [300]:
df[['min_exp','max_exp']]=df['experience'].apply(split_exp2).rename(columns={0:'min_exp', 1:'max_exp'})

In [301]:
df.head(6)

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,numberofpositions,payrate,...,site_name,skills,uniq_id,min_exp,max_exp,avg_exp,min_pay_2,max_pay_2,min_pay,max_pay
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",...,,ITES,43b19632647068535437c774b6ca6cf8,0,1 yrs,0.5,150000.0,225000.0,150000.0,225000.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",...,,Marketing,d4c72325e57f89f364812b5ed5a795f0,0,0 yrs,0.0,150000.0,250000.0,150000.0,250000.0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,...,,IT Software - Application Programming,c47df6f4cfdf5b46f1fd713ba61b9eba,4,8 yrs,6.0,,,,
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,...,,Accounts,115d28f140f694dd1cc61c53d03c66ae,11,15 yrs,13.0,,,,
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,...,,IT Software - Application Programming,a12553fc03bc7bcced8b1bb8963f97b4,6,8 yrs,7.0,,,,
5,PFS Web Global Services Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:MCA - ...,2 - 5 yrs,IT-Software / Software Services,Job Description Send me Jobs like this We ar...,131016005070,Bengaluru,WALK IN - As400 Developer - Pfsweb Global Serv...,,Not Disclosed by Recruiter,...,,IT Software - Application Programming,8c3af9062ea835b0965779e2c7faac76,2,5 yrs,3.5,,,,


In [303]:
df['max_exp']=df['max_exp'].str.replace('yrs','')
df.head(5)

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,numberofpositions,payrate,...,site_name,skills,uniq_id,min_exp,max_exp,avg_exp,min_pay_2,max_pay_2,min_pay,max_pay
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",...,,ITES,43b19632647068535437c774b6ca6cf8,0,1,0.5,150000.0,225000.0,150000.0,225000.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",...,,Marketing,d4c72325e57f89f364812b5ed5a795f0,0,0,0.0,150000.0,250000.0,150000.0,250000.0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,...,,IT Software - Application Programming,c47df6f4cfdf5b46f1fd713ba61b9eba,4,8,6.0,,,,
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,...,,Accounts,115d28f140f694dd1cc61c53d03c66ae,11,15,13.0,,,,
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,...,,IT Software - Application Programming,a12553fc03bc7bcced8b1bb8963f97b4,6,8,7.0,,,,


In [304]:
df.dtypes

company                 object
education               object
experience              object
industry                object
jobdescription          object
jobid                    int64
joblocation_address     object
jobtitle                object
numberofpositions      float64
payrate                 object
postdate                object
site_name               object
skills                  object
uniq_id                 object
min_exp                 object
max_exp                 object
avg_exp                float64
min_pay_2              float64
max_pay_2              float64
min_pay                float64
max_pay                float64
dtype: object

In [305]:
# Convert the column types from object to float

df['min_exp']=df['min_exp'].astype(float)
df['max_exp']=df['max_exp'].astype(float)

df.dtypes

company                 object
education               object
experience              object
industry                object
jobdescription          object
jobid                    int64
joblocation_address     object
jobtitle                object
numberofpositions      float64
payrate                 object
postdate                object
site_name               object
skills                  object
uniq_id                 object
min_exp                float64
max_exp                float64
avg_exp                float64
min_pay_2              float64
max_pay_2              float64
min_pay                float64
max_pay                float64
dtype: object

In [306]:
# Let's also calculate average experience and payrate

df['avg_exp']=(df['min_exp'] + df['max_exp']) / 2
df['avg_payrate']=(df['min_pay'] + df['max_pay']) / 2

df.head(6)

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,numberofpositions,payrate,...,skills,uniq_id,min_exp,max_exp,avg_exp,min_pay_2,max_pay_2,min_pay,max_pay,avg_payrate
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",...,ITES,43b19632647068535437c774b6ca6cf8,0.0,1.0,0.5,150000.0,225000.0,150000.0,225000.0,187500.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",...,Marketing,d4c72325e57f89f364812b5ed5a795f0,0.0,0.0,0.0,150000.0,250000.0,150000.0,250000.0,200000.0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,...,IT Software - Application Programming,c47df6f4cfdf5b46f1fd713ba61b9eba,4.0,8.0,6.0,,,,,
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,...,Accounts,115d28f140f694dd1cc61c53d03c66ae,11.0,15.0,13.0,,,,,
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,...,IT Software - Application Programming,a12553fc03bc7bcced8b1bb8963f97b4,6.0,8.0,7.0,,,,,
5,PFS Web Global Services Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:MCA - ...,2 - 5 yrs,IT-Software / Software Services,Job Description Send me Jobs like this We ar...,131016005070,Bengaluru,WALK IN - As400 Developer - Pfsweb Global Serv...,,Not Disclosed by Recruiter,...,IT Software - Application Programming,8c3af9062ea835b0965779e2c7faac76,2.0,5.0,3.5,,,,,


In [307]:
df.dtypes

company                 object
education               object
experience              object
industry                object
jobdescription          object
jobid                    int64
joblocation_address     object
jobtitle                object
numberofpositions      float64
payrate                 object
postdate                object
site_name               object
skills                  object
uniq_id                 object
min_exp                float64
max_exp                float64
avg_exp                float64
min_pay_2              float64
max_pay_2              float64
min_pay                float64
max_pay                float64
avg_payrate            float64
dtype: object

## Now we will clean and perform feature engineering on Postdate column and get day, date, month and year from this

In [308]:
# Firsty we need to convert object type to date time for postdate feature. 
# We can convert to datetime64 or M8, which are both similar

np.dtype('datetime64[ns]')==np.dtype('<M8[ns]')

True

In [309]:
# We will  start with creating a function to fetch day, month and year from postdate

def fetch_dt_att(dataframe,feature):
    try:
        return pd.Series([dataframe[feature].dt.day,dataframe[feature].dt.month,dataframe[feature].dt.year])
    except:
        print('Data type not supported')

In [310]:
# If we call this function on postdate column now, since it is of Object type it would print exception

fetch_dt_att(df,'postdate')

Data type not supported


In [311]:
# So, first we need to convert the column from Object type to datetime format

df['postdate']=pd.to_datetime(df['postdate'])

In [312]:
df['postdate'].dtype

datetime64[ns, UTC]

In [313]:
df[['day','month','year']]=fetch_dt_att(df,'postdate')

df.head(6)

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,numberofpositions,payrate,...,max_exp,avg_exp,min_pay_2,max_pay_2,min_pay,max_pay,avg_payrate,day,month,year
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",...,1.0,0.5,150000.0,225000.0,150000.0,225000.0,187500.0,21.0,5.0,2016.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",...,0.0,0.0,150000.0,250000.0,150000.0,250000.0,200000.0,21.0,5.0,2016.0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,...,8.0,6.0,,,,,,13.0,10.0,2016.0
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,...,15.0,13.0,,,,,,13.0,10.0,2016.0
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,...,8.0,7.0,,,,,,13.0,10.0,2016.0
5,PFS Web Global Services Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:MCA - ...,2 - 5 yrs,IT-Software / Software Services,Job Description Send me Jobs like this We ar...,131016005070,Bengaluru,WALK IN - As400 Developer - Pfsweb Global Serv...,,Not Disclosed by Recruiter,...,5.0,3.5,,,,,,13.0,10.0,2016.0


### One of the other ways is to use map function to perform this

- Syntax is map(function, sequence) where map act as an iterator which performs the function on each row of the sequence

In [314]:
def fetch_dt_att2(x):
    return ([x.day,x.month,x.year])

In [315]:
fet_date=pd.DataFrame(map(fetch_dt_att2,df['postdate'])).rename(columns={0:'day',1:'month',2:'year'})

In [316]:
fet_date

Unnamed: 0,day,month,year
0,21.0,5.0,2016.0
1,21.0,5.0,2016.0
2,13.0,10.0,2016.0
3,13.0,10.0,2016.0
4,13.0,10.0,2016.0
...,...,...,...
21995,25.0,11.0,2016.0
21996,24.0,11.0,2016.0
21997,25.0,11.0,2016.0
21998,27.0,11.0,2016.0


In [317]:
pd.concat([df,fet_date],axis=1)

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,numberofpositions,payrate,...,max_pay_2,min_pay,max_pay,avg_payrate,day,month,year,day.1,month.1,year.1
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",...,225000.0,150000.0,225000.0,187500.0,21.0,5.0,2016.0,21.0,5.0,2016.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",...,250000.0,150000.0,250000.0,200000.0,21.0,5.0,2016.0,21.0,5.0,2016.0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,...,,,,,13.0,10.0,2016.0,13.0,10.0,2016.0
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,...,,,,,13.0,10.0,2016.0,13.0,10.0,2016.0
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,...,,,,,13.0,10.0,2016.0,13.0,10.0,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21995,Morgan Stanley Advantage Services Pvt. Ltd.,UG: Any Graduate - Any Specialization,9 - 13 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this Greet...,241116007152,Bengaluru,Quality Assurance - VP with Morgan Stanley Ban...,2.0,Not Disclosed by Recruiter,...,,,,,25.0,11.0,2016.0,25.0,11.0,2016.0
21996,Careernet Technologies Pvt Ltd hiring for Client,UG: B.Tech/B.E. - Any Specialization PG:M.Tech...,3 - 5 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Looki...,241116001104,"Bengaluru, Gurgaon",Java Backend Developers for a Product Company,,"8,50,000 - 14,00,000 P.A",...,1400000.0,850000.0,1400000.0,1125000.0,24.0,11.0,2016.0,24.0,11.0,2016.0
21997,Musk - Startup,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,2 - 4 yrs,Internet / Ecommerce,Job Description Send me Jobs like this Work ...,251116900644,Bengaluru,Full Stack Development Engineer,,Not Disclosed by Recruiter,...,,,,,25.0,11.0,2016.0,25.0,11.0,2016.0
21998,Confidential,UG: B.Tech/B.E. - Any Specialization PG:MCA - ...,7 - 12 yrs,IT-Software / Software Services,Job Description Send me Jobs like this We ar...,231116901329,"Delhi NCR, Bengaluru",Sr UI Developer/ Technical Lead - Html/ CSS/ J...,,Not Disclosed by Recruiter,...,,,,,27.0,11.0,2016.0,27.0,11.0,2016.0


## Now we will clean and perform feature engineering on Job Location column

In [318]:
df.columns

Index(['company', 'education', 'experience', 'industry', 'jobdescription',
       'jobid', 'joblocation_address', 'jobtitle', 'numberofpositions',
       'payrate', 'postdate', 'site_name', 'skills', 'uniq_id', 'min_exp',
       'max_exp', 'avg_exp', 'min_pay_2', 'max_pay_2', 'min_pay', 'max_pay',
       'avg_payrate', 'day', 'month', 'year'],
      dtype='object')

In [319]:
# Lets first see distinct values from the column Job Location

df['joblocation_address']

0                                                  Chennai
1                                                  Chennai
2                                                Bengaluru
3        Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...
4                                                Bengaluru
                               ...                        
21995                                            Bengaluru
21996                                   Bengaluru, Gurgaon
21997                                            Bengaluru
21998                                 Delhi NCR, Bengaluru
21999                                            Bengaluru
Name: joblocation_address, Length: 22000, dtype: object

In [320]:
# Lets see the value counts for each value

df['joblocation_address'].value_counts()

Bengaluru/Bangalore                                                                          3775
Mumbai                                                                                       2348
Bengaluru                                                                                    1453
Hyderabad / Secunderabad                                                                     1246
Chennai                                                                                      1204
                                                                                             ... 
Delhi/NCR(National Capital Region) , Ernakulam / Kochi/ Cochin , Kanpur , Kolkata , Nasik       1
Gurgaon , Noida , Delhi                                                                         1
Pune , Noida                                                                                    1
Pune , Delhi , Bengaluru/Bangalore                                                              1
Bengaluru, Malaysia 

In [321]:
# Lets try to restrict it for top 10 values only

df['joblocation_address'].value_counts().head(10)

Bengaluru/Bangalore         3775
Mumbai                      2348
Bengaluru                   1453
Hyderabad / Secunderabad    1246
Chennai                     1204
Hyderabad                    936
Noida                        775
Delhi                        763
Gurgaon                      743
Mumbai , Mumbai              514
Name: joblocation_address, dtype: int64

- As we can see there are multiple rows for values like Bangalore or Hyderabad which we need to clean
- To perform the cleaning, first we would create a copy of the data

In [322]:
data=df.copy()

In [323]:
# Lets see distinct values

data['joblocation_address'].unique()

array(['Chennai', 'Bengaluru',
       'Mumbai, Bengaluru, Kolkata, Chennai, Coimbatore, Tamilnadu', ...,
       'Mumbai, Bengaluru, Hyderabad, Kolkata',
       'Mumbai, Chennai, Delhi, Bengaluru, Hyderabad, Pune',
       'Bengaluru, Malaysia'], dtype=object)

In [324]:
# Now we need to replace the values like "Bengaluru/Bangalore" with "Bengaluru" to make the data consistent
# For this we need to have dictionary which can be used where key would be old value and value would be new value
# We already have this combincation in a CSV file which we would upload
# To convert into dictionary we would make first column as index

rep=pd.read_csv(r'D:\Akshay\Python Projects\Naukri.com Analysis\Datasets/replacements.csv').set_index('Unnamed: 0')
rep

Unnamed: 0_level_0,joblocation_address
Unnamed: 0,Unnamed: 1_level_1
Delhi/NCR(National Capital Region),Delhi
Delhi/NCR(National Capital Region),Delhi
(Bengaluru/Bangalore),Bangalore
Bangalore,Bangalore
"Bangalore , Bangalore / Bangalore",Bangalore
Bangalore Bangalore,Bangalore
"Bangalore,Bangalore / Bangalore",Bangalore
"Bangalore,karnataka",Bangalore
Bengaluru,Bangalore
Bengaluru/Bangalore,Bangalore


In [325]:
replacement_dict=rep.to_dict()
replacement_dict

{'joblocation_address': {' Delhi/NCR(National Capital Region)': 'Delhi',
  ' Delhi/NCR(National Capital Region) ': 'Delhi',
  '(Bengaluru/Bangalore)': 'Bangalore',
  'Bangalore': 'Bangalore',
  'Bangalore , Bangalore / Bangalore': 'Bangalore',
  'Bangalore Bangalore': 'Bangalore',
  'Bangalore,Bangalore / Bangalore': 'Bangalore',
  'Bangalore,karnataka': 'Bangalore',
  'Bengaluru': 'Bangalore',
  'Bengaluru/Bangalore ': 'Bangalore',
  'Bengaluru/Bangalore , bangalore': 'Bangalore',
  'DELHI(NATIONAL CAPITAL REGION)': 'Delhi',
  'Delhi': 'Delhi',
  'Delhi , Delhi': 'Delhi',
  'Delhi , Delhi/Greater Delhi': 'Delhi',
  'Delhi , Delhi/NCR(National Capital Region)': 'Delhi',
  'Delhi NCR': 'Delhi',
  'Delhi(National Capital Region)': 'Delhi',
  'Delhi(National Capital Region) , Delhi': 'Delhi',
  'Delhi,Delhi': 'Delhi',
  'Delhi/NCR': 'Delhi',
  'Delhi/NCR(National Capital Region)': 'Delhi',
  'Delhi/NCR(National Capital Region) ': 'Delhi',
  'Delhi/NCR(National Capital Region) , Delhi': 'D

In [326]:
# Now we would just replace the values as per the dictionary using replace function

data.replace(replacement_dict, inplace=True, regex=True)

In [327]:
data['joblocation_address'].value_counts()

Bangalore                                                                                           5228
Mumbai                                                                                              2887
Delhi                                                                                               2741
Hyderabad                                                                                           2182
Chennai                                                                                             1204
                                                                                                    ... 
Delhi , Delhi , Faridabad                                                                              1
Bangalore , Delhi(National Capital Region) , Delhi , Ahmedabad , Delhi                                 1
Ernakulam / Kochi/ Cochin , Trivandrum , Jaipur , Delhi , Delhi(National Capital Region) , Delhi       1
Mathura , Delhi , Delhi                                

In [328]:
# We can also filter out dataset for a particular location

filter=df['joblocation_address']=='Chennai'
loc_chennai=df[filter]
loc_chennai

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,numberofpositions,payrate,...,max_exp,avg_exp,min_pay_2,max_pay_2,min_pay,max_pay,avg_payrate,day,month,year
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",...,1.0,0.5,150000.0,225000.0,150000.0,225000.0,187500.0,21.0,5.0,2016.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",...,0.0,0.0,150000.0,250000.0,150000.0,250000.0,200000.0,21.0,5.0,2016.0
3621,Cognizant Technology Solutions India Ltd,UG: B.Tech/B.E. PG:Post Graduation Not Require...,3 - 5 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Techn...,130516501929,Chennai,Sr. Associate - Projects,,Not Disclosed by Recruiter,...,5.0,4.0,,,,,,13.0,5.0,2016.0
3622,Cognizant Technology Solutions India Ltd,UG: B.Tech/B.E. PG:Post Graduation Not Require...,2 - 5 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Under...,130516501917,Chennai,Technology Specialist,,Not Disclosed by Recruiter,...,5.0,3.5,,,,,,13.0,5.0,2016.0
3623,Satincorp Technologies Inc,UG: Any Graduate PG:MBA/PGDM Doctorate:Any Doc...,5 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this The c...,130516503864,Chennai,marketing manager,,Not Disclosed by Recruiter,...,8.0,6.5,,,,,,13.0,5.0,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18194,Talent Maximus,UG: B.Tech/B.E. - Any Specialization PG:MCA - ...,1 - 3 yrs,Other,Job Description Send me Jobs like this 1. Co...,250316001401,Chennai,Software Engineer,,Not Disclosed by Recruiter,...,3.0,2.0,,,,,,20.0,5.0,2016.0
18195,Ken QA Services,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Skill...,200516900845,Chennai,Test Automation Engineer - Selenium,4.0,Not Disclosed by Recruiter,...,8.0,6.0,,,,,,20.0,5.0,2016.0
18196,Saaki Argus And Averil Consulting,"UG: Any Graduate - Any Specialization, Graduat...",3 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Job D...,200516003885,Chennai,ETL Automation Testing,,"4,00,000 - 9,00,000 P.A",...,8.0,5.5,400000.0,900000.0,400000.0,900000.0,650000.0,20.0,5.0,2016.0
18197,Financial Software Systems Pvt. Ltd.,"UG: Any Graduate - Any Specialization, Any Spe...",2 - 6 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Issui...,310316004720,Chennai,Software Testing Engineer,6.0,Not Disclosed by Recruiter,...,6.0,4.0,,,,,,20.0,5.0,2016.0


In [329]:
# Also we can compare the values before and after the replacement 

df['joblocation_address'].nunique()

2329

In [330]:
data['joblocation_address'].nunique()

2141

## Now lets prepare our final data by removing redundant columns 

In [331]:
data.columns

Index(['company', 'education', 'experience', 'industry', 'jobdescription',
       'jobid', 'joblocation_address', 'jobtitle', 'numberofpositions',
       'payrate', 'postdate', 'site_name', 'skills', 'uniq_id', 'min_exp',
       'max_exp', 'avg_exp', 'min_pay_2', 'max_pay_2', 'min_pay', 'max_pay',
       'avg_payrate', 'day', 'month', 'year'],
      dtype='object')

In [332]:
# Lets create a function to drop columns

def drop_feature(column):
    data.drop(column,axis=1,inplace=True)

In [333]:
drop_feature('payrate')

In [334]:
drop_feature('experience')

In [335]:
drop_feature('postdate')
drop_feature('uniq_id')
drop_feature('min_pay_2')
drop_feature('max_pay_2')

data.columns

Index(['company', 'education', 'industry', 'jobdescription', 'jobid',
       'joblocation_address', 'jobtitle', 'numberofpositions', 'site_name',
       'skills', 'min_exp', 'max_exp', 'avg_exp', 'min_pay', 'max_pay',
       'avg_payrate', 'day', 'month', 'year'],
      dtype='object')

In [336]:
data.head(6)

Unnamed: 0,company,education,industry,jobdescription,jobid,joblocation_address,jobtitle,numberofpositions,site_name,skills,min_exp,max_exp,avg_exp,min_pay,max_pay,avg_payrate,day,month,year
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),,,ITES,0.0,1.0,0.5,150000.0,225000.0,187500.0,21.0,5.0,2016.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,60.0,,Marketing,0.0,0.0,0.0,150000.0,250000.0,200000.0,21.0,5.0,2016.0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bangalore,Pl/sql Developer - SQL,,,IT Software - Application Programming,4.0,8.0,6.0,,,,13.0,10.0,2016.0
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bangalore, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,,Accounts,11.0,15.0,13.0,,,,13.0,10.0,2016.0
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bangalore,JAVA Technical Lead (6-8 yrs) -,4.0,,IT Software - Application Programming,6.0,8.0,7.0,,,,13.0,10.0,2016.0
5,PFS Web Global Services Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:MCA - ...,IT-Software / Software Services,Job Description Send me Jobs like this We ar...,131016005070,Bangalore,WALK IN - As400 Developer - Pfsweb Global Serv...,,,IT Software - Application Programming,2.0,5.0,3.5,,,,13.0,10.0,2016.0


In [337]:
# Now lets save this data in form of CSV file

data.to_csv(r'D:\Akshay\Python Projects\Naukri.com Analysis\Datasets/naukri_data.csv', index=False)