In [11]:
# import important packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas_profiling

from matplotlib import rcParams
import warnings 

# Session-wide setup: silence every warning category for the remainder of the run.
warnings.filterwarnings("ignore")

# Seed NumPy's global (legacy) random number generator.
np.random.seed(42)

# Default matplotlib figure size in inches: 10 wide by 6 tall.
rcParams["figure.figsize"] = (10, 6)


In [5]:
# Read ./eda_data.csv (path is relative to the current working directory) into data1.
# NOTE(review): the recorded data1.columns output lists an 'Unnamed: 0' column, which
# looks like an index written out with the CSV -- confirm whether later cells rely on
# it before switching to index_col=0.
data1 = pd.read_csv(r"./eda_data.csv")

In [6]:
# Show five randomly selected rows of data1 as the cell's output.
data1.sample(n=5)

Unnamed: 0.1,Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,...,age,python_yn,R_yn,spark,aws,excel,job_simp,seniority,desc_len,num_comp
120,120,Data Scientist,$77K-$124K (Glassdoor est.),Company Description:\nWhats it like to work at...,4.4,Quick Base\n4.4,"Cambridge, MA","Cambridge, MA",201 to 500 employees,1999,...,21,1,0,0,0,1,data scientist,na,4188,0
196,196,Marketing Data Analyst,$35K-$62K (Glassdoor est.),Under direct supervision of the Director of Da...,3.6,San Manuel Casino\n3.6,"Highland, CA","Highland, CA",1001 to 5000 employees,1986,...,34,0,0,0,0,1,analyst,na,4608,0
336,336,Senior Scientist (Neuroscience),$109K-$200K (Glassdoor est.),Sunovion Pharmaceuticals is looking for a Seni...,3.5,Sunovion\n3.5,"Marlborough, MA","Marlborough, MA",1001 to 5000 employees,2010,...,10,1,0,0,0,1,na,senior,4517,3
582,582,Senior LiDAR Data Scientist,$93K-$151K (Glassdoor est.),Overview\n\n\nLuminar is an autonomous vehicle...,3.9,Luminar Technologies\n3.9,"Orlando, FL","Orlando, FL",201 to 500 employees,2012,...,8,1,0,0,1,0,data scientist,senior,5647,0
395,395,Lead Big Data Engineer,$121K-$203K (Glassdoor est.),Our mission is to help people everywhere find ...,4.0,Glassdoor\n4.0,"San Francisco, CA","Mill Valley, CA",1001 to 5000 employees,2007,...,13,1,0,1,1,1,data engineer,senior,3978,2


In [7]:
# Read ./glassdoor_jobs.csv (path is relative to the current working directory) into data2.
# NOTE(review): the recorded data2.columns output lists an 'Unnamed: 0' column, which
# looks like an index written out with the CSV -- confirm whether later cells rely on
# it before switching to index_col=0.
data2 = pd.read_csv(r"./glassdoor_jobs.csv")

In [8]:
# Read ./salary_data_cleaned.csv (path is relative to the current working directory) into data3.
data3 = pd.read_csv(r"./salary_data_cleaned.csv")

In [9]:
# Show five randomly selected rows of data2 as the cell's output.
data2.sample(n=5)

Unnamed: 0.1,Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
207,207,Senior Data Scientist,$73K-$124K (Glassdoor est.),The Senior Data Scientist independently develo...,3.3,Plymouth Rock Assurance\n3.3,"Woodbridge, NJ","Boston, MA",1001 to 5000 employees,1982,Company - Private,Insurance Carriers,Insurance,$10 to $25 million (USD),"Arbella Insurance, Safety Insurance"
350,350,Data Scientist,$76K-$126K (Glassdoor est.),Roles and responsibilities include the followi...,2.3,Vanda Pharmaceuticals\n2.3,"Washington, DC","Washington, DC",201 to 500 employees,2003,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,$100 to $500 million (USD),-1
205,205,Senior Data Analyst,$55K-$100K (Glassdoor est.),As a precaution and in compliance with applica...,2.8,Dodge Data & Analytics\n2.8,"Hamilton, NJ","Hamilton, NJ",201 to 500 employees,2014,Company - Private,IT Services,Information Technology,Unknown / Non-Applicable,-1
49,49,Lead Data Scientist,$158K-$211K (Glassdoor est.),As the world's leader in digital payments tech...,3.7,Visa Inc.\n3.7,"Bellevue, WA","Foster City, CA",10000+ employees,1958,Company - Public,IT Services,Information Technology,$10+ billion (USD),"American Express, Mastercard, Discover"
188,188,Staff Data Scientist,$132K-$211K (Glassdoor est.),Western\nDigital\nThe next big thing in data i...,3.5,Western Digital\n3.5,"San Jose, CA","San Jose, CA",10000+ employees,1970,Company - Public,Computer Hardware & Software,Information Technology,$10+ billion (USD),"Seagate Technology, Toshiba"


In [10]:
# Show five randomly selected rows of data3 as the cell's output.
data3.sample(n=5)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,avg_salary,company_txt,job_state,same_state,age,python_yn,R_yn,spark,aws,excel
384,Sr Data Engineer (Sr BI Developer),$90K-$110K(Employer est.),Description/Responsibilities\n\n\nResponsible ...,3.4,Tivity Health\n3.4,"Chandler, AZ","Franklin, TN",501 to 1000 employees,1981,Company - Public,...,100.0,Tivity Health\n,AZ,0,39,0,0,0,0,0
725,Senior Scientist - Regulatory Submissions,$80K-$155K (Glassdoor est.),British American Tobacco\nReynolds American In...,3.1,Reynolds American\n3.1,"Winston-Salem, NC","Winston-Salem, NC",5001 to 10000 employees,1875,Company - Private,...,117.5,Reynolds American\n,NC,1,145,0,0,0,0,1
61,Data Scientist,$84K-$146K (Glassdoor est.),USEReady is looking for 2-3 Data Scientist to ...,4.3,USEReady\n4.3,"New York, NY","New York, NY",201 to 500 employees,2011,Company - Private,...,115.0,USEReady\n,NY,1,9,1,0,1,1,0
411,RESEARCH COMPUTER SCIENTIST - RESEARCH ENGINEE...,$52K-$91K (Glassdoor est.),Serve as a software developer and researcher o...,3.9,Southwest Research Institute\n3.9,"San Antonio, TX","San Antonio, TX",1001 to 5000 employees,1947,Nonprofit Organization,...,71.5,Southwest Research Institute\n,TX,1,73,1,0,0,1,1
483,Data Scientist,$56K-$95K (Glassdoor est.),"ExecOnline is a fast-growing, venture-backed e...",4.2,ExecOnline\n4.2,"New York, NY","New York, NY",51 to 200 employees,2012,Company - Private,...,75.5,ExecOnline\n,NY,1,8,1,0,0,1,0


In [12]:
# Display the column labels of data1.
data1.columns

Index(['Unnamed: 0', 'Job Title', 'Salary Estimate', 'Job Description',
       'Rating', 'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'hourly', 'employer_provided', 'min_salary', 'max_salary', 'avg_salary',
       'company_txt', 'job_state', 'same_state', 'age', 'python_yn', 'R_yn',
       'spark', 'aws', 'excel', 'job_simp', 'seniority', 'desc_len',
       'num_comp'],
      dtype='object')

In [13]:
# Display the column labels of data2.
data2.columns

Index(['Unnamed: 0', 'Job Title', 'Salary Estimate', 'Job Description',
       'Rating', 'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors'],
      dtype='object')

In [14]:
# Display the column labels of data3.
data3.columns

Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors',
       'hourly', 'employer_provided', 'min_salary', 'max_salary', 'avg_salary',
       'company_txt', 'job_state', 'same_state', 'age', 'python_yn', 'R_yn',
       'spark', 'aws', 'excel'],
      dtype='object')