<h1 align='center'> Data Cleaning </h1>

# Import Libraires and Jobs Data

In [1]:
import pandas as pd
import re

In [54]:
#data_analyst_jobs = pd.read_csv('data/data_analyst_jobs_no_repeat.csv')
#data_scientist_jobs = pd.read_csv('data/data_scientist_jobs_no_repeat.csv')
#df = pd.concat([data_analyst_jobs, data_scientist_jobs], ignore_index=True)
df = pd.read_csv('data/data_scientist_jobs_no_repeat.csv')
df

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,"Data Scientist, Insights & Analytics",$101K - $137K (Glassdoor est.),"By clicking the “Apply” button, I understand t...",4.0,Takeda Pharmaceutical\n4.0,"Boston, MA",10000+ Employees,1781,Company - Private,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$10+ billion (USD)
1,"Data Scientist, Human Factors and User Research",$106K - $158K (Glassdoor est.),"By clicking the “Apply” button, I understand t...",4.0,Takeda Pharmaceutical\n4.0,"Lexington, MA",10000+ Employees,1781,Company - Private,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$10+ billion (USD)
2,Remote Senior Data Scientist NLP,$89K - $137K (Glassdoor est.),Company Summary\nJoin a team that puts its Peo...,3.7,First American Financial Corporation\n3.7,"Santa Ana, CA",10000+ Employees,1889,Company - Public,Insurance Carriers,Insurance,$5 to $10 billion (USD)
3,Data Scientist,Employer Provided Salary:$100K - $130K,Data Scientist\nIf you are a Data Scientist wi...,4.1,CyberCoders\n4.1,"Fort Worth, TX",201 to 500 Employees,1999,Subsidiary or Business Segment,Staffing & Subcontracting,Human Resources & Staffing,$100 to $500 million (USD)
4,Sr Data Scientist,-1,"Sr. Data Scientist for Chesterfield, MO to bld...",-1.0,"St. Louis, MO Area Jobs","Saint Louis, MO",-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
975,"Senior Data Scientist, Core Experience",$109K - $169K (Glassdoor est.),"Data, Research & Insights\nData Science\nDeliv...",4.3,Spotify\n4.3,"New York, NY",5001 to 10000 Employees,2006,Company - Public,Internet & Web Services,Information Technology,Unknown / Non-Applicable
976,Data Scientist (REMOTE),$48K - $79K (Glassdoor est.),The core data science team at Dick’s Sporting ...,3.8,DICK'S Sporting Goods\n3.8,"Coraopolis, PA",10000+ Employees,1948,Company - Public,Sporting Goods Stores,Retail & Wholesale,$5 to $10 billion (USD)
977,"Data Scientist, Engineering Analytics",$92K - $122K (Glassdoor est.),"United States, Georgia, Atlanta\nTechOps\n09-A...",4.3,Delta\n4.3,"Atlanta, GA",10000+ Employees,1928,Company - Public,"Airlines, Airports & Air Transportation",Transportation & Logistics,$10+ billion (USD)
978,Data Science Co-op (January - July 2023),Employer Provided Salary:$26.00 - $32.00 Per Hour,Help shape the future of Data Science across L...,3.9,Liberty Mutual Insurance\n3.9,Remote,10000+ Employees,1912,Company - Private,Insurance Carriers,Insurance,$10+ billion (USD)


# Drop Duplicate Rows

In [55]:
df.duplicated().value_counts()

False    971
True       9
dtype: int64

In [56]:
df.drop_duplicates(inplace=True, ignore_index=True)
df.duplicated().value_counts()

False    971
dtype: int64

# Rename Columns

In [57]:
# lowercase and replace space with _
col_name = list(df.columns)
df.columns = [name.lower().replace(' ', '_') for name in col_name]
list(df.columns)

['job_title',
 'salary_estimate',
 'job_description',
 'rating',
 'company_name',
 'location',
 'size',
 'founded',
 'type_of_ownership',
 'industry',
 'sector',
 'revenue']

# Data Cleaning

## ***`job_title`***
- Change the value of *`location`* to *Remote* if it appears in *`job_title`*.
- Create a new column *`seniority`* with 3 values : *NA*, *Low*, and *High*.
- Simplified *`job_title`* to 6 values : *NA*, *Data Analyst*, *Data Scientist*, *Data Engineer*, *ML Engineer*, and *Manager*.

In [58]:
df['job_title'][60:120]

60                  Senior Data Science Analyst - Remote
61                                        Data Scientist
62                                        Data Scientist
63             Data Scientist - Strategic Data Solutions
64                                        Data Scientist
65                          Data Scientist - Telecommute
66      Senior Workforce Planning Analyst/Data Scientist
67                                        Data Scientist
68                     Senior Data Analyst - Telecommute
69               Senior Data Scientist - Payor Relations
70                                   Senior Data Analyst
71              Sr Healthcare Economics Analyst - Remote
72                                         AI Engineer I
73                    Technology Assessment Data Analyst
74                                Co-Op Machine Learning
75                 Statistical Programmer 2, Global BIOS
76                            Sr. Statistical Programmer
77     Associate Director, Data

In [59]:
def remote(row):
    title = row['job_title'].lower()
    if 'remote' in title and 'not remote' not in title:
        row['location'] = 'Remote'
    return row

def seniority(title):
    title = title.lower()
    
    if 'sr' in title or 'senior' in title or 'mid' in title or 'experienced' in title or\
    'lead' in title or 'manager' in title or 'principal' in title or 'director' in title: 
        return 'High'
    elif 'jr' in title or 'junior' in title or 'entry level' in title or 'associate' in title or 'graduate' in title:
        return 'Low'
    else:
        return 'Unknown'
    
    
def title_simplify(title):
    title = title.lower()
    if 'analyst' in title:
        return 'Data Analyst'
    elif 'data scientist' in title or 'data science' in title:
        return 'Data Scientist'
    elif 'Data Engineer' in title:
        return 'data engineer'
    elif 'machine learning' in title or 'deep learning' in title or 'ai' in title or 'ml' in title:
        return 'ML Engineer'
    elif 'lead' in title or 'manager' in title or 'principal' in title or 'director' in title:
        return 'Manager'
    else:
        return 'Others'    

In [60]:
# change the value of location to Remote if it appears in job_title
df = df.apply(remote, axis=1)

# create a new column seniority with 3 values : NA, Low, and High   
df['seniority'] = df['job_title'].apply(seniority)

# simplified job_title to 6 values : NA, Data Analyst, Data Scientist, Data Engineer, ML Engineer, and Manager
df['job_title'] = df['job_title'].apply(title_simplify)

# check results
print(df['job_title'].value_counts())
print('\n')
print(df['seniority'].value_counts())

Data Scientist    572
ML Engineer       166
Data Analyst      124
Others             86
Manager            23
Name: job_title, dtype: int64


Unknown    560
High       361
Low         50
Name: seniority, dtype: int64


## ***`salary_estimate`***
- *`salary_estimate`* is the target value, so remove the record when it is *-1*.
- Create a new column *`employer_provided_salary`*: *1* if the string contains *Employer Provided Salary*, otherwise *0*.
- Create a new column *`hourly_wage`*: *1* if the string contains *Per Hour*, otherwise *0*.
- Clean the string, keeping the format *XXX-XXX* only.
- Create new columns *`salary_min`* and *`salary_max`* from *(XXX)-(XXX)*. Turn into float type.
- Transfer the salary from hourly to yearly by multiplying 2 (unit in K) if *`hourly_wage`* is *1*.
- Create a new column *`salary_avg`* by averaging *`salary_min`* and *`salary_max`*.

In [61]:
df['salary_estimate'][:60]

0               $101K - $137K (Glassdoor est.)
1               $106K - $158K (Glassdoor est.)
2                $89K - $137K (Glassdoor est.)
3       Employer Provided Salary:$100K - $130K
4                                           -1
5                $77K - $118K (Glassdoor est.)
6               $110K - $152K (Glassdoor est.)
7                                           -1
8                                           -1
9                                           -1
10               $95K - $139K (Glassdoor est.)
11                                          -1
12               $69K - $100K (Glassdoor est.)
13               $72K - $104K (Glassdoor est.)
14        Employer Provided Salary:$55K - $57K
15              $108K - $161K (Glassdoor est.)
16                $62K - $90K (Glassdoor est.)
17              $132K - $197K (Glassdoor est.)
18                                          -1
19               $95K - $144K (Glassdoor est.)
20      Employer Provided Salary:$120K - $160K
21      Emplo

In [62]:
# remove the record when salary is -1
df = df[df['salary_estimate'] != '-1']
# create employer_provided_salary: 1 if salary contains 'Employer', otherwise 0.
df['employer_provided_salary'] = df['salary_estimate'].apply(lambda x: 1 if 'Employer' in x else 0)
# create hourly_wage: 1 if salary contains 'Per Hour', otherwise 0.
df['hourly_wage'] = df['salary_estimate'].apply(lambda x: 1 if 'Per Hour' in x else 0)


    # extract formats $___K,  $__.__,  $___K - $___K,  $__.__ - $__.__
pattern = r'(\$[-$\d.K ]*[K\d])'
result = (df['salary_estimate'].str.extract(pattern).squeeze()
    # remove $, K, and space
    .str.replace('[$K ]', '', regex=True)
    # transform XXX -> XXX-XXX
    .apply(lambda x: x if '-' in x else x + '-' + x)
    # create salary_min and salary_max     
    .str.extract(r'(?P<salary_min>.*)-(?P<salary_max>.*)')
    .astype(float))


salary = ['salary_min', 'salary_max']
# transfer the salary from hourly to yearly by multiplying 2 (unit in K) if hourly_wage is 1
result = (pd.concat([result, df[['hourly_wage']]], axis=1)    
    .apply(lambda x: x[salary]*2 if x['hourly_wage'] == 1 else x[salary], axis=1))
df = pd.concat([df, result], axis=1)

# create salary_avg by averaging salary_min and salary_max
df['salary_avg'] = (df['salary_min'] + df['salary_max']) / 2

df.drop(['salary_estimate', 'employer_provided_salary', 'hourly_wage'], axis=1, inplace=True)

# check results
df.loc[:, 'salary_min':'salary_avg'][:60]

Unnamed: 0,salary_min,salary_max,salary_avg
0,101.0,137.0,119.0
1,106.0,158.0,132.0
2,89.0,137.0,113.0
3,100.0,130.0,115.0
5,77.0,118.0,97.5
6,110.0,152.0,131.0
10,95.0,139.0,117.0
12,69.0,100.0,84.5
13,72.0,104.0,88.0
14,55.0,57.0,56.0


## ***`job_description`***
- Create a new column *`description_len`* to count the length of the job description.
- Create *`skill_XX`* columns: *1* if *`job_description`* mentions the specific skill, otherwise *0*. Skills include:
    - Big Data (Spark or Hadoop)
    - Cloud Computing (AWS or Azure)    
    - Excel
    - Machine Learning or Deep Learning
    - R or Python
    - SQL
    - Visualization Tools (Tableau, Power BI)    

In [63]:
# create description_len to count the length of the job description
df['description_len'] = df['job_description'].apply(len) 

# create skill_XX: 1 if job_description mentions the specific skill, otherwise 0
# Spark or Hadoop
df['skill_bigdata'] = df['job_description'].apply(lambda x: 1 if re.search('Spark|Hadoop', x) else 0)
# AWS or Azure
df['skill_cloud'] = df['job_description'].apply(lambda x: 1 if re.search('AWS|Azure', x) else 0)
# Excel
df['skill_excel'] = df['job_description'].apply(lambda x: 1 if re.search('\WExcel\W', x) else 0)
# machine learning or deep learning
df['skill_ml'] = df['job_description'].apply(lambda x: 1 if re.search('machine learning|deep learning', x.lower()) else 0)
# R or Python
df['skill_rpython'] = df['job_description'].apply(lambda x: 1 if re.search('\W(R|Python)\W', x) else 0)
# SQL
df['skill_sql'] = df['job_description'].apply(lambda x: 1 if re.search('SQL', x) else 0)
# Tableau, Power BI
df['skill_viztool'] = df['job_description'].apply(lambda x: 1 if re.search('Tableau|PowerBI|Power BI', x) else 0)

# check results
df.loc[:, 'description_len':'skill_viztool']

Unnamed: 0,description_len,skill_bigdata,skill_cloud,skill_excel,skill_ml,skill_rpython,skill_sql,skill_viztool
0,5392,0,1,0,1,1,1,0
1,4946,0,0,0,1,1,0,1
2,5083,0,1,0,1,1,1,0
3,2165,0,0,0,1,1,1,0
5,5650,1,1,0,1,1,1,0
...,...,...,...,...,...,...,...,...
966,4602,0,0,0,0,1,1,0
967,1961,0,1,0,1,1,1,0
968,2333,0,1,0,1,1,1,0
969,3802,0,0,0,1,1,0,0


## ***`rating`***

In [64]:
df['rating'].value_counts()

 3.8    94
 3.9    85
 4.1    71
 4.0    65
 4.2    55
-1.0    51
 3.7    50
 4.4    47
 4.3    44
 3.6    42
 3.5    27
 3.4    26
 3.3    25
 5.0    20
 4.5    18
 3.1    11
 4.6    11
 4.7     9
 4.8     8
 3.0     6
 4.9     4
 3.2     2
 2.6     2
 2.7     1
 2.8     1
 2.3     1
 2.0     1
Name: rating, dtype: int64

## ***`company_name`***
- Remove the new line and rating

In [65]:
print(df['company_name'][:60])

# remove new line and rating if rating exists
df['company_name'] = df.apply(lambda x: x['company_name'] if x['rating'] < 0 else x['company_name'][:-4], axis=1)

# check results
print('\n')
df['company_name'][:60]

0                            Takeda Pharmaceutical\n4.0
1                            Takeda Pharmaceutical\n4.0
2             First American Financial Corporation\n3.7
3                                      CyberCoders\n4.1
5                              Frontier Psychiatry\n4.8
6                                          Metric5\n5.0
10                       JPMorgan Chase Bank, N.A.\n3.8
12                       JPMorgan Chase Bank, N.A.\n3.8
13                                 Piper Companies\n4.4
14                                        Intact Global
15                         Amazon.com Services LLC\n3.8
16                      Baylor College of Medicine\n3.8
17                         Amazon.com Services LLC\n3.8
19                                           Arete\n3.9
20                                                Sceye
21                         Rays TechServ Pvt. Ltd.\n3.7
22                                  Work from Home\n3.4
23                                  HCA Healthca

0                             Takeda Pharmaceutical
1                             Takeda Pharmaceutical
2              First American Financial Corporation
3                                       CyberCoders
5                               Frontier Psychiatry
6                                           Metric5
10                        JPMorgan Chase Bank, N.A.
12                        JPMorgan Chase Bank, N.A.
13                                  Piper Companies
14                                    Intact Global
15                          Amazon.com Services LLC
16                       Baylor College of Medicine
17                          Amazon.com Services LLC
19                                            Arete
20                                            Sceye
21                          Rays TechServ Pvt. Ltd.
22                                   Work from Home
23                                   HCA Healthcare
25                             Cornerstone Research
26          

## ***`location`***
- Keeps the *Remote* and the 2-digit state abbreviations
- Fix values which are not mentioned above

In [66]:
df['location'].value_counts()

Remote                 131
New York, NY            56
San Francisco, CA       40
Seattle, WA             28
Atlanta, GA             17
                      ... 
Branchburg, NJ           1
Auburn, AL               1
Marysville, MI           1
Charlottesville, VA      1
Chantilly, VA            1
Name: location, Length: 256, dtype: int64

In [67]:
# get the string after ',', remain the same if ',' doesn't exist
df['location'] = df['location'].apply(lambda x: x.split(', ')[1] if x.find(',') != -1 else x)
df['location'].value_counts()

CA               135
Remote           131
NY                61
TX                51
VA                44
MA                41
WA                40
IL                23
GA                21
MD                18
PA                17
MN                17
NJ                17
TN                13
FL                12
MI                12
NC                11
CT                 9
OH                 9
DC                 9
AZ                 8
CO                 6
AL                 6
DE                 6
UT                 5
SC                 4
IN                 4
MO                 4
United States      4
KY                 3
WI                 3
OR                 3
RI                 3
NM                 3
Colorado           2
Massachusetts      2
LA                 2
Florida            2
Manhattan          2
IA                 1
WV                 1
NE                 1
AR                 1
Alaska             1
Texas              1
Pennsylvania       1
OK                 1
MT           

In [68]:
# fix errors
fix = {'Alabama': 'AL',
       'Alaska': 'AK',
       'Colorado': 'CO',
       'Florida': 'FL',
       'Manhattan': 'NY',
       'Massachusetts': 'MA',
       'Ohio': 'OH',
       'Pennsylvania' : 'PA',
       'South Carolina': 'SC',
       'Texas': 'TX',
       'United States': 'Remote',
       'Utah': 'UT',
       'Washington State': 'WA'}

df['location'] = df['location'].replace(fix)

# check results
df['location'].value_counts()

Remote    135
CA        135
NY         63
TX         52
VA         44
MA         43
WA         40
IL         23
GA         21
MD         18
PA         18
NJ         17
MN         17
FL         14
TN         13
MI         12
NC         11
OH          9
DC          9
CT          9
AZ          8
CO          8
AL          6
DE          6
UT          5
IN          4
MO          4
SC          4
OR          3
RI          3
NM          3
KY          3
WI          3
LA          2
IA          1
WV          1
NE          1
AR          1
AK          1
ID          1
OK          1
WY          1
MT          1
KS          1
HI          1
MS          1
Name: location, dtype: int64

## ***`size`***
- *-1* &rarr; *Unknown*

In [69]:
print(df['size'].value_counts())

df['size'] = df['size'].str.replace('-1', 'Unknown')

# check results
print('\n')
df['size'].value_counts()

10000+ Employees           346
1001 to 5000 Employees      97
201 to 500 Employees        64
51 to 200 Employees         62
1 to 50 Employees           58
501 to 1000 Employees       47
5001 to 10000 Employees     40
Unknown                     34
-1                          29
Name: size, dtype: int64




10000+ Employees           346
1001 to 5000 Employees      97
201 to 500 Employees        64
Unknown                     63
51 to 200 Employees         62
1 to 50 Employees           58
501 to 1000 Employees       47
5001 to 10000 Employees     40
Name: size, dtype: int64

## ***`founded`***
- 2022 - founded year

In [70]:
print(df['founded'])

df['age'] = df['founded'].apply(lambda x: 2022-x if x != -1 else x)
df.drop(['founded'], axis=1, inplace=True)

# show results
print('\n')
df['age']

0      1781
1      1781
2      1889
3      1999
5      2019
       ... 
966    2006
967    1948
968    1928
969    1912
970    2016
Name: founded, Length: 777, dtype: int64




0      241
1      241
2      133
3       23
5        3
      ... 
966     16
967     74
968     94
969    110
970      6
Name: age, Length: 777, dtype: int64

## ***`type_of_ownership`***
- *-1* &rarr; *Unknown*

In [71]:
print(df['type_of_ownership'].value_counts())

df['type_of_ownership'] = df['type_of_ownership'].str.replace('-1', 'Unknown')

# check results
print('\n')
df['type_of_ownership'].value_counts()

Company - Public                  355
Company - Private                 272
Subsidiary or Business Segment     32
-1                                 29
Nonprofit Organization             23
Government                         20
College / University               17
Hospital                           12
Self-employed                       5
Unknown                             5
Private Practice / Firm             3
Contract                            3
School / School District            1
Name: type_of_ownership, dtype: int64




Company - Public                  355
Company - Private                 272
Unknown                            34
Subsidiary or Business Segment     32
Nonprofit Organization             23
Government                         20
College / University               17
Hospital                           12
Self-employed                       5
Private Practice / Firm             3
Contract                            3
School / School District            1
Name: type_of_ownership, dtype: int64

## ***`industry`***

In [72]:
df['industry'].value_counts()[:60]

Internet & Web Services                    83
-1                                         79
Health Care Services & Hospitals           62
Information Technology Support Services    57
Computer Hardware Development              46
HR Consulting                              46
Enterprise Software & Network Solutions    35
Business Consulting                        34
Biotech & Pharmaceuticals                  30
Banking & Lending                          27
Staffing & Subcontracting                  26
Colleges & Universities                    17
National Agencies                          17
Insurance Carriers                         17
Aerospace & Defense                        14
General Merchandise & Superstores          14
Energy & Utilities                         14
Investment & Asset Management              12
Telecommunications Services                10
Advertising & Public Relations             10
Home Furniture & Housewares Stores          9
Food & Beverage Manufacturing     

In [73]:
df['industry'] = df['industry'].str.replace('-1', 'Unknown')

# check results
df['industry']

0                    Biotech & Pharmaceuticals
1                    Biotech & Pharmaceuticals
2                           Insurance Carriers
3                    Staffing & Subcontracting
5             Health Care Services & Hospitals
                        ...                   
966                    Internet & Web Services
967                      Sporting Goods Stores
968    Airlines, Airports & Air Transportation
969                         Insurance Carriers
970                    Internet & Web Services
Name: industry, Length: 777, dtype: object

## ***`sector`***

In [74]:
df['sector'].value_counts()

Information Technology                         228
-1                                              79
Human Resources & Staffing                      72
Healthcare                                      62
Financial Services                              50
Management & Consulting                         44
Retail & Wholesale                              35
Pharmaceutical & Biotechnology                  31
Manufacturing                                   30
Insurance                                       22
Media & Communication                           21
Government & Public Administration              21
Education                                       18
Aerospace & Defense                             14
Energy, Mining & Utilities                      14
Telecommunications                              11
Transportation & Logistics                       9
Construction, Repair & Maintenance Services      5
Arts, Entertainment & Recreation                 3
Nonprofit & NGO                

In [75]:
df['sector'] = df['sector'].str.replace('-1', 'Unknown')

# check results
df['sector']

0      Pharmaceutical & Biotechnology
1      Pharmaceutical & Biotechnology
2                           Insurance
3          Human Resources & Staffing
5                          Healthcare
                    ...              
966            Information Technology
967                Retail & Wholesale
968        Transportation & Logistics
969                         Insurance
970            Information Technology
Name: sector, Length: 777, dtype: object

## ***`revenue`***
- *-1* &rarr; *Unknown*

In [76]:
print(df['revenue'].value_counts())

df['revenue'] = df['revenue'].str.replace('Unknown / Non-Applicable', 'Unknown')
df['revenue'] = df['revenue'].str.replace('-1', 'Unknown')

# show results
print('\n')
df['revenue'].value_counts()

$10+ billion (USD)                  240
Unknown / Non-Applicable            204
$1 to $5 billion (USD)               69
$100 to $500 million (USD)           62
$5 to $10 billion (USD)              49
$25 to $100 million (USD)            49
$5 to $25 million (USD)              31
-1                                   29
$500 million to $1 billion (USD)     19
$1 to $5 million (USD)               14
Less than $1 million (USD)           11
Name: revenue, dtype: int64




$10+ billion (USD)                  240
Unknown                             233
$1 to $5 billion (USD)               69
$100 to $500 million (USD)           62
$5 to $10 billion (USD)              49
$25 to $100 million (USD)            49
$5 to $25 million (USD)              31
$500 million to $1 billion (USD)     19
$1 to $5 million (USD)               14
Less than $1 million (USD)           11
Name: revenue, dtype: int64

# Output Cleaned Data

In [77]:
df

Unnamed: 0,job_title,job_description,rating,company_name,location,size,type_of_ownership,industry,sector,revenue,...,salary_avg,description_len,skill_bigdata,skill_cloud,skill_excel,skill_ml,skill_rpython,skill_sql,skill_viztool,age
0,Data Scientist,"By clicking the “Apply” button, I understand t...",4.0,Takeda Pharmaceutical,MA,10000+ Employees,Company - Private,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$10+ billion (USD),...,119.0,5392,0,1,0,1,1,1,0,241
1,Data Scientist,"By clicking the “Apply” button, I understand t...",4.0,Takeda Pharmaceutical,MA,10000+ Employees,Company - Private,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$10+ billion (USD),...,132.0,4946,0,0,0,1,1,0,1,241
2,Data Scientist,Company Summary\nJoin a team that puts its Peo...,3.7,First American Financial Corporation,Remote,10000+ Employees,Company - Public,Insurance Carriers,Insurance,$5 to $10 billion (USD),...,113.0,5083,0,1,0,1,1,1,0,133
3,Data Scientist,Data Scientist\nIf you are a Data Scientist wi...,4.1,CyberCoders,TX,201 to 500 Employees,Subsidiary or Business Segment,Staffing & Subcontracting,Human Resources & Staffing,$100 to $500 million (USD),...,115.0,2165,0,0,0,1,1,1,0,23
5,Data Scientist,Frontier Psychiatry is a telepsychiatry practi...,4.8,Frontier Psychiatry,MT,1 to 50 Employees,Private Practice / Firm,Health Care Services & Hospitals,Healthcare,Unknown,...,97.5,5650,1,1,0,1,1,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
966,Data Scientist,"Data, Research & Insights\nData Science\nDeliv...",4.3,Spotify,NY,5001 to 10000 Employees,Company - Public,Internet & Web Services,Information Technology,Unknown,...,139.0,4602,0,0,0,0,1,1,0,16
967,Data Scientist,The core data science team at Dick’s Sporting ...,3.8,DICK'S Sporting Goods,Remote,10000+ Employees,Company - Public,Sporting Goods Stores,Retail & Wholesale,$5 to $10 billion (USD),...,63.5,1961,0,1,0,1,1,1,0,74
968,Data Scientist,"United States, Georgia, Atlanta\nTechOps\n09-A...",4.3,Delta,GA,10000+ Employees,Company - Public,"Airlines, Airports & Air Transportation",Transportation & Logistics,$10+ billion (USD),...,107.0,2333,0,1,0,1,1,1,0,94
969,Data Scientist,Help shape the future of Data Science across L...,3.9,Liberty Mutual Insurance,Remote,10000+ Employees,Company - Private,Insurance Carriers,Insurance,$10+ billion (USD),...,58.0,3802,0,0,0,1,1,0,0,110


In [78]:
df.to_csv('data/glassdoor_jobs_cleaned.csv', index=False)