In [6]:
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')

# Load the dataset
file_path = '/content/drive/MyDrive/DataSet/DataScientist.csv'
df = pd.read_csv(file_path)

# Step 1: Drop unnecessary columns
df.drop(['Unnamed: 0', 'index'], axis=1, inplace=True)

# Confirm removal
print("Remaining Columns:")
print(df.columns.tolist())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Remaining Columns:
['Job Title', 'Salary Estimate', 'Job Description', 'Rating', 'Company Name', 'Location', 'Headquarters', 'Size', 'Founded', 'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors', 'Easy Apply']


In [7]:
# Step 2: Standardize Key Text Fields

# List of columns to clean
text_cols = ['Job Title', 'Company Name', 'Location', 'Industry', 'Sector', 'Job Description']

# Convert to lowercase and strip whitespace
for col in text_cols:
    df[col] = df[col].astype(str).str.strip().str.lower()

# Preview cleaned values
df[text_cols].head(10)

Unnamed: 0,Job Title,Company Name,Location,Industry,Sector,Job Description
0,senior data scientist,hopper\n3.5,"new york, ny",travel agencies,travel & tourism,"about hopper\n\nat hopper, we’re on a mission ..."
1,"data scientist, product analytics",noom us\n4.5,"new york, ny","health, beauty, & fitness",consumer services,"at noom, we use scientifically proven methods ..."
2,data science manager,decode_m,"new york, ny",-1,-1,decode_m\n\nhttps://www.decode-m.com/\n\ndata ...
3,data analyst,sapphire digital\n3.4,"lyndhurst, nj",internet,information technology,sapphire digital seeks a dynamic and driven mi...
4,"director, data science",united entertainment group\n3.4,"new york, ny",advertising & marketing,business services,"director, data science - (200537)\ndescription..."
5,data scientist,ifg companies\n2.9,"new york, ny",insurance carriers,insurance,job brief\n\nthe ideal candidate will have pre...
6,quantitative researcher,pdt partners\n4.4,"new york, ny",investment banking & asset management,finance,experience: entry-level (phd program) or exper...
7,quantitative research associate,enlightenment research,"new york, ny",-1,-1,seeking a quant to work with senior researcher...
8,ai scientist,paige\n5.0,"new york, ny",enterprise software & network solutions,information technology,paige is a software company helping pathologis...
9,quantitative researcher,jane street\n4.8,"new york, ny",investment banking & asset management,finance,"about the position\n\n\nat jane street, we con..."


In [8]:
import re

# Function to clean text: remove \n, \t, multiple spaces, etc.
def clean_special_chars(text):
    text = re.sub(r'\s+', ' ', text)              # Replace multiple spaces/newlines/tabs with single space
    text = re.sub(r'[^\w\s.,!?%-]', '', text)     # Keep basic punctuation
    return text.strip()

# Apply to selected columns
cols_to_clean = ['Company Name', 'Job Description']

for col in cols_to_clean:
    df[col] = df[col].astype(str).apply(clean_special_chars)

# Preview cleaned company names
df['Company Name'].head(10)


Unnamed: 0,Company Name
0,hopper 3.5
1,noom us 4.5
2,decode_m
3,sapphire digital 3.4
4,united entertainment group 3.4
5,ifg companies 2.9
6,pdt partners 4.4
7,enlightenment research
8,paige 5.0
9,jane street 4.8


In [9]:
# Keep only rows where Job Title contains 'data'
df = df[df['Job Title'].str.contains('data')].reset_index(drop=True)

print("Remaining rows after filtering:", df.shape[0])

Remaining rows after filtering: 2823


In [10]:
# Save the cleaned dataset for Day 3
df.to_csv('/content/drive/MyDrive/DataSet/cleaned_DataScientist.csv', index=False)

## Day 2: Data Cleaning Summary

### Objective:
To clean and prepare the dataset for analysis and visualization by:
- Removing unnecessary or redundant data
- Standardizing text fields
- Cleaning job descriptions
- Saving a processed dataset for reuse

---

### Cleaning Actions Performed:

1. **Dropped Columns**:  
   Removed `Unnamed: 0` and `index` as they were redundant row identifiers.

2. **Standardized Text Fields**:  
   Applied `lowercase` conversion and `whitespace stripping` to:
   - Job Title  
   - Company Name  
   - Location  
   - Industry  
   - Sector  
   - Job Description

3. **Cleaned Special Characters**:  
   Removed `\n`, `\t`, multiple spaces, and unwanted symbols from:
   - `Company Name`  
   - `Job Description`

4. **Optional Filter**:  
   Role-based filter applied with the keyword **"data"**.

5. **Saved Output**:  
   Cleaned dataset saved to:'/content/drive/MyDrive/DataSet/cleaned_DataScientist.csv'

---