In [7]:
# Load the raw dataset (adjust the relative path to match your system)
raw_csv_path = "../data/women_in_tech.csv"
df = pd.read_csv(raw_csv_path)

# Sanity check: dimensions, column names, then a preview of the first rows
print("Original dataset shape:", df.shape)
print(list(df.columns))
df.head()

Original dataset shape: (251, 7)
['key', 'company', 'team', 'num_female_eng', 'num_eng', 'percent_female_eng', 'last_updated']


Unnamed: 0,key,company,team,num_female_eng,num_eng,percent_female_eng,last_updated
0,all,ALL,,2979,15967,18.66,3/15/2016
1,wellsfargo,Wells Fargo,,1296,5407,23.97,7/22/2015
2,thoughtworks,ThoughtWorks,,337,1425,23.65,5/19/2014
3,mozilla,Mozilla,,43,500,8.6,10/23/2013
4,athenahealth,athenahealth,,75,473,15.86,2/25/2016


In [8]:
# Columns kept for the analysis, in display order:
#   key                - unique identifier
#   company            - company name
#   team               - team name (optional)
#   num_female_eng     - number of female engineers
#   num_eng            - total number of engineers
#   percent_female_eng - percentage of female engineers
#   last_updated       - when the data was collected
columns = [
    'key', 'company', 'team',
    'num_female_eng', 'num_eng', 'percent_female_eng',
    'last_updated',
]

# Restrict the frame to exactly those columns (raises KeyError if one is absent)
df = df.loc[:, columns]

# Confirm the resulting dimensions and preview the first rows
print("Filtered columns dataset shape:", df.shape)
df.head()

Filtered columns dataset shape: (251, 7)


Unnamed: 0,key,company,team,num_female_eng,num_eng,percent_female_eng,last_updated
0,all,ALL,,2979,15967,18.66,3/15/2016
1,wellsfargo,Wells Fargo,,1296,5407,23.97,7/22/2015
2,thoughtworks,ThoughtWorks,,337,1425,23.65,5/19/2014
3,mozilla,Mozilla,,43,500,8.6,10/23/2013
4,athenahealth,athenahealth,,75,473,15.86,2/25/2016


In [9]:
# Clean the dataset in one chain. Every column conversion goes through
# .assign(), which works on a copy of the frame; assigning columns directly
# onto a boolean-filtered frame (df = df[mask]; df['col'] = ...) triggers
# pandas' SettingWithCopyWarning as soon as the filter actually drops a row.
df = (
    # 1. Remove rows with missing key or company
    df[df['company'].notnull() & df['key'].notnull()]
    # 2. Coerce the count/percentage columns to numeric dtypes. pd.to_numeric
    #    infers int or float per column; unparseable values become NaN.
    .assign(
        num_female_eng=lambda d: pd.to_numeric(d['num_female_eng'], errors='coerce'),
        num_eng=lambda d: pd.to_numeric(d['num_eng'], errors='coerce'),
        percent_female_eng=lambda d: pd.to_numeric(d['percent_female_eng'], errors='coerce'),
    )
    # 3. Drop rows where any numeric column is missing (including values
    #    that failed coercion in the previous step)
    .dropna(subset=['num_female_eng', 'num_eng', 'percent_female_eng'])
    # 4. Parse last_updated as datetime; unparseable dates become NaT and
    #    the row is kept
    .assign(last_updated=lambda d: pd.to_datetime(d['last_updated'], errors='coerce'))
)

# Final shape after cleaning
print("Cleaned dataset shape:", df.shape)
df.head()


Cleaned dataset shape: (251, 7)


Unnamed: 0,key,company,team,num_female_eng,num_eng,percent_female_eng,last_updated
0,all,ALL,,2979,15967,18.66,2016-03-15
1,wellsfargo,Wells Fargo,,1296,5407,23.97,2015-07-22
2,thoughtworks,ThoughtWorks,,337,1425,23.65,2014-05-19
3,mozilla,Mozilla,,43,500,8.6,2013-10-23
4,athenahealth,athenahealth,,75,473,15.86,2016-02-25


In [None]:
# Step 4: Persist the cleaned dataset so later sessions can load it directly
cleaned_csv_path = "../data/women_in_tech_cleaned.csv"

# index=False keeps the row index out of the written file
df.to_csv(cleaned_csv_path, index=False)
print("Cleaned dataset saved to 'women_in_tech_cleaned.csv'")

Cleaned dataset saved to 'women_in_tech_cleaned.csv'
