In [1]:
# Import needed libraries and tools
import pandas as pd

In [2]:
# Load the cleaned CSV into the working DataFrame
apps = pd.read_csv(filepath_or_buffer="Data_CSV/Clean_Data.csv")

In [3]:
# Show the distinct raw values of the experience feature
pd.unique(apps['experience'])

array(['2', '15', '9', '10', '13', '>20', '11', '5', '7', '8', '4', '3',
       '6', '14', '12', '16', '19', '17', '1', '18', '20', '<1', '0'],
      dtype=object)

In [4]:
# Two labels are open-ended and cannot be cast directly:
#   '>20' is recorded as 21 and '<1' is recorded as 0.
# Every other value is cast to an integer unchanged.
experience_labels = {'>20': '21', '<1': '0'}
apps['experience'] = apps['experience'].replace(experience_labels).astype('int64')

<span style="font-size: 20px; color: lightgreen;">Notes: Convert the experience feature to numeric ('>20' becomes 21 and '<1' becomes 0).</span>

In [5]:
# Keep only the rows whose target equals 1. After that the target column
# holds a single value, so it is dropped to save memory.
is_target = apps['target'].eq(1)
target_filter = apps.loc[is_target].drop(columns='target')

<span style="font-size: 20px; color: lightgreen;">Notes: Create a new data frame for target enrollees.</span>

In [6]:
# Keep the target enrollees with strictly more than 10 years of experience
has_over_10_years = target_filter['experience'].gt(10)
exp10_filter = target_filter.loc[has_over_10_years]

<span style="font-size: 20px; color: lightgreen;">Notes: Create a new data frame from the target data frame, keeping only enrollees with more than 10 years of experience.</span>

In [7]:
# Shorten the relevent_experience labels to 'Yes' / 'No'.
# exp10_filter was produced by boolean filtering, so writing into it through
# `.loc[:, col] = ...` is the pattern that raises pandas' SettingWithCopyWarning.
# `assign` returns a new frame with the column replaced (column order is kept),
# which avoids writing into a possible copy.
exp10_filter = exp10_filter.assign(
    relevent_experience=exp10_filter['relevent_experience'].replace({
        'No relevent experience': 'No',
        'Has relevent experience': 'Yes',
    })
)

<span style="font-size: 20px; color: lightgreen;">Notes: Shorten the relevant experience values to 'Yes' or 'No'.</span>

In [8]:
# Keep only the rows whose relevent_experience is "Yes"
has_relevant_experience = exp10_filter['relevent_experience'].eq("Yes")
exp10_filter = exp10_filter.loc[has_relevant_experience]

<span style="font-size: 20px; color: lightgreen;">Notes: Of the enrollees with more than 10 years of experience, keep only those with relevant experience.</span>

In [9]:
# Every remaining row has relevent_experience == "Yes", so the column is dropped
exp10_filter = exp10_filter.drop('relevent_experience', axis='columns')

In [10]:
# Flag rows whose enrollee_id already appeared in an earlier row, then show them
duplicated_id = exp10_filter['enrollee_id'].duplicated()
exp10_filter.loc[duplicated_id]

Unnamed: 0,enrollee_id,city,city_development_index,gender,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours


<span style="font-size: 20px; color: lightgreen;">Notes: No Duplicates.</span>

In [11]:
# Summarise the company_size column (count, unique, top, freq)
company_size_summary = exp10_filter.loc[:, 'company_size'].describe()
print(company_size_summary)

count     1023
unique       8
top          0
freq       508
Name: company_size, dtype: object


In [12]:
# Collect the distinct values of enrolled_university and education_level
unique_enrollments, unique_education = (
    exp10_filter[column].unique()
    for column in ('enrolled_university', 'education_level')
)

print("Unique Enrollments:", unique_enrollments)
print("Unique Education Levels:", unique_education)


Unique Enrollments: ['no_enrollment' 'Part time course' 'Full time course']
Unique Education Levels: ['Graduate' 'Phd' 'Masters' 'Not provided' 'High School' 'Primary School']


In [13]:
# Rename the 'Graduate' education level to 'Bachelor'; other levels are untouched
education_renames = {'Graduate': 'Bachelor'}
exp10_filter['education_level'] = exp10_filter['education_level'].replace(education_renames)

<span style="font-size: 20px; color: lightgreen;">Notes: Replacing "Graduate" with "Bachelor".</span>

In [14]:
# Keep only the rows with enrolled_university == 'no_enrollment'. The column
# then holds a single value, so it is dropped.
not_enrolled = exp10_filter['enrolled_university'].eq('no_enrollment')
exp10_filter = exp10_filter.loc[not_enrolled].drop('enrolled_university', axis='columns')

In [15]:
def _last_new_job_to_int(value):
    """Convert one last_new_job entry to an integer.

    'never' becomes 0 and '>4' becomes 5; any other value is passed to int().
    """
    if value == 'never':
        return 0
    if value == '>4':
        return 5
    return int(value)


# Convert the last_new_job feature to integers, one value at a time
exp10_filter['last_new_job'] = exp10_filter['last_new_job'].map(_last_new_job_to_int)

<span style="font-size: 20px; color: lightgreen;">Notes: Convert last new job feature to numeric.</span>

In [16]:
# Remove the rows whose education_level is 'High School'
not_high_school = exp10_filter['education_level'].ne('High School')
exp10_filter = exp10_filter.loc[not_high_school]

<span style="font-size: 20px; color: lightgreen;">Notes: Exclude enrollees whose education level is 'High School'.</span>

In [17]:
# One mask splits the frame on company_size == '0'
has_zero_size = exp10_filter['company_size'].eq('0')

# Rows with company_size '0', held in their own DataFrame
zero_size_rows = exp10_filter.loc[has_zero_size]
df_with_zero_size = pd.DataFrame(zero_size_rows)

# Write those rows to a separate CSV file, without the index
df_with_zero_size.to_csv('Data_CSV/zero_companies.csv', index=False)

# All other rows stay in the working frame
exp10_filter = exp10_filter.loc[~has_zero_size]


<span style="font-size: 20px; color: lightgreen;">Notes: Save the enrollees whose company size is '0' (no company) to a separate CSV file.</span>

In [18]:
# Write the working frame (rows with company_size '0' already removed) to CSV,
# leaving the index out of the file
exp10_filter.to_csv(path_or_buf='Data_CSV/Explored_data.csv', index=False)

<span style="font-size: 20px; color: lightgreen;">Notes: Save the remaining enrollees (those with a company) to a new CSV file.</span>

In [19]:
# Preview the first five rows of the final frame
exp10_filter.iloc[:5]

Unnamed: 0,enrollee_id,city,city_development_index,gender,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
7,10,city_114,0.926,Female,Bachelor,STEM,15,<10,Funded Startup,1,53
27,50,city_103,0.92,Male,Phd,STEM,21,10000+,Pvt Ltd,1,79
65,124,city_103,0.92,Male,Masters,STEM,21,10000+,Pvt Ltd,2,34
97,200,city_103,0.92,Male,Bachelor,Humanities,21,100-500,Pvt Ltd,3,11
100,208,city_16,0.91,Male,Bachelor,STEM,21,10000+,Pvt Ltd,5,58


In [20]:
# Preview the last five rows of the final frame
exp10_filter.iloc[-5:]

Unnamed: 0,enrollee_id,city,city_development_index,gender,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
17340,32757,city_160,0.92,Male,Bachelor,STEM,21,1000-4999,Pvt Ltd,4,28
17356,32782,city_21,0.624,Male,Masters,STEM,17,10000+,Other,1,26
17409,32869,city_103,0.92,Male,Bachelor,STEM,11,1000-4999,Public Sector,5,92
17494,33055,city_165,0.903,Male,Bachelor,STEM,14,10000+,Pvt Ltd,3,40
17613,33254,city_160,0.92,Female,Masters,STEM,21,50-99,Pvt Ltd,5,120


<span style="font-size: 30px; color: #FF5050; font-weight:Bold">Result: Good looking data ready for analysis.</span>