In [34]:
import pandas as pd

#load the datasets:
df = pd.read_csv('Salary_Data.csv')

print(len(df)) # Gives no of rows
print(len(df.columns)) # Gives no of columns

6704
6


In [35]:
df.head() # Extracts the top 5 data from the df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [36]:
df.columns # Gives all the column names of the dataset

Index(['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience',
       'Salary'],
      dtype='object')

In [37]:
df.info() # gives quick description of our dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6704 entries, 0 to 6703
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  6702 non-null   float64
 1   Gender               6702 non-null   object 
 2   Education Level      6701 non-null   object 
 3   Job Title            6702 non-null   object 
 4   Years of Experience  6701 non-null   float64
 5   Salary               6699 non-null   float64
dtypes: float64(3), object(3)
memory usage: 314.4+ KB



Descriptive analysis

In [38]:
df.describe() # Use df.describe() to check the distribution of values.

Unnamed: 0,Age,Years of Experience,Salary
count,6702.0,6701.0,6699.0
mean,33.620859,8.094687,115326.964771
std,7.614633,6.059003,52786.183911
min,21.0,0.0,350.0
25%,28.0,3.0,70000.0
50%,32.0,7.0,115000.0
75%,38.0,12.0,160000.0
max,62.0,34.0,250000.0


In [39]:
# Summary statistics for categorical columns
print(df.describe(include='float'))
print()
print()
print()
print(df.describe(include='object'))

               Age  Years of Experience         Salary
count  6702.000000          6701.000000    6699.000000
mean     33.620859             8.094687  115326.964771
std       7.614633             6.059003   52786.183911
min      21.000000             0.000000     350.000000
25%      28.000000             3.000000   70000.000000
50%      32.000000             7.000000  115000.000000
75%      38.000000            12.000000  160000.000000
max      62.000000            34.000000  250000.000000



       Gender    Education Level          Job Title
count    6702               6701               6702
unique      3                  7                193
top      Male  Bachelor's Degree  Software Engineer
freq     3674               2267                518


 Gender
count = 6702 → There are 6702 entries in the Gender column (2 missing values).

unique = 3 → There are 3 unique gender categories (e.g., 'Male', 'Female', maybe 'Other' or 'Non-Binary').

top = 'Male' → The most common gender is 'Male'.

freq = 3674 → 'Male' appears 3674 times.

count = 6701 → 6701 entries (3 missing value).

unique = 7 → 7 distinct education levels (e.g., High School, Bachelor's, Master's, PhD, etc.).

top = 'Bachelor's Degree' → Most frequent education level.

freq = 2267 → 'Bachelor's Degree' appears 2267 times.

In [40]:
df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0


In [41]:
df.isnull().sum()  # Gives the sum of no. of null values present in our column

Age                    2
Gender                 2
Education Level        3
Job Title              2
Years of Experience    3
Salary                 5
dtype: int64

the dropna() function in Pandas is used to remove missing values (i.e., NaNs) from a DataFrame or Series.

What It Does:
Removes all rows that contain at least one NaN (missing value).

Returns a new DataFrame with only complete rows (no missing data).

In [42]:
df_cleaned = df.dropna() # Removes the row that contains at least one null values
print(f"Remaining rows: {len(df_cleaned)}")


Remaining rows: 6698


In [43]:
print(df_cleaned.isnull().sum()) #checking again to confirm it's clean:


Age                    0
Gender                 0
Education Level        0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64


In [44]:
df_cleaned

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0
...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0


In [45]:
print("\nData Types:")
print(df_cleaned.dtypes)


Data Types:
Age                    float64
Gender                  object
Education Level         object
Job Title               object
Years of Experience    float64
Salary                 float64
dtype: object


In [46]:
# Summary statistics for categorical columns
print(df_cleaned.describe(include='float'))
print()
print()
print()
print(df_cleaned.describe(include='object'))

               Age  Years of Experience         Salary
count  6698.000000          6698.000000    6698.000000
mean     33.623022             8.095178  115329.253061
std       7.615784             6.060291   52789.792507
min      21.000000             0.000000     350.000000
25%      28.000000             3.000000   70000.000000
50%      32.000000             7.000000  115000.000000
75%      38.000000            12.000000  160000.000000
max      62.000000            34.000000  250000.000000



       Gender    Education Level          Job Title
count    6698               6698               6698
unique      3                  7                191
top      Male  Bachelor's Degree  Software Engineer
freq     3671               2265                518


In [47]:
# 1. Find the maximum salary value
max_salary = df_cleaned['Salary'].max()
print(max_salary)

250000.0


In [48]:
# 2. Get the row(s) with that maximum salary
max_salary_row = df_cleaned[df_cleaned['Salary'] == max_salary]
# 3. Display the result
print("Employee(s) with Maximum Salary:")
print(max_salary_row)

Employee(s) with Maximum Salary:
       Age Gender    Education Level                 Job Title  \
30    50.0   Male         Bachelor's                       CEO   
83    52.0   Male                PhD  Chief Technology Officer   
5001  45.0   Male  Bachelor's Degree         Financial Manager   

      Years of Experience    Salary  
30                   25.0  250000.0  
83                   24.0  250000.0  
5001                 21.0  250000.0  


df_cleaned['Salary'] == max_salary

This part creates a Boolean Series.

It checks each row's salary in the Salary column and compares it to the max_salary value.

Result: a Series of True or False values.

In [49]:
i=df_cleaned['Salary'] == max_salary
i.head(50)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30     True
31    False
32    False
33    False
34    False
35    False
36    False
37    False
38    False
39    False
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
Name: Salary, dtype: bool

In [50]:
# 1. Get unique job titles and their counts
job_title_counts = df_cleaned['Job Title'].value_counts()

# 2. Display the result
print(" Job Title List and Counts:")
print(job_title_counts)

 Job Title List and Counts:
Job Title
Software Engineer                     518
Data Scientist                        453
Software Engineer Manager             376
Data Analyst                          363
Senior Project Engineer               318
                                     ... 
Junior Research Scientist               1
Senior Product Development Manager      1
Junior Social Media Specialist          1
Senior Software Architect               1
Social Media Man                        1
Name: count, Length: 191, dtype: int64


In [51]:
# Get the job title that appears the most
most_common_job_title = df_cleaned['Job Title'].value_counts().idxmax() # Index of max value in each column
count = df_cleaned['Job Title'].value_counts().max() # Max value in each column

print(f"Most common job title: {most_common_job_title} ({most_common_job_title} occurrences)")
print(f"Most common job title: {count} ({count} occurrences)")


Most common job title: Software Engineer (Software Engineer occurrences)
Most common job title: 518 (518 occurrences)


In [52]:
most_common_title = 'Software Engineer'

# 1. Filter rows where Job Title is 'Software Engineer'
software_engineers = df_cleaned[df_cleaned['Job Title'] == most_common_title]

# 2. Find max salary among Software Engineers
max_salary = software_engineers['Salary'].max()

# 3. Get row(s) with that max salary
top_earner = software_engineers[software_engineers['Salary'] == max_salary]

# 4. Show result
print(f" Highest Paid '{most_common_title}':")
print(top_earner)


 Highest Paid 'Software Engineer':
      Age  Gender    Education Level          Job Title  Years of Experience  \
952  42.0  Female  Bachelor's Degree  Software Engineer                 13.0   

       Salary  
952  197000.0  


In [53]:
# 1. Find the maximum age
max_age = df_cleaned['Age'].max() # Receives the max age from the age column

# 2. Get the row(s) where Age equals max_age
oldest_person = df_cleaned[df_cleaned['Age'] == max_age] # Find the peoples who have age == max_age

# 3. Display the result
print(f" Person(s) with Maximum Age ({max_age} years):")
print(oldest_person) # Prints the row having age == max_age


 Person(s) with Maximum Age (62.0 years):
       Age Gender Education Level                  Job Title  \
1225  62.0   Male             PhD  Software Engineer Manager   
1236  62.0   Male             PhD  Software Engineer Manager   
1258  62.0   Male             PhD  Software Engineer Manager   
1304  62.0   Male             PhD  Software Engineer Manager   
1305  62.0   Male             PhD  Software Engineer Manager   

      Years of Experience    Salary  
1225                 19.0  200000.0  
1236                 20.0  200000.0  
1258                 19.0  200000.0  
1304                 20.0  200000.0  
1305                 19.0  200000.0  


In [54]:
#average salary by gender:
avg_salary_gender = df_cleaned.groupby('Gender')['Salary'].mean()  # df_cleaned.groupby('Gender')  This groups the rows of the DataFrame by the Gender column.
# ['Salary'] After grouping, we pick the Salary column from each group and calculate the mean (average) salary
print(avg_salary_gender)


Gender
Female    107888.998672
Male      121395.697630
Other     125869.857143
Name: Salary, dtype: float64


#############################
.groupby('Gender')
This groups the entire data based on unique values in the 'Gender' column.





In [55]:
edu_counts = df_cleaned['Education Level'].value_counts()
print(edu_counts)


Education Level
Bachelor's Degree    2265
Master's Degree      1572
PhD                  1368
Bachelor's            756
High School           448
Master's              288
phD                     1
Name: count, dtype: int64


In [56]:
# Count how many have "Master's Degree" in Education Level
master_count = df_cleaned[df_cleaned['Education Level'] == "Master's Degree"].shape[0] # shape return s dimensions in tuple where shape[0] gives first element where it is stored as rows

print(f"Number of people with Master's Degree: {master_count}")

Number of people with Master's Degree: 1572


In [57]:
#find the age of people under 30  to 40:
age_30_40 = df_cleaned[(df_cleaned['Age'] >= 30) & (df['Age'] <= 40)]
print(age_30_40)


       Age  Gender    Education Level                    Job Title  \
0     32.0    Male         Bachelor's            Software Engineer   
3     36.0  Female         Bachelor's              Sales Associate   
7     31.0    Male         Bachelor's                Sales Manager   
9     38.0    Male                PhD             Senior Scientist   
12    35.0    Male         Bachelor's            Financial Analyst   
...    ...     ...                ...                          ...   
6691  36.0  Female                PhD            Marketing Manager   
6695  33.0  Female  Bachelor's Degree    Content Marketing Manager   
6698  37.0    Male  Bachelor's Degree  Junior Sales Representative   
6700  32.0    Male        High School              Sales Associate   
6701  30.0  Female  Bachelor's Degree            Financial Manager   

      Years of Experience    Salary  
0                     5.0   90000.0  
3                     7.0   60000.0  
7                     4.0   80000.0  
9      

  age_30_40 = df_cleaned[(df_cleaned['Age'] >= 30) & (df['Age'] <= 40)]


In [58]:
# Filter Job title aged between 30 and 40
abc = df_cleaned[df_cleaned['Age'].between(30, 40)][['Job Title', 'Age']]
print(abc)


                        Job Title   Age
0               Software Engineer  32.0
3                 Sales Associate  36.0
7                   Sales Manager  31.0
9                Senior Scientist  38.0
12              Financial Analyst  35.0
...                           ...   ...
6691            Marketing Manager  36.0
6695    Content Marketing Manager  33.0
6698  Junior Sales Representative  37.0
6700              Sales Associate  32.0
6701            Financial Manager  30.0

[2907 rows x 2 columns]


In [59]:
#average salary in age 30-40 :
avg_salary_30_40 = age_30_40['Salary'].mean()
print(f" Average salary (age 30–40): {avg_salary_30_40}")


 Average salary (age 30–40): 124828.79738562091
