## Generating a Sample Excel File

In [None]:
import pandas as pd

# Column-oriented sample records: each keyword is a column name and each
# list holds that column's values for the five example employees.
data = dict(
    employee_id=[101, 102, 103, 104, 105],
    name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    department=['HR', 'Finance', 'IT', 'Sales', 'Marketing'],
    salary=[55000, 62000, 72000, 50000, 57000],
    performance_score=[3.8, 4.2, 4.5, 3.5, 4.0],
    years_at_company=[2, 5, 3, 4, 1],
)

# Turn the column lists into a tabular DataFrame.
df = pd.DataFrame(data=data)

# Write the table to disk as a workbook; index=False keeps the automatic
# 0..4 row labels out of the spreadsheet.
df.to_excel(excel_writer='employee_data.xlsx', index=False)

print("Sample Excel file 'employee_data.xlsx' generated successfully.")

Sample Excel file 'employee_data.xlsx' generated successfully.


## Reading in the Excel File

In [None]:
# Load the workbook written earlier back into a DataFrame, replacing the
# in-memory `df` with the version read from disk.
df = pd.read_excel(io='employee_data.xlsx')

# Preview the first five rows to confirm the round trip worked.
print(df.head(5))

   employee_id     name department  salary  performance_score  \
0          101    Alice         HR   55000                3.8   
1          102      Bob    Finance   62000                4.2   
2          103  Charlie         IT   72000                4.5   
3          104    David      Sales   50000                3.5   
4          105      Eva  Marketing   57000                4.0   

   years_at_company  
0                 2  
1                 5  
2                 3  
3                 4  
4                 1  


## Exploring and Summarizing Data

In [None]:
# Summarize the DataFrame: index range, column names, non-null counts,
# dtypes, and memory usage.
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare; wrapping it in print() would append a stray "None"
# line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   employee_id        5 non-null      int64  
 1   name               5 non-null      object 
 2   department         5 non-null      object 
 3   salary             5 non-null      int64  
 4   performance_score  5 non-null      float64
 5   years_at_company   5 non-null      int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 368.0+ bytes
None


In [None]:
# Compute count, mean, std, min, quartiles, and max for the numeric
# columns, then show the resulting summary table.
summary_stats = df.describe()
print(summary_stats)

       employee_id        salary  performance_score  years_at_company
count     5.000000      5.000000           5.000000          5.000000
mean    103.000000  59200.000000           4.000000          3.000000
std       1.581139   8348.652586           0.380789          1.581139
min     101.000000  50000.000000           3.500000          1.000000
25%     102.000000  55000.000000           3.800000          2.000000
50%     103.000000  57000.000000           4.000000          3.000000
75%     104.000000  62000.000000           4.200000          4.000000
max     105.000000  72000.000000           4.500000          5.000000


## Handling Missing Values

In [None]:
# Count the missing entries in each column: flag every null cell, then
# total the flags column by column.
null_flags = df.isna()
missing_values = null_flags.sum()
print(missing_values)

employee_id          0
name                 0
department           0
salary               0
performance_score    0
years_at_company     0
dtype: int64


In [None]:
# Replace any missing performance scores with the column average.
# The mean is taken first (it ignores missing entries) and then used as
# the fill value, so rows that already have a score are left untouched.
average_score = df['performance_score'].mean()
df['performance_score'] = df['performance_score'].fillna(average_score)

## Basic Data Manipulation

In [None]:
# Filter employees with a performance score above 4
# Strict comparison: a score of exactly 4.0 does not qualify.
high_performers = df.loc[df['performance_score'].gt(4)]
print(high_performers)

   employee_id     name department  salary  performance_score  \
1          102      Bob    Finance   62000                4.2   
2          103  Charlie         IT   72000                4.5   

   years_at_company  
1                 5  
2                 3  


In [None]:
# Keep every row but only the name, department, and salary columns,
# in that order.
selected_columns = df.loc[:, ['name', 'department', 'salary']]
print(selected_columns)

      name department  salary
0    Alice         HR   55000
1      Bob    Finance   62000
2  Charlie         IT   72000
3    David      Sales   50000
4      Eva  Marketing   57000


In [None]:
# Add a bonus column equal to 10% of each employee's salary.
# The multiplication is applied to the whole column at once (vectorized)
# rather than calling a Python lambda once per row through .apply(); the
# resulting float values are the same.
df['bonus'] = df['salary'] * 0.10

# Preview the first rows to confirm the new column was added.
print(df.head())

   employee_id     name department  salary  performance_score  \
0          101    Alice         HR   55000                3.8   
1          102      Bob    Finance   62000                4.2   
2          103  Charlie         IT   72000                4.5   
3          104    David      Sales   50000                3.5   
4          105      Eva  Marketing   57000                4.0   

   years_at_company   bonus  
0                 2  5500.0  
1                 5  6200.0  
2                 3  7200.0  
3                 4  5000.0  
4                 1  5700.0  


## Grouping and Aggregating Data

In [None]:
# Mean salary per department. as_index=False keeps 'department' as an
# ordinary column in the result instead of making it the index, giving a
# two-column table (department, salary) with a default 0..n-1 row index.
average_salary_by_department = (
    df.groupby('department', as_index=False)['salary'].mean()
)
print(average_salary_by_department)

  department   salary
0    Finance  62000.0
1         HR  55000.0
2         IT  72000.0
3  Marketing  57000.0
4      Sales  50000.0
