In [1]:
import numpy as np
import pandas as pd

In [2]:
# Step 1: Create a sample DataFrame
data = {
    'ID': [101, 102, 103, 104, 105],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Department': ['HR', 'IT', 'Finance', 'Marketing', 'Operations']
}

df = pd.DataFrame(data)
df

Unnamed: 0,ID,Name,Age,Salary,Department
0,101,Alice,25,50000,HR
1,102,Bob,30,60000,IT
2,103,Charlie,35,70000,Finance
3,104,David,40,80000,Marketing
4,105,Eva,45,90000,Operations


In [3]:
# Step 2: Adding new columns with more variability
df['Experience'] = df['Age'] - 22 + np.random.randint(-2, 3, size=len(df))  # Adding randomness to experience
df['Bonus'] = df['Salary'] * (0.05 + np.random.random(len(df)) * 0.10)  # Random bonus between 5% and 15%
df['Seniority Level'] = pd.cut(df['Experience'], bins=[0, 5, 10, 15, 20], labels=['Junior', 'Mid', 'Senior', 'Lead'], right=False)
df

Unnamed: 0,ID,Name,Age,Salary,Department,Experience,Bonus,Seniority Level
0,101,Alice,25,50000,HR,1,6800.196213,Junior
1,102,Bob,30,60000,IT,7,7792.875377,Mid
2,103,Charlie,35,70000,Finance,14,8093.679153,Senior
3,104,David,40,80000,Marketing,18,5403.351937,Lead
4,105,Eva,45,90000,Operations,24,13159.216053,


In [4]:
# Step 3: Investigating the DataFrame
print("Full DataFrame:\n", df)
print("\nDataFrame Info:")
print(df.info())
print("\nStatistical Summary:")
print(df.describe())
print("\nMissing Values:")
print(df.isnull().sum())

# Select only numeric columns for the correlation matrix
numeric_df = df.select_dtypes(include=['number'])
print("\nCorrelation Matrix:")
print(numeric_df.corr())


Full DataFrame:
     ID     Name  Age  Salary  Department  Experience         Bonus  \
0  101    Alice   25   50000          HR           1   6800.196213   
1  102      Bob   30   60000          IT           7   7792.875377   
2  103  Charlie   35   70000     Finance          14   8093.679153   
3  104    David   40   80000   Marketing          18   5403.351937   
4  105      Eva   45   90000  Operations          24  13159.216053   

  Seniority Level  
0          Junior  
1             Mid  
2          Senior  
3            Lead  
4             NaN  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   ID               5 non-null      int64   
 1   Name             5 non-null      object  
 2   Age              5 non-null      int64   
 3   Salary           5 non-null      int64   
 4   Department       5 non-null      objec

In [5]:
save_checkpoint = True

In [6]:
# Step 4: Full Investigation Report
def full_investigation_report(df):
    report = ""
    report += "Full DataFrame:\n" + df.to_string() + "\n\n"
    report += "DataFrame Info:\n" + str(df.info()) + "\n\n"
    report += "Statistical Summary:\n" + df.describe().to_string() + "\n\n"
    report += "Missing Values:\n" + df.isnull().sum().to_string() + "\n\n"
    report += "Correlation Matrix:\n" + df.select_dtypes(include=['number']).corr().to_string() + "\n"
    return report

# Generate the report
report = full_investigation_report(df)
print(report)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   ID               5 non-null      int64   
 1   Name             5 non-null      object  
 2   Age              5 non-null      int64   
 3   Salary           5 non-null      int64   
 4   Department       5 non-null      object  
 5   Experience       5 non-null      int64   
 6   Bonus            5 non-null      float64 
 7   Seniority Level  4 non-null      category
dtypes: category(1), float64(1), int64(4), object(2)
memory usage: 617.0+ bytes
Full DataFrame:
    ID     Name  Age  Salary  Department  Experience         Bonus Seniority Level
0  101    Alice   25   50000          HR           1   6800.196213          Junior
1  102      Bob   30   60000          IT           7   7792.875377             Mid
2  103  Charlie   35   70000     Finance          14   8093.679153          Senior
3 

In [7]:
save_checkpoint = True

In [8]:
# Step 5: Save the final DataFrame to a CSV file
df.to_csv('final_dataframe.csv', index=False)