Creating a synthetic dataset

In [1]:
import numpy as np
import pandas as pd
# Seed the legacy global NumPy RNG so the synthetic data is reproducible.
np.random.seed(42)
n_rows = 100

# Columns are drawn in a fixed order (Age, Salary, Department, Experience), so
# the generated values depend only on the seed.
# The numeric columns are stored as float from the start: NaN cannot be held in
# an integer column, and relying on pandas to silently upcast int -> float when
# NaN is assigned through .loc is deprecated behaviour.
data = {
    "Age": np.random.randint(18, 60, size=n_rows).astype(float),
    "Salary": np.random.randint(30000, 120000, size=n_rows).astype(float),
    "Department": np.random.choice(["HR", "IT", "Finance", "Marketing"], size=n_rows),
    "Experience": np.random.randint(0, 20, size=n_rows).astype(float),
}
df = pd.DataFrame(data)

# Inject missing values: for each numeric column pick distinct row labels
# (replace=False) and overwrite those cells with NaN.
nan_indices_salary = np.random.choice(df.index, size=10, replace=False)
df.loc[nan_indices_salary, "Salary"] = np.nan
nan_indices_age = np.random.choice(df.index, size=7, replace=False)
df.loc[nan_indices_age, "Age"] = np.nan
nan_indices_exp = np.random.choice(df.index, size=5, replace=False)
df.loc[nan_indices_exp, "Experience"] = np.nan

# Preview only the first rows rather than dumping the whole frame.
print("Synthetic Dataset (100 rows) with NaN values:\n")
print(df.head(15))


Synthetic Dataset (100 rows) with NaN values:

     Age    Salary Department  Experience
0   56.0   38392.0         HR         1.0
1   46.0   60535.0  Marketing         2.0
2   32.0  108603.0         IT        15.0
3   25.0   82256.0         IT         8.0
4   38.0  119135.0         IT         3.0
5   56.0       NaN    Finance         0.0
6   36.0       NaN    Finance         NaN
7   40.0  109575.0  Marketing         0.0
8   28.0  114651.0  Marketing        13.0
9   28.0   93335.0         HR        15.0
10  41.0   40965.0         HR        19.0
11  53.0   54538.0  Marketing         7.0
12  57.0  100592.0    Finance         6.0
13  41.0   38110.0         IT         2.0
14   NaN       NaN  Marketing        16.0


Problem 1: Compute the (a) mean, (b) median, and (c) age-weighted mean of income (the `Salary` column). Ignore
NaNs where appropriate. Explain when a weighted mean is preferable.

In [2]:
# (a) Mean salary -- missing salaries are skipped, not treated as zero
mean_income = df["Salary"].mean(skipna=True)
# (b) Median salary -- likewise computed over the observed salaries only
median_income = df["Salary"].median(skipna=True)

# (c) Age-weighted mean salary: sum(age * salary) / sum(age)
# A row can only contribute when both its weight (Age) and its value (Salary)
# are present, so keep exactly the rows where neither is missing.
valid_data = df[df[["Age", "Salary"]].notna().all(axis=1)]
age_weighted_mean = np.average(
    valid_data["Salary"].to_numpy(),
    weights=valid_data["Age"].to_numpy(),
)


Q) When is a Weighted Mean preferable?

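Ans) A weighted mean is preferable when some data points should influence the average more than others, for example because they represent more units or are more reliable.

Example:
Grading systems, where assignments and exams carry different weights in the final grade.

So, use a weighted mean when the data points are not all equally important; when every weight is equal it reduces to the ordinary mean.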

In [3]:
# Report the three Problem 1 statistics, one "label value" pair per line.
for label, value in (
    ("Mean Income:", mean_income),
    ("Median Income:", median_income),
    ("Age-Weighted Mean Income:", age_weighted_mean),
):
    print(label, value)

Mean Income: 77699.7
Median Income: 80932.0
Age-Weighted Mean Income: 75213.51340523883


Problem 2: Standardize income (z-score). Report how many incomes are outliers using the rule |z| > 3. Handle NaNs correctly (do not drop entire rows unnecessarily).


In [4]:
# Problem 2: Z-score standardization
# Location and scale come from the observed salaries only: the reductions skip
# NaN, so no row has to be dropped from the frame.
std_salary = df["Salary"].std(skipna=True)
mean_salary = df["Salary"].mean(skipna=True)

# z = (x - mean) / std, stored next to the raw column; a missing salary simply
# propagates to a missing z-score.
df["Salary_z"] = df["Salary"].sub(mean_salary).div(std_salary)

# Flag |z| > 3. A NaN z-score compares as False, so rows with a missing salary
# are never counted as outliers.
outliers = df["Salary_z"].abs().gt(3)
n_outliers = outliers.sum()

In [5]:
# Scalar summary of the standardization, one "label value" pair per line.
for label, value in (
    ("Mean Salary:", mean_salary),
    ("Std Dev Salary:", std_salary),
    ("Number of Outliers (|z| > 3):", n_outliers),
):
    print(label, value)

# Raw salary next to its z-score for the first 15 rows only.
print("\nRows with z-scores:\n", df.head(15)[["Salary", "Salary_z"]])

Mean Salary: 77699.7
Std Dev Salary: 25821.535381746504
Number of Outliers (|z| > 3): 0

Rows with z-scores:
       Salary  Salary_z
0    38392.0 -1.522284
1    60535.0 -0.664744
2   108603.0  1.196803
3    82256.0  0.176453
4   119135.0  1.604680
5        NaN       NaN
6        NaN       NaN
7   109575.0  1.234446
8   114651.0  1.431026
9    93335.0  0.605514
10   40965.0 -1.422638
11   54538.0 -0.896992
12  100592.0  0.886558
13   38110.0 -1.533205
14       NaN       NaN


Problem 3: Create age bins [18, 25), [25, 35), [35, 45), [45, 60) and compute for each bin:
● count of observations,
● mean income,
● median score (this dataset has no score column, so the median income is reported instead).
Show result as a tidy DataFrame sorted by age bin.


In [6]:
# Problem 3: Age Bins
# Half-open bins [18, 25), [25, 35), [35, 45), [45, 60): right=False makes each
# interval include its left edge and exclude its right edge.
bins = [18, 25, 35, 45, 60]
labels = ["18-25", "25-35", "35-45", "45-60"]

# Rows whose Age is NaN get a NaN bin and therefore fall out of the groupby.
df["Age_bin"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)

# Group by age bin.
# Age_bin is categorical, so `observed` is passed explicitly: leaving it out
# emits a FutureWarning because the pandas default is changing. observed=False
# keeps the current behaviour (every bin is listed, even one with no rows).
# NOTE(review): "count" tallies non-missing salaries per bin; use "size" if
# rows with a missing salary should also count as observations.
result = (
    df.groupby("Age_bin", observed=False)
    .agg(
        count_obs=("Salary", "count"),
        mean_income=("Salary", "mean"),
        median_income=("Salary", "median"),
    )
    .reset_index()
    .sort_values("Age_bin")
)

  result = df.groupby("Age_bin").agg(


In [7]:
# Show the per-age-bin summary table (count, mean income, median income).
print(result)

  Age_bin  count_obs   mean_income  median_income
0   18-25         15  81925.133333        82992.0
1   25-35         22  82382.954545        80932.0
2   35-45         19  79670.947368        84384.0
3   45-60         29  68153.793103        60535.0


Problem 4: Create an array that is not one-dimensional, then demonstrate the following
operations:
● Shape and Resize → shape, size, Transpose, Flatten
● Showcase negative indexing and display the error raised while indexing or slicing
● Arithmetic Operations → Broadcasting, Dot Product
● Linear Algebra → Determinant, Inverse

In [10]:
# Re-seed so the demo matrix is identical on every run of this cell.
np.random.seed(42)
arr = np.random.randint(1, 10, size=(3, 3))  # 3x3 matrix of integers in 1..9
print("Original Array:\n", arr)

# Shape and Resize: dimensions, element count, transposed view, 1-D copy
print("\nShape:", np.shape(arr))
print("Size:", np.size(arr))
print("Transpose:\n", arr.transpose())
print("Flatten:\n", arr.flatten())

# Negative Indexing: -1 addresses the last position along an axis
print("\nLast Row (negative indexing):", arr[-1, :])
print("Last Element in 2D array:", arr[-1][-1])

# Indexing error: -5 reaches past the start of an axis with only 3 rows, so
# NumPy raises IndexError; it is caught so the message can be displayed.
try:
    print(arr[-5])
except IndexError as err:
    print("\nIndexError:", err)



Original Array:
 [[7 4 8]
 [5 7 3]
 [7 8 5]]

Shape: (3, 3)
Size: 9
Transpose:
 [[7 5 7]
 [4 7 8]
 [8 3 5]]
Flatten:
 [7 4 8 5 7 3 7 8 5]

Last Row (negative indexing): [7 8 5]
Last Element in 2D array: 5

IndexError: index -5 is out of bounds for axis 0 with size 3


In [9]:
# Arithmetic Operations
# Broadcasting (adding scalar): the scalar 5 is applied to every element.
print("\nBroadcasting (arr + 5):\n", arr + 5)

# Dot product (matrix multiplication, not element-wise multiplication)
dot_product = np.dot(arr, arr)
print("\nDot Product (arr x arr):\n", dot_product)

# Linear Algebra Operations
det = np.linalg.det(arr)
print("\nDeterminant:", det)

# Inverse (only if the matrix is non-singular).
# The determinant is computed in floating point, so a singular matrix usually
# yields a tiny non-zero value rather than exactly 0; an exact `det != 0` test
# would then let inv() run and print meaningless, enormous entries. A
# full-rank check uses a tolerance scaled to the matrix and is reliable.
if np.linalg.matrix_rank(arr) == arr.shape[0]:
    inv = np.linalg.inv(arr)
    print("Inverse:\n", inv)
else:
    print("Matrix is singular, inverse does not exist")


Broadcasting (arr + 5):
 [[12  9 13]
 [10 12  8]
 [12 13 10]]

Dot Product (arr x arr):
 [[125 120 108]
 [ 91  93  76]
 [124 124 105]]

Determinant: -11.00000000000003
Inverse:
 [[-1.         -4.          4.        ]
 [ 0.36363636  1.90909091 -1.72727273]
 [ 0.81818182  2.54545455 -2.63636364]]
