In [1]:
import pandas as pd
import numpy as np

In [2]:
# Create a small dataset with null values and outliers
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Emily', 'Frank', 'George', 'Hannah', 'Isabella', 'Jack', 'Hrushikesh'],
        'Age': [25, 30, np.nan, 40, 50, 55, 60, 65, np.nan, 75, 2300],
        'Grade': [85, 90, 95, 100, 105, 110, 120, np.nan, 135, 140, 2000]}

In [3]:
df = pd.DataFrame(data)

In [4]:
df


Unnamed: 0,Name,Age,Grade
0,Alice,25.0,85.0
1,Bob,30.0,90.0
2,Charlie,,95.0
3,David,40.0,100.0
4,Emily,50.0,105.0
5,Frank,55.0,110.0
6,George,60.0,120.0
7,Hannah,65.0,
8,Isabella,,135.0
9,Jack,75.0,140.0


In [5]:
# Scan all variables for missing values and inconsistencies
print(df.isnull().sum())  # Check for null values
df['Age'].fillna(df['Age'].mean(), inplace=True)  # Replace null values with mean age
print(df)  # Check for inconsistencies

Name     0
Age      2
Grade    1
dtype: int64
          Name     Age   Grade
0        Alice    25.0    85.0
1          Bob    30.0    90.0
2      Charlie   300.0    95.0
3        David    40.0   100.0
4        Emily    50.0   105.0
5        Frank    55.0   110.0
6       George    60.0   120.0
7       Hannah    65.0     NaN
8     Isabella   300.0   135.0
9         Jack    75.0   140.0
10  Hrushikesh  2300.0  2000.0


In [6]:
# Scan all numeric variables for outliers
q1 = df['Grade'].quantile(0.25)
q3 = df['Grade'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
df = df[(df['Grade'] > lower_bound) & (df['Grade'] < upper_bound)]  # Remove outliers
print(f"Upper Bound: {upper_bound}")
print(f"Lower Bound: {lower_bound}")

Upper Bound: 183.75
Lower Bound: 43.75


In [9]:
# Apply data transformations on at least one of the variables
df['Grade_sqrt'] = np.sqrt(df['Grade'])  # Apply square root transformation to Grade variable

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Grade_sqrt'] = np.sqrt(df['Grade'])  # Apply square root transformation to Grade variable


In [8]:
print(df)  # Print final dataset

       Name    Age  Grade  Grade_sqrt
0     Alice   25.0   85.0    9.219544
1       Bob   30.0   90.0    9.486833
2   Charlie  300.0   95.0    9.746794
3     David   40.0  100.0   10.000000
4     Emily   50.0  105.0   10.246951
5     Frank   55.0  110.0   10.488088
6    George   60.0  120.0   10.954451
8  Isabella  300.0  135.0   11.618950
9      Jack   75.0  140.0   11.832160
