In [44]:
#adult.csv
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [45]:
df = pd.read_csv("D:\\adult.csv")
print(df.head())
print(df.shape)

   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
4   18          ?  103497  Some-college               10       Never-married   

          occupation relationship   race  gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black    Male             0             0   
1    Farming-fishing      Husband  White    Male             0             0   
2    Protective-serv      Husband  White    Male             0             0   
3  Machine-op-inspct      Husband  Black    Male          7688             0   
4                  ?    Own-child  White  Female             0             0   

   hours-per-week native-country incom

In [46]:
#Null Values 
print("Missing Values in Each Column:",df.isnull().sum())


Missing Values in Each Column: age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64


In [47]:
df = df.dropna()
print("Dropped rows with missing values. New shape:", df.shape)

Dropped rows with missing values. New shape: (48842, 15)


In [48]:
def remove_outliers(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return df[(df[col] >= lower) & (df[col] <= upper)]

numeric_cols = df.select_dtypes(include=np.number).columns
for col in numeric_cols:
    df = remove_outliers(df, col)

print("Outliers Removed. New shape:", df.shape)

Outliers Removed. New shape: (28560, 15)


In [49]:
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])
print("Standard Scaled Data Sample:")
print(df_scaled.head())

Standard Scaled Data Sample:
        age  workclass    fnlwgt     education  educational-num  \
0 -1.080674    Private  0.532148          11th        -1.511213   
1 -0.003343    Private -1.041795       HS-grad        -0.580414   
2 -0.832059  Local-gov  1.797721    Assoc-acdm         0.815786   
6 -0.749187          ?  0.534722       HS-grad        -0.580414   
8 -1.163545    Private  2.173616  Some-college        -0.115014   

       marital-status         occupation relationship   race  gender  \
0       Never-married  Machine-op-inspct    Own-child  Black    Male   
1  Married-civ-spouse    Farming-fishing      Husband  White    Male   
2  Married-civ-spouse    Protective-serv      Husband  White    Male   
6       Never-married                  ?    Unmarried  Black    Male   
8       Never-married      Other-service    Unmarried  White  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0           0.0           0.0       -0.371325  United-States  <=5

In [50]:
normalizer = MinMaxScaler()
df_normalized = df.copy()
df_normalized[numeric_cols] = normalizer.fit_transform(df_normalized[numeric_cols])
print("\n📏 Min-Max Normalized Data Sample:")
print(df_normalized.head())


📏 Min-Max Normalized Data Sample:
        age  workclass    fnlwgt     education  educational-num  \
0  0.131148    Private  0.526982          11th         0.181818   
1  0.344262    Private  0.188113       HS-grad         0.363636   
2  0.180328  Local-gov  0.799459    Assoc-acdm         0.636364   
6  0.196721          ?  0.527536       HS-grad         0.363636   
8  0.114754    Private  0.880389  Some-college         0.454545   

       marital-status         occupation relationship   race  gender  \
0       Never-married  Machine-op-inspct    Own-child  Black    Male   
1  Married-civ-spouse    Farming-fishing      Husband  White    Male   
2  Married-civ-spouse    Protective-serv      Husband  White    Male   
6       Never-married                  ?    Unmarried  Black    Male   
8       Never-married      Other-service    Unmarried  White  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0           0.0           0.0        0.368421  United-State

In [52]:
df.groupby("gender")["age"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,9424.0,37.120862,12.252046,17.0,27.0,35.0,46.0,77.0
Male,19136.0,38.493154,11.949268,17.0,29.0,37.0,47.0,78.0


In [51]:
df.groupby("marital-status")["age"].mean()

marital-status
Divorced                 42.386179
Married-AF-spouse        28.294118
Married-civ-spouse       41.874377
Married-spouse-absent    38.829132
Never-married            29.307102
Separated                38.921668
Widowed                  54.764793
Name: age, dtype: float64

In [53]:
df.groupby("marital-status")["age"].median()

marital-status
Divorced                 42.0
Married-AF-spouse        28.0
Married-civ-spouse       41.0
Married-spouse-absent    38.0
Never-married            27.0
Separated                38.0
Widowed                  56.0
Name: age, dtype: float64

In [54]:
df.groupby("marital-status")["age"].std()

marital-status
Divorced                  9.834428
Married-AF-spouse         5.976670
Married-civ-spouse       11.038415
Married-spouse-absent    11.126012
Never-married             9.175611
Separated                10.023240
Widowed                  10.417478
Name: age, dtype: float64

In [7]:


#iris.csv
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [8]:
df = pd.read_csv("D:\\iris_dataset\\Iris.csv") 
print(df.head())

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [9]:
print(" Missing Values: ",df.isnull().sum())


 Missing Values:  Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [10]:
df.shape


(150, 6)

In [11]:
def remove_outliers(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return df[(df[col] >= lower) & (df[col] <= upper)]

for col in df.select_dtypes(include=[np.number]).columns:
    df = remove_outliers(df, col)

print("Shape after outlier removal:", df.shape)

Shape after outlier removal: (146, 6)


In [12]:
scaler = StandardScaler()
df_scaled = df.copy()
numeric_cols = df.select_dtypes(include=[np.number]).columns
df_scaled[numeric_cols] = scaler.fit_transform(df[numeric_cols])
print("Sample Scaled Data (StandardScaler): ")
print(df_scaled.head())

Sample Scaled Data (StandardScaler): 
         Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0 -1.745376      -0.910515      1.175789      -1.374878     -1.345899   
1 -1.722284      -1.151122     -0.093924      -1.374878     -1.345899   
2 -1.699192      -1.391729      0.413961      -1.431986     -1.345899   
3 -1.676099      -1.512032      0.160019      -1.317771     -1.345899   
4 -1.653007      -1.030819      1.429732      -1.374878     -1.345899   

       Species  
0  Iris-setosa  
1  Iris-setosa  
2  Iris-setosa  
3  Iris-setosa  
4  Iris-setosa  


In [14]:
normalizer = MinMaxScaler()
df_normalized = df.copy()
df_normalized[numeric_cols] = normalizer.fit_transform(df[numeric_cols])
print(" Sample Normalized Data (MinMaxScaler):")
print(df_normalized.head())

 Sample Normalized Data (MinMaxScaler):
         Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0  0.000000       0.222222      0.722222       0.067797      0.041667   
1  0.006711       0.166667      0.444444       0.067797      0.041667   
2  0.013423       0.111111      0.555556       0.050847      0.041667   
3  0.020134       0.083333      0.500000       0.084746      0.041667   
4  0.026846       0.194444      0.777778       0.067797      0.041667   

       Species  
0  Iris-setosa  
1  Iris-setosa  
2  Iris-setosa  
3  Iris-setosa  
4  Iris-setosa  


In [16]:
grouped = df.groupby('Species').agg(['mean', 'median', 'min', 'max', 'std'])
print(" Summary Statistics Grouped by 'Species':")
print(grouped)

 Summary Statistics Grouped by 'Species':
                         Id                             SepalLengthCm         \
                       mean median  min  max        std          mean median   
Species                                                                        
Iris-setosa       25.361702   25.0    1   50  14.885688      4.976596    5.0   
Iris-versicolor   75.795918   76.0   51  100  14.575921      5.955102    5.9   
Iris-virginica   125.500000  125.5  101  150  14.577380      6.588000    6.5   

                                     ... PetalLengthCm                   \
                 min  max       std  ...          mean median  min  max   
Species                              ...                                  
Iris-setosa      4.3  5.8  0.338932  ...       1.46383   1.50  1.0  1.9   
Iris-versicolor  4.9  7.0  0.503348  ...       4.27551   4.40  3.0  5.1   
Iris-virginica   4.9  7.9  0.635880  ...       5.55200   5.55  4.5  6.9   

                          

In [19]:
mean_petal_lengths = df.groupby('Species')['PetalLengthCm'].mean().tolist()
print("Mean Petal Length for each Species (List):")
print(mean_petal_lengths)

Mean Petal Length for each Species (List):
[1.4638297872340424, 4.275510204081633, 5.5520000000000005]


In [21]:
print(" Detailed Descriptive Stats for Each Species:")
for Species in df['Species'].unique():
    print(f" Statistics for: {Species}")
    Species_data = df[df['Species'] == Species]
    print(Species_data.describe(percentiles=[0.25, 0.5, 0.75]))

 Detailed Descriptive Stats for Each Species:
 Statistics for: Iris-setosa
              Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  47.000000      47.000000     47.000000      47.000000     47.000000
mean   25.361702       4.976596      3.365957       1.463830      0.244681
std    14.885688       0.338932      0.327890       0.178673      0.105930
min     1.000000       4.300000      2.300000       1.000000      0.100000
25%    12.500000       4.800000      3.100000       1.400000      0.200000
50%    25.000000       5.000000      3.400000       1.500000      0.200000
75%    38.500000       5.100000      3.550000       1.600000      0.300000
max    50.000000       5.800000      4.000000       1.900000      0.600000
 Statistics for: Iris-versicolor
               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count   49.000000      49.000000     49.000000      49.000000     49.000000
mean    75.795918       5.955102      2.785714       4.275510    