In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

# Step 2: Load the California housing dataset
housing_data = fetch_california_housing()

# Step 3: Convert to DataFrame
df = pd.DataFrame(housing_data.data, columns=housing_data.feature_names)

# Step 4: Add target variable (house prices)
df["MedHouseVal"] = housing_data.target

# Step 5: Display information of all columns
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
print("Dataset Information:")
df.info()

# Step 6: Display statistical summary of numerical columns
print("\nStatistical Summary of Numerical Columns:")
print(df.describe())

# Step 7: Simulate a categorical column ('Ocean Proximity') for analysis
# NOTE: this column is synthetic. It is produced by cutting MedHouseVal into
# five equal-width bins, so the labels are placeholders attached to price
# ranges and carry no geographic information.
proximity_labels = ["Near Bay", "Inland", "Island", "Near Ocean", "Ocean"]
df["Ocean Proximity"] = pd.cut(df["MedHouseVal"], bins=5, labels=proximity_labels)
print("\nCount of Unique Labels in 'Ocean Proximity' Column:")
print(df["Ocean Proximity"].value_counts())

# Step 8: Display columns with missing values (California dataset does not have missing values)
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0]

print("\nColumns with Missing Values:")
print(columns_with_missing if not columns_with_missing.empty else "No missing values found")


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB
None

Statistical Summary of Numerical Columns:
             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min  

In [None]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Step 2: Load the Diabetes dataset (Ensure it's uploaded to Colab)
file_path = "/content/Dataset of Diabetes .csv"  # Update the path if needed
df = pd.read_csv(file_path)

# Step 3: Handling Missing Values (Replacing NaNs with mean)
# Only numeric columns can be mean-imputed, so work on that subset.
numeric_cols = df.select_dtypes(include=['number']).columns

# Initialize the SimpleImputer with the 'mean' strategy
imputer = SimpleImputer(strategy="mean")

# Build a fresh float DataFrame from the imputer output instead of writing it
# back through `.iloc[:, :] =`, which tries to squeeze float means into
# whatever dtype the column had (e.g. int64). Keeping df's index makes the
# numeric view row-aligned with df.
df_numeric = pd.DataFrame(
    imputer.fit_transform(df[numeric_cols]),
    columns=numeric_cols,
    index=df.index,
)

# Merge the imputed numeric columns back into the original DataFrame
df[numeric_cols] = df_numeric

# Step 4: Handling Outliers (Removing values beyond 1.5*IQR)
# Calculate quartiles and IQR for numeric columns
Q1 = df_numeric.quantile(0.25)
Q3 = df_numeric.quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier if ANY numeric value falls outside the 1.5*IQR fences.
# NOTE(review): ID and No_Pation appear in the scaled output and look like
# identifier columns -- confirm whether they should take part in outlier
# detection and scaling at all.
is_outlier = ((df_numeric < (Q1 - 1.5 * IQR)) | (df_numeric > (Q3 + 1.5 * IQR))).any(axis=1)

# Drop the outlier rows from BOTH frames. Previously only `df` was filtered
# while the scalers below were fitted on the unfiltered `df_numeric`, so the
# outlier removal never reached the scaled output.
df = df[~is_outlier]
df_numeric = df_numeric[~is_outlier]

# Step 5: Data Transformation
# Min-Max Normalization (for numeric features)
min_max_scaler = MinMaxScaler()
df_minmax = pd.DataFrame(
    min_max_scaler.fit_transform(df_numeric),
    columns=df_numeric.columns,
    index=df_numeric.index,  # keep row labels so results map back to df
)

# Standardization (Z-score normalization) (for numeric features)
standard_scaler = StandardScaler()
df_standard = pd.DataFrame(
    standard_scaler.fit_transform(df_numeric),
    columns=df_numeric.columns,
    index=df_numeric.index,  # keep row labels so results map back to df
)

# Step 6: Display Processed Data
print("\nProcessed Diabetes Dataset (Min-Max Scaled):")
print(df_minmax.head())

print("\nProcessed Diabetes Dataset (Standard Scaled):")
print(df_standard.head())


Processed Diabetes Dataset (Min-Max Scaled):
         ID  No_Pation       AGE      Urea        Cr     HbA1c      Chol  \
0  0.627034   0.000237  0.508475  0.109375  0.050378  0.264901  0.407767   
1  0.918648   0.000452  0.101695  0.104167  0.070529  0.264901  0.359223   
2  0.524406   0.000634  0.508475  0.109375  0.050378  0.264901  0.407767   
3  0.849812   0.001160  0.508475  0.109375  0.050378  0.264901  0.407767   
4  0.629537   0.000452  0.220339  0.171875  0.050378  0.264901  0.475728   

         TG       HDL       LDL      VLDL       BMI  
0  0.044444  0.226804  0.114583  0.011461  0.173913  
1  0.081481  0.092784  0.187500  0.014327  0.139130  
2  0.044444  0.226804  0.114583  0.011461  0.173913  
3  0.044444  0.226804  0.114583  0.011461  0.173913  
4  0.051852  0.061856  0.177083  0.008596  0.069565  

Processed Diabetes Dataset (Standard Scaled):
         ID  No_Pation       AGE      Urea        Cr     HbA1c      Chol  \
0  0.672140  -0.074747 -0.401144 -0.144781 -0.3826

In [None]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

# Step 2: Load the Adult Income dataset (Ensure it's uploaded to Colab)
file_path = "/content/adult.csv.zip"  # Update the path if needed
df = pd.read_csv(file_path)

# Step 3: Handling Missing Values (Replacing '?' with NaN and imputing mode for categorical, mean for numerical)
df = df.replace("?", np.nan)

# Record the column roles once, BEFORE encoding turns every column numeric;
# the outlier step below needs to know which columns were genuinely numeric.
numeric_cols = df.select_dtypes(include=['number']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Handling numerical missing values with mean
num_imputer = SimpleImputer(strategy="mean")
df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])

# Handling categorical missing values with mode
cat_imputer = SimpleImputer(strategy="most_frequent")
df[categorical_cols] = cat_imputer.fit_transform(df[categorical_cols])

# Step 4: Handling Categorical Data (Encoding)
# Each encoder is kept so the integer codes can be mapped back to labels later.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Step 5: Handling Outliers (Removing values beyond 1.5*IQR)
# The IQR rule is applied only to the originally numeric columns. Label-encoded
# categoricals hold arbitrary integer codes, so running the rule on them (as
# the previous version did over the whole frame) discards rows merely for
# belonging to a less frequent category.
# NOTE(review): a numeric column whose IQR is 0 has both fences at its
# quartile value, so every row with a different value is removed -- confirm
# this is intended for any such column in this dataset.
numeric_values = df[numeric_cols]
Q1 = numeric_values.quantile(0.25)
Q3 = numeric_values.quantile(0.75)
IQR = Q3 - Q1
is_outlier = ((numeric_values < (Q1 - 1.5 * IQR)) | (numeric_values > (Q3 + 1.5 * IQR))).any(axis=1)
df = df[~is_outlier]

# Step 6: Data Transformation
# Min-Max Normalization
min_max_scaler = MinMaxScaler()
df_minmax = pd.DataFrame(
    min_max_scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,  # keep row labels so results map back to df
)

# Standardization (Z-score normalization)
standard_scaler = StandardScaler()
df_standard = pd.DataFrame(
    standard_scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,  # keep row labels so results map back to df
)

# Step 7: Display Processed Data
print("\nProcessed Adult Income Dataset (Min-Max Scaled):")
print(df_minmax.head())

print("\nProcessed Adult Income Dataset (Standard Scaled):")
print(df_standard.head())



Processed Adult Income Dataset (Min-Max Scaled):
        age  workclass    fnlwgt  education  educational-num  marital-status  \
0  0.344262        0.0  0.188277   0.555556         0.363636        0.333333   
1  0.114754        0.0  0.881156   1.000000         0.454545        0.666667   
2  0.147541        0.0  0.169156   0.555556         0.363636        0.666667   
3  0.672131        0.0  0.708251   0.555556         0.363636        0.333333   
4  0.131148        0.0  0.475807   0.333333         0.727273        0.333333   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0    0.307692           0.0   0.0     1.0           0.0           0.0   
1    0.538462           0.8   0.0     0.0           0.0           0.0   
2    0.000000           0.2   0.0     0.0           0.0           0.0   
3    0.692308           0.0   0.0     1.0           0.0           0.0   
4    0.692308           0.0   0.0     1.0           0.0           0.0   

   hours-per-week  native-coun

In [None]:
import pandas as pd

# Load the housing data from a CSV file in the working directory.
df = pd.read_csv("house.csv")

# Column names, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
print("Information of all columns:")
df.info()


print("\nStatistical information of numerical columns:")
print(df.describe())

# count of unique labels for the Ocean Proximity column
print("\nCount of unique labels in 'Ocean Proximity' column:")
print(df['ocean_proximity'].value_counts())

# Display attributes with missing values count greater than zero
print("\nAttributes with missing values:")
missing_values = df.isnull().sum()  # per-column count of null entries
missing_columns = missing_values[missing_values > 0]  # keep only columns that have nulls
print(missing_columns)

Information of all columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None

Statistical information of numerical columns:
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.63