In [2]:
import pandas as pd

# Load the adult.csv dataset
adult_df = pd.read_csv('adult.csv')

# Display the first few rows of the DataFrame
display(adult_df.head())

# Display the shape of the DataFrame
print("Shape of the data: ",adult_df.shape)

# Identify missing values represented by '?' in each column
for col in adult_df.columns:
    missing_count = (adult_df[col] == '?').sum()
    if missing_count > 0:
        print(f"Column '{col}': {missing_count} missing values.")

import numpy as np

# Replace '?' with np.nan
adult_df.replace('?', np.nan, inplace=True)

# Remove rows with missing values
for col in ['workclass', 'occupation', 'native-country']:
    adult_df.dropna(subset=[col], inplace=True)

# Convert categorical features to numerical
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Label encoding for binary columns and columns with many unique values
label_encoder = LabelEncoder()
for col in ['gender', 'income', 'relationship', 'marital-status']:
    adult_df[col] = label_encoder.fit_transform(adult_df[col])

# One-hot encoding for columns with few unique values
onehot_cols = ['workclass', 'education', 'occupation', 'race', 'native-country']
adult_df = pd.get_dummies(adult_df, columns=onehot_cols, drop_first=True)

# Handle outliers using IQR method with capping
numerical_cols = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']

for col in numerical_cols:
    Q1 = adult_df[col].quantile(0.25)
    Q3 = adult_df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    adult_df[col] = np.where(adult_df[col] < lower_bound, lower_bound, adult_df[col])
    adult_df[col] = np.where(adult_df[col] > upper_bound, upper_bound, adult_df[col])

display(adult_df.head())

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


Shape of the data:  (48842, 15)
Column 'workclass': 2799 missing values.
Column 'occupation': 2809 missing values.
Column 'native-country': 857 missing values.


Unnamed: 0,age,fnlwgt,educational-num,marital-status,relationship,gender,capital-gain,capital-loss,hours-per-week,income,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25.0,226802.0,7.0,4,3,1,0.0,0.0,40.0,0,...,False,False,False,False,False,False,False,True,False,False
1,38.0,89814.0,9.0,2,0,1,0.0,0.0,50.0,0,...,False,False,False,False,False,False,False,True,False,False
2,28.0,336951.0,12.0,2,0,1,0.0,0.0,40.0,1,...,False,False,False,False,False,False,False,True,False,False
3,44.0,160323.0,10.0,2,0,1,0.0,0.0,40.0,1,...,False,False,False,False,False,False,False,True,False,False
5,34.0,198693.0,6.0,4,1,1,0.0,0.0,32.5,0,...,False,False,False,False,False,False,False,True,False,False


In [6]:
import pandas as pd

# Load the dataset
diabetes_df = pd.read_csv('Dataset of Diabetes .csv')

# Display the first few rows
display(diabetes_df.head())

# Print the shape
print(diabetes_df.shape)

# Print DataFrame information
print(diabetes_df.info())

# Display the shape of the DataFrame
print("Shape of diabetes_df:", diabetes_df.shape)

# Identify data types of each column
print("\nData types of each column:")
print(diabetes_df.dtypes)

# Check for missing values
print("\nMissing values per column:")
print(diabetes_df.isnull().sum())

# Analyze the distribution of numerical features
numerical_cols = diabetes_df.select_dtypes(include=['number']).columns
print("\nDescriptive statistics for numerical features:")
display(diabetes_df[numerical_cols].describe())


# Analyze the distribution of categorical features
categorical_cols = ['Gender', 'CLASS']  # Specify categorical columns


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,735,34221,M,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,420,47975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,680,87656,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,504,34223,M,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N


(1000, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         1000 non-null   int64  
 1   No_Pation  1000 non-null   int64  
 2   Gender     1000 non-null   object 
 3   AGE        1000 non-null   int64  
 4   Urea       1000 non-null   float64
 5   Cr         1000 non-null   int64  
 6   HbA1c      1000 non-null   float64
 7   Chol       1000 non-null   float64
 8   TG         1000 non-null   float64
 9   HDL        1000 non-null   float64
 10  LDL        1000 non-null   float64
 11  VLDL       1000 non-null   float64
 12  BMI        1000 non-null   float64
 13  CLASS      1000 non-null   object 
dtypes: float64(8), int64(4), object(2)
memory usage: 109.5+ KB
None
Shape of diabetes_df: (1000, 14)

Data types of each column:
ID             int64
No_Pation      int64
Gender        object
AGE            int64
Urea         float64
Cr      

Unnamed: 0,ID,No_Pation,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,340.5,270551.4,53.528,5.124743,68.943,8.28116,4.86282,2.34961,1.20475,2.60979,1.8547,29.57802
std,240.397673,3380758.0,8.799241,2.935165,59.984747,2.534003,1.301738,1.401176,0.660414,1.115102,3.663599,4.962388
min,1.0,123.0,20.0,0.5,6.0,0.9,0.0,0.3,0.2,0.3,0.1,19.0
25%,125.75,24063.75,51.0,3.7,48.0,6.5,4.0,1.5,0.9,1.8,0.7,26.0
50%,300.5,34395.5,55.0,4.6,60.0,8.0,4.8,2.0,1.1,2.5,0.9,30.0
75%,550.25,45384.25,59.0,5.7,73.0,10.2,5.6,2.9,1.3,3.3,1.5,33.0
max,800.0,75435660.0,79.0,38.9,800.0,16.0,10.3,13.8,9.9,9.9,35.0,47.75


In [5]:
import pandas as pd

df = pd.read_csv('california_housing_test.csv')
display(df.head())

# Examine the shape of the DataFrame
print("Shape of the DataFrame:", df.shape)

# Check data types
print("\nData types of each column:")
print(df.dtypes)

# Look for missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=['number']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

print("\nNumerical columns:")
print(numerical_cols)

print("\nCategorical columns:")
print(categorical_cols)

# Calculate descriptive statistics for numerical features
numerical_cols = df.select_dtypes(include=['number']).columns
print("\nDescriptive statistics for numerical features:")
display(df[numerical_cols].describe())

# Analyze the distribution of numerical features (basic observations)
print("\nDistribution Analysis:")
for col in numerical_cols:
  print(f"\nColumn: {col}")
  print(f"  Skewness: {df[col].skew()}")  # Check for skewness
  print(f"  Kurtosis: {df[col].kurt()}")  # Check for kurtosis

# Analyze relationships between pairs of numerical features (correlations)
print("\nCorrelation Matrix:")
correlation_matrix = df[numerical_cols].corr()
display(correlation_matrix)
print("\nStrong correlations (absolute value > 0.7):")
strong_correlations = correlation_matrix[abs(correlation_matrix) > 0.7]
display(strong_correlations)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


Shape of the DataFrame: (3000, 9)

Data types of each column:
longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
dtype: object

Missing values per column:
longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

Numerical columns:
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

Categorical columns:
Index([], dtype='object')

Descriptive statistics for numerical features:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,-119.5892,35.63539,28.845333,2599.578667,529.950667,1402.798667,489.912,3.807272,205846.275
std,1.994936,2.12967,12.555396,2155.593332,415.654368,1030.543012,365.42271,1.854512,113119.68747
min,-124.18,32.56,1.0,6.0,2.0,5.0,2.0,0.4999,22500.0
25%,-121.81,33.93,18.0,1401.0,291.0,780.0,273.0,2.544,121200.0
50%,-118.485,34.27,29.0,2106.0,437.0,1155.0,409.5,3.48715,177650.0
75%,-118.02,37.69,37.0,3129.0,636.0,1742.75,597.25,4.656475,263975.0
max,-114.49,41.92,52.0,30450.0,5419.0,11935.0,4930.0,15.0001,500001.0



Distribution Analysis:

Column: longitude
  Skewness: -0.29785763262779996
  Kurtosis: -1.3627716597885198

Column: latitude
  Skewness: 0.45981593679052085
  Kurtosis: -1.1243724704483558

Column: housing_median_age
  Skewness: 0.018513121159991593
  Kurtosis: -0.8037837284188449

Column: total_rooms
  Skewness: 4.167637358501556
  Kurtosis: 32.09994094352687

Column: total_bedrooms
  Skewness: 3.8633931890458686
  Kurtosis: 28.53707082052359

Column: population
  Skewness: 2.9496706908288868
  Kurtosis: 16.443268177004413

Column: households
  Skewness: 3.5597534116462914
  Kurtosis: 26.22936134813419

Column: median_income
  Skewness: 1.6985117348319874
  Kurtosis: 5.626184148981826

Column: median_house_value
  Skewness: 0.9895619132015638
  Kurtosis: 0.39539899642985343

Correlation Matrix:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.925017,-0.064203,0.049865,0.070869,0.111572,0.051062,-0.018701,-0.050662
latitude,-0.925017,1.0,-0.025143,-0.039632,-0.068245,-0.117318,-0.068296,-0.072363,-0.138428
housing_median_age,-0.064203,-0.025143,1.0,-0.36785,-0.323154,-0.299888,-0.305171,-0.144315,0.091409
total_rooms,0.049865,-0.039632,-0.36785,1.0,0.937749,0.838867,0.914116,0.221249,0.160427
total_bedrooms,0.070869,-0.068245,-0.323154,0.937749,1.0,0.856387,0.970758,0.024025,0.082279
population,0.111572,-0.117318,-0.299888,0.838867,0.856387,1.0,0.89553,0.032361,-0.001192
households,0.051062,-0.068296,-0.305171,0.914116,0.970758,0.89553,1.0,0.048625,0.100176
median_income,-0.018701,-0.072363,-0.144315,0.221249,0.024025,0.032361,0.048625,1.0,0.672695
median_house_value,-0.050662,-0.138428,0.091409,0.160427,0.082279,-0.001192,0.100176,0.672695,1.0



Strong correlations (absolute value > 0.7):


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.925017,,,,,,,
latitude,-0.925017,1.0,,,,,,,
housing_median_age,,,1.0,,,,,,
total_rooms,,,,1.0,0.937749,0.838867,0.914116,,
total_bedrooms,,,,0.937749,1.0,0.856387,0.970758,,
population,,,,0.838867,0.856387,1.0,0.89553,,
households,,,,0.914116,0.970758,0.89553,1.0,,
median_income,,,,,,,,1.0,
median_house_value,,,,,,,,,1.0
