In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [5]:
# Toy employee table: every column is missing exactly two values
data = {
    'Age':        [25, 30, np.nan, 35, 40, np.nan, 45],
    'Salary':     [50000, np.nan, 70000, np.nan, 90000, 120000, 110000],
    'Gender':     ['Male', 'Female', np.nan, 'Female', 'Male', 'Male', np.nan],
    'Experience': [1, 3, 5, np.nan, 10, 15, np.nan],
}
df = pd.DataFrame(data)

# Header and frame on separate lines, emitted by a single print call
print("Original DataFrame with Missing Values:", df, sep="\n")

Original DataFrame with Missing Values:
    Age    Salary  Gender  Experience
0  25.0   50000.0    Male         1.0
1  30.0       NaN  Female         3.0
2   NaN   70000.0     NaN         5.0
3  35.0       NaN  Female         NaN
4  40.0   90000.0    Male        10.0
5   NaN  120000.0    Male        15.0
6  45.0  110000.0     NaN         NaN


In [6]:
# Count the missing entries in each column (sum of the boolean null mask down the rows)
print(df.isnull().sum(axis=0))

Age           2
Salary        2
Gender        2
Experience    2
dtype: int64


In [7]:
# Remove missing values: keep only the rows in which every column is populated.
# dropna returns a new frame, so `df` itself is left untouched.
df_dropped_rows = df.dropna(axis=0, how='any')
print("\nDataFrame after dropping rows with missing values:", df_dropped_rows, sep="\n")



DataFrame after dropping rows with missing values:
    Age   Salary Gender  Experience
0  25.0  50000.0   Male         1.0
4  40.0  90000.0   Male        10.0


In [8]:
# Impute missing values in the numeric columns with each column's mean.
#
# Work on a real copy: `df_mean = df` would only bind a second name to the same
# frame, so every fill would also overwrite the original `df`.
df_mean = df.copy()

# Assign the filled columns back instead of calling
# `df_mean[col].fillna(..., inplace=True)`: that form operates on an intermediate
# Series, is deprecated in recent pandas, and has no effect under Copy-on-Write.
numeric_cols = ['Age', 'Salary', 'Experience']
# .mean() yields one value per column; fillna matches them up by column name
df_mean[numeric_cols] = df_mean[numeric_cols].fillna(df_mean[numeric_cols].mean())

print("\nDataFrame after imputing numerical columns with mean:")
print(df_mean)


DataFrame after imputing numerical columns with mean:
    Age    Salary  Gender  Experience
0  25.0   50000.0    Male         1.0
1  30.0   88000.0  Female         3.0
2  35.0   70000.0     NaN         5.0
3  35.0   88000.0  Female         6.8
4  40.0   90000.0    Male        10.0
5  35.0  120000.0    Male        15.0
6  45.0  110000.0     NaN         6.8


In [9]:
# Impute missing values in the categorical column with its mode (most frequent value).
#
# Start from a copy of the mean-imputed frame so the printed result has no missing
# values left, and so that neither `df` nor `df_mean` is modified by this cell.
df_mode = df_mean.copy()

# Plain assignment instead of `df_mode['Gender'].fillna(..., inplace=True)`: the
# chained in-place form is deprecated in recent pandas and has no effect under
# Copy-on-Write. mode() returns a Series of the most frequent value(s); take the first.
df_mode['Gender'] = df_mode['Gender'].fillna(df_mode['Gender'].mode()[0])

print("\nDataFrame after imputing categorical columns with mode:")
print(df_mode)



DataFrame after imputing categorical columns with mode:
    Age    Salary  Gender  Experience
0  25.0   50000.0    Male         1.0
1  30.0   88000.0  Female         3.0
2  35.0   70000.0    Male         5.0
3  35.0   88000.0  Female         6.8
4  40.0   90000.0    Male        10.0
5  35.0  120000.0    Male        15.0
6  45.0  110000.0    Male         6.8


In [16]:
import pandas as pd

# Small product table with one gap in each categorical column
df = pd.DataFrame({
    'Price': [10, 20, 15, 18, 25],
    'Color': ['Red', 'Blue', None, 'Green', 'Red'],
    'Size': [None, 'M', 'L', 'S', 'M'],
})

# Fill each categorical column with its own most frequent value in a single call;
# the mapping tells fillna which replacement belongs to which column
df = df.fillna({
    'Color': df['Color'].mode()[0],
    'Size': df['Size'].mode()[0],
})

print(df)

   Price  Color Size
0     10    Red    M
1     20   Blue    M
2     15    Red    L
3     18  Green    S
4     25    Red    M


In [17]:
# Label encoding vs. ordinal encoding

from sklearn.preprocessing import LabelEncoder

# Replace each distinct 'Size' string with an integer code.
# NOTE: the codes follow the sorted order of the labels (L=0, M=1, S=2 in the
# output below), not the natural S < M < L ordering of the sizes.
label_encoder = LabelEncoder()
label_encoder.fit(df['Size'])
df['Size'] = label_encoder.transform(df['Size'])

print("\nDataFrame after Label Encoding for 'Size':\n", df)



DataFrame after Label Encoding for 'Size':
    Price  Color  Size
0     10    Red     1
1     20   Blue     1
2     15    Red     0
3     18  Green     2
4     25    Red     1


In [18]:
# One-hot encode 'Color' and carry every other column through unchanged.
# (pd, ColumnTransformer and OneHotEncoder come from the notebook's import cell.)
preprocessor = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), ['Color'])],  # select by column name
    remainder='passthrough',  # keep the remaining columns as they are
    # Always return a dense array. With the default threshold the result can come
    # back as a sparse matrix when the stacked output is mostly zeros, which the
    # DataFrame construction below does not expect.
    sparse_threshold=0,
)

# Fit the encoder and transform the frame in one step
transformed_data = preprocessor.fit_transform(df)

# Rebuild a labelled frame: names are prefixed with the transformer that produced
# them ('onehot__', 'remainder__'); reuse df's index so rows stay aligned with df
feature_names = preprocessor.get_feature_names_out()
df_encoded = pd.DataFrame(transformed_data, columns=feature_names, index=df.index)

print(df_encoded)


   onehot__Color_Blue  onehot__Color_Green  onehot__Color_Red  \
0                 0.0                  0.0                1.0   
1                 1.0                  0.0                0.0   
2                 0.0                  0.0                1.0   
3                 0.0                  1.0                0.0   
4                 0.0                  0.0                1.0   

   remainder__Price  remainder__Size  
0              10.0              1.0  
1              20.0              1.0  
2              15.0              0.0  
3              18.0              2.0  
4              25.0              1.0  


In [23]:
# Load the housing data from the CSV next to the notebook and preview the first five rows
housing = pd.read_csv("./housing.csv")
housing.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [24]:
# Summarise column dtypes and non-null counts; a count below the number of
# entries marks a column with missing values
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [25]:
# Frequency of each category in the only non-numeric column
housing.ocean_proximity.value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [26]:
# Numeric-only copy of the data: drop the text column; `housing` itself is not modified
housing_new = housing.drop('ocean_proximity', axis='columns')

In [27]:
# Pairwise correlation between all columns of housing_new
corr_matrix = housing_new.corr()
# Correlation of every column with median_house_value, largest first
corr_matrix.loc[:, "median_house_value"].sort_values(ascending=False)

median_house_value    1.000000
median_income         0.688075
total_rooms           0.134153
housing_median_age    0.105623
households            0.065843
total_bedrooms        0.049686
population           -0.024650
longitude            -0.045967
latitude             -0.144160
Name: median_house_value, dtype: float64

In [28]:
# creating own features and  check whether its correlation
housing_new["rooms_per_household"] = housing_new["total_rooms"]/housing_new["households"]
housing_new["bedrooms_per_room"] = housing_new["total_bedrooms"]/housing_new["total_rooms"]
housing_new["population_per_household"]=housing_new["population"]/housing_new["households"]


# checking over the correlation of median house price with other features
corr_matrix = housing_new.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value          1.000000
median_income               0.688075
rooms_per_household         0.151948
total_rooms                 0.134153
housing_median_age          0.105623
households                  0.065843
total_bedrooms              0.049686
population_per_household   -0.023737
population                 -0.024650
longitude                  -0.045967
latitude                   -0.144160
bedrooms_per_room          -0.255880
Name: median_house_value, dtype: float64

In [29]:
# Display the originally loaded frame. It still contains ocean_proximity and none
# of the derived ratio columns, because the drop and the new features were applied
# to housing_new only. The rendered table elides the middle rows.
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND
