In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the housing dataset from a CSV file located in the current working directory.
data = pd.read_csv('housing.csv')
# Bare last expression: the notebook renders the DataFrame as this cell's output.
data

In [None]:
# Summary statistics of the frame (pandas summarises the numeric columns by default).
data.describe()

In [None]:
# Draw one histogram per column that contains numeric data.
#
# The numeric coercion is done on a temporary Series only. Writing the coerced
# values back into `data` would turn every text column (e.g. 'ocean_proximity')
# into all-NaN and silently destroy it for the rest of the notebook.
for column in data.columns:
    values = pd.to_numeric(data[column], errors='coerce')
    if values.notna().sum() == 0:
        # Nothing numeric to bin (categorical / text column) - skip it.
        continue

    fig, ax = plt.subplots()
    values.plot(kind='hist', bins=10, ax=ax)
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()
    # Release the figure explicitly so looping over many columns does not pile up open figures.
    plt.close(fig)

In [None]:
# Count the rows in each 'ocean_proximity' category and show the counts as bars.
ocean_proximity_counts = data['ocean_proximity'].value_counts()

fig, ax = plt.subplots()
ax.bar(ocean_proximity_counts.index, ocean_proximity_counts.values)
ax.set(
    title='Bar plot of Ocean Proximity',
    xlabel='Ocean Proximity',
    ylabel='Count',
)
plt.show()

In [None]:
# Pairwise correlation between the numeric columns, rendered as an annotated heatmap.
#
# The frame is restricted to numeric dtypes first: since pandas 2.0,
# DataFrame.corr() raises on non-numeric columns instead of dropping them.
correlation_matrix = data.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Scatter-plot matrix over every float64 / int64 column of the frame.
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
numeric_frame = data[numerical_columns]

sns.pairplot(numeric_frame)
# y=1.02 places the figure title slightly above the top of the grid.
plt.suptitle('Pairplot of All Numerical Columns', y=1.02)
plt.show()

In [None]:
# One vertical boxplot per float64 / int64 column, each on its own figure.
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns

for column in numerical_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(y=data[column], ax=ax)
    ax.set_title(f'Boxplot of {column}')
    ax.set_ylabel(column)
    plt.show()

In [None]:
# Preprocessing

In [None]:
# Show the dtype pandas currently holds for each column.
data.dtypes

In [None]:
# Number of missing entries in each column.
null_values = data.isna().sum()
print(null_values)

In [None]:
# Replace missing values in the listed columns with each column's most frequent value.
#
# NOTE(review): 'median_house_value' is used as the regression target further
# down; imputing a target with the mode fabricates labels - consider dropping
# those rows instead.
columns_to_fill = ['total_bedrooms', 'households', 'median_house_value', 'ocean_proximity']
for column in columns_to_fill:
    modes = data[column].mode()
    if modes.empty:
        # mode() ignores NaN, so an all-null column has no mode to fill with.
        raise ValueError(f"Cannot impute '{column}': the column has no non-null values")
    # Assign the result back instead of calling fillna(inplace=True) on the
    # `data[column]` selection: the chained in-place form is deprecated and does
    # not modify `data` when pandas Copy-on-Write is active.
    data[column] = data[column].fillna(modes.iloc[0])

In [None]:
# Display the frame again to inspect it after the missing-value fill.
data

In [None]:
# Label encoding

In [None]:
from sklearn import preprocessing

# Replace the 'ocean_proximity' categories with integer codes, in place in `data`.
# NOTE(review): integer codes imply an ordering between categories; confirm that
# is acceptable for the downstream model.
label_encoder = preprocessing.LabelEncoder()
data['ocean_proximity'] = label_encoder.fit_transform(data['ocean_proximity'])

print(data['ocean_proximity'])

In [None]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictor columns.
y = data['median_house_value']
X = data.drop(columns=['median_house_value'])

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature with MinMaxScaler. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = MinMaxScaler()

# The scaler returns bare NumPy arrays, so wrap them back into DataFrames.
# Passing the original index keeps each scaled row aligned with the matching
# entry of y_train / y_test; without it the frames get a fresh 0..n-1 index
# that no longer corresponds to the targets' row labels.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Print the resulting datasets
print("X_train_scaled:\n", X_train_scaled)
print("X_test_scaled:\n", X_test_scaled)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Fit a linear regression on the scaled training features.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict on the held-out split and score the predictions with R-squared.
y_pred = model.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")