In [1]:
import pandas as pd
import numpy as np
from scipy import stats


In [2]:
data = pd.read_csv("/content/sample_data/california_housing_test.csv")
print(data.head())


   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                27.0       3885.0           661.0   
1    -118.30     34.26                43.0       1510.0           310.0   
2    -117.81     33.78                27.0       3589.0           507.0   
3    -118.36     33.82                28.0         67.0            15.0   
4    -119.67     36.33                19.0       1241.0           244.0   

   population  households  median_income  median_house_value  
0      1537.0       606.0         6.6085            344700.0  
1       809.0       277.0         3.5990            176500.0  
2      1484.0       495.0         5.7934            270500.0  
3        49.0        11.0         6.1359            330000.0  
4       850.0       237.0         2.9375             81700.0  


In [3]:
data.isnull().sum()

Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,0
population,0
households,0
median_income,0
median_house_value,0


In [16]:
data.dropna(inplace=True)
print(data)

      longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0       -122.05     37.37                27.0       3885.0           661.0   
1       -118.30     34.26                43.0       1510.0           310.0   
2       -117.81     33.78                27.0       3589.0           507.0   
3       -118.36     33.82                28.0         67.0            15.0   
4       -119.67     36.33                19.0       1241.0           244.0   
...         ...       ...                 ...          ...             ...   
2995    -119.86     34.42                23.0       1450.0           642.0   
2996    -118.14     34.06                27.0       5257.0          1082.0   
2997    -119.70     36.30                10.0        956.0           201.0   
2998    -117.12     34.10                40.0         96.0            14.0   
2999    -119.63     34.42                42.0       1765.0           263.0   

      population  households  median_income  median_house_value

In [17]:
data = data.drop_duplicates()
print(data)

      longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0       -122.05     37.37                27.0       3885.0           661.0   
1       -118.30     34.26                43.0       1510.0           310.0   
2       -117.81     33.78                27.0       3589.0           507.0   
3       -118.36     33.82                28.0         67.0            15.0   
4       -119.67     36.33                19.0       1241.0           244.0   
...         ...       ...                 ...          ...             ...   
2995    -119.86     34.42                23.0       1450.0           642.0   
2996    -118.14     34.06                27.0       5257.0          1082.0   
2997    -119.70     36.30                10.0        956.0           201.0   
2998    -117.12     34.10                40.0         96.0            14.0   
2999    -119.63     34.42                42.0       1765.0           263.0   

      population  households  median_income  median_house_value

In [11]:
z = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))
data = data[(z < 3).all(axis=1)]


In [12]:
data.to_csv("/content/sample_data/california_housing_test.csv", index=False)


In [13]:
def clean_data(file_path):
    df = pd.read_csv(file_path)
    df.drop_duplicates(inplace=True)
    df.fillna(df.mean(numeric_only=True), inplace=True)
    z = np.abs(stats.zscore(df.select_dtypes(include=[np.number])))
    df = df[(z < 3).all(axis=1)]
    df.to_csv("cleaned_data.csv", index=False)
    return df


In [20]:
clean_data("/content/sample_data/california_housing_test.csv")


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.30,34.26,43.0,1510.0,310.0,809.0,277.0,3.5990,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0
...,...,...,...,...,...,...,...,...,...
2872,-117.93,33.86,35.0,931.0,181.0,516.0,174.0,5.5867,182500.0
2873,-119.86,34.42,23.0,1450.0,642.0,1258.0,607.0,1.1790,225000.0
2874,-118.14,34.06,27.0,5257.0,1082.0,3496.0,1036.0,3.3906,237200.0
2875,-119.70,36.30,10.0,956.0,201.0,693.0,220.0,2.2895,62000.0


In [21]:
df = clean_data("/content/sample_data/california_housing_test.csv")

In [22]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


In [23]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,2761.0,2761.0,2761.0,2761.0,2761.0,2761.0,2761.0,2761.0,2761.0
mean,-119.61473,35.667993,29.535675,2241.367258,463.540022,1241.673307,432.777617,3.618069,198015.706628
std,1.991727,2.142821,12.268652,1243.58019,248.143783,672.212738,228.964735,1.490281,105733.609263
min,-124.18,32.56,1.0,6.0,2.0,5.0,2.0,0.4999,22500.0
25%,-121.82,33.94,19.0,1354.0,285.0,768.0,269.0,2.5179,117300.0
50%,-118.52,34.31,30.0,2025.0,422.0,1119.0,395.0,3.4187,174400.0
75%,-118.04,37.7,38.0,2899.0,598.0,1632.0,561.0,4.5122,252400.0
max,-114.49,41.92,52.0,6660.0,1355.0,3595.0,1245.0,8.2827,500001.0


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2761 entries, 0 to 2876
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           2761 non-null   float64
 1   latitude            2761 non-null   float64
 2   housing_median_age  2761 non-null   float64
 3   total_rooms         2761 non-null   float64
 4   total_bedrooms      2761 non-null   float64
 5   population          2761 non-null   float64
 6   households          2761 non-null   float64
 7   median_income       2761 non-null   float64
 8   median_house_value  2761 non-null   float64
dtypes: float64(9)
memory usage: 215.7 KB
