In [19]:
# Import Library yang Dibutuhkan
import pandas as pd

# Mengimpor Dataset CSV
df = pd.read_csv('1553768847-housing.csv')

# Menampilkan Informasi Struktur Dataset
print("Informasi Struktur Dataset:")
print(df.info())  # Menampilkan info struktur dataset
print("\nDeskripsi Statistik Dataset:")
print(df.describe())  # Statistik deskriptif dataset


Informasi Struktur Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
 9   median_house_value  20640 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB
None

Deskripsi Statistik Dataset:
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861   

In [20]:
# Mengidentifikasi Missing Values
missing_values = df.isnull().sum()
print("Sebelum menangani missing values")
print("Jumlah Missing Values per Kolom:")
print(missing_values)

# Menangani Missing Values
# Kita akan mengisi missing values pada kolom 'total_bedrooms' dengan median kolom tersebut
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())

missing_values = df.isnull().sum()
print("\nSetelah menangani missing values")
print("Jumlah Missing Values per Kolom:")
print(missing_values)


Sebelum menangani missing values
Jumlah Missing Values per Kolom:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

Setelah menangani missing values
Jumlah Missing Values per Kolom:
longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
median_house_value    0
dtype: int64


In [21]:
# Mengidentifikasi Outliers dengan metode IQR (Interquartile Range)
Q1 = df['total_rooms'].quantile(0.25)
Q3 = df['total_rooms'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Menghapus outliers
df_outliers_removed = df[(df['total_rooms'] >= lower_bound) & (df['total_rooms'] <= upper_bound)]

print("\nDeskripsi Statistik Setelah Menghapus Outliers:")
df_outliers_removed.describe()



Deskripsi Statistik Setelah Menghapus Outliers:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,19353.0,19353.0,19353.0,19353.0,19353.0,19353.0,19353.0,19353.0,19353.0
mean,-119.591048,35.647446,29.522193,2225.118793,463.237793,1244.520384,433.455537,3.812732,204961.348835
std,2.000904,2.142796,12.308489,1153.906566,253.11833,711.481397,235.16394,1.88356,115283.433436
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.81,33.94,19.0,1406.0,289.0,766.0,272.0,2.5275,117600.0
50%,-118.5,34.26,30.0,2036.0,419.0,1118.0,392.0,3.4812,177700.0
75%,-118.03,37.72,38.0,2892.0,592.0,1591.0,557.0,4.6687,262200.0
max,-114.31,41.95,52.0,5694.0,2610.0,8733.0,2406.0,15.0001,500001.0


In [22]:
# Mengidentifikasi Data Duplicates
duplicates = df_outliers_removed.duplicated().sum()
print("\nJumlah Data Duplicates:")
print(duplicates)

# Menghapus Data Duplicates
df_no_duplicates = df_outliers_removed.drop_duplicates()

print("\nJumlah Data Setelah Penghapusan Duplicates:")
print(df_no_duplicates.shape[0])



Jumlah Data Duplicates:
0

Jumlah Data Setelah Penghapusan Duplicates:
19353


In [23]:
# Simulasi Penggabungan Dua Dataset
# Mengambil subset untuk pelanggan dan transaksi
df_pelanggan = df[['longitude', 'latitude', 'median_house_value']]
df_transaksi = df[['total_rooms', 'total_bedrooms', 'households']]

# Menambahkan ID sebagai penghubung antara pelanggan dan transaksi
df_pelanggan['ID'] = df_pelanggan.index
df_transaksi['ID'] = df_transaksi.index

# Menggabungkan kedua dataset berdasarkan ID
df_gabungan = pd.merge(df_pelanggan, df_transaksi, on='ID', how='inner')

# Menangani Inkonsistensi Data (contoh: ocean_proximity)
# Misalnya kita ingin memastikan konsistensi penulisan 'ocean_proximity' yang mungkin memiliki inkonsistensi penulisan
df['ocean_proximity'] = df['ocean_proximity'].str.lower()  # Mengubah menjadi huruf kecil dan menghapus spasi
df['ocean_proximity'] = df['ocean_proximity'].str.strip()

# Jika ada kolom tanggal, pastikan formatnya konsisten
# Contoh format tanggal (jika ada kolom 'date')
# df['tanggal'] = pd.to_datetime(df['tanggal'], errors='coerce')

# Menampilkan hasil penggabungan dan koreksi inkonsistensi
df_gabungan.head(), df['ocean_proximity'].unique()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pelanggan['ID'] = df_pelanggan.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_transaksi['ID'] = df_transaksi.index


(   longitude  latitude  median_house_value  ID  total_rooms  total_bedrooms  \
 0    -122.23     37.88              452600   0          880           129.0   
 1    -122.22     37.86              358500   1         7099          1106.0   
 2    -122.24     37.85              352100   2         1467           190.0   
 3    -122.25     37.85              341300   3         1274           235.0   
 4    -122.25     37.85              342200   4         1627           280.0   
 
    households  
 0         126  
 1        1138  
 2         177  
 3         219  
 4         259  ,
 array(['near bay', '<1h ocean', 'inland', 'near ocean', 'island'],
       dtype=object))

In [24]:
print(df.head())  # Statistik deskriptif dataset

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                  41          880           129.0   
1    -122.22     37.86                  21         7099          1106.0   
2    -122.24     37.85                  52         1467           190.0   
3    -122.25     37.85                  52         1274           235.0   
4    -122.25     37.85                  52         1627           280.0   

   population  households  median_income ocean_proximity  median_house_value  
0         322         126         8.3252        near bay              452600  
1        2401        1138         8.3014        near bay              358500  
2         496         177         7.2574        near bay              352100  
3         558         219         5.6431        near bay              341300  
4         565         259         3.8462        near bay              342200  
