In [23]:
# Import Library
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [24]:
# Load the raw dataset from the CSV file 'London.csv' (path is relative to the
# notebook's working directory) into the working dataframe `df`
df = pd.read_csv(filepath_or_buffer='London.csv')

In [25]:
# Frequency encoding for the non-numeric attributes (House Type and Location):
# every category is replaced by (number of rows carrying it) / (total number of rows in df).

# House Type: encoded as-is
df['House Type Encoded'] = df['House Type'].map(
    df['House Type'].value_counts() / len(df)
)

# Location: null values are first replaced with the default category 'Unknown',
# so rows without a location share one category and receive its frequency
df['Location'] = df['Location'].fillna(value='Unknown')
df['Location Encoded'] = df['Location'].map(
    df['Location'].value_counts() / len(df)
)

In [None]:
# Remove the attributes that are not needed for the analysis.
# NOTE: this rebinds `df`, so re-running this cell without reloading the data
# raises a KeyError (the columns are already gone).
df = df.drop(
    columns=[
        'Unnamed: 0',
        'Property Name',
        'Postal Code',
        'House Type',
        'Location',
        'City/County',
    ]
)

# Print the first 5 rows of the dataframe
print(df.head(5))

In [None]:
def detect_outliers_iqr(data, factor=1.5):
    """Flag outliers in every numeric column of ``data`` using the IQR rule.

    For each column, Q1 and Q3 are the 25th and 75th percentiles and
    ``IQR = Q3 - Q1``. A value is an outlier when it is strictly below
    ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``.

    Non-numeric columns are skipped (quantiles are not defined for them), so a
    frame that still contains text columns no longer raises.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scan; it is not modified.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the lower/upper bounds.

    Returns
    -------
    outliers_dict : dict
        Maps each numeric column name to a Series holding that column's
        outlying values (original index labels preserved).
    outliers_index : list
        Unique index labels of the rows that hold an outlier in at least one
        column, in order of first detection.
    """
    outliers_dict = {}
    outliers_index = []

    for col in data.select_dtypes(include='number').columns:
        column = data[col]
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        # Values outside [lower_bound, upper_bound]; NaN compares False on both
        # sides and is therefore never flagged
        outliers = column[(column < lower_bound) | (column > upper_bound)]
        outliers_dict[col] = outliers
        outliers_index.extend(outliers.index)

    # dict.fromkeys removes duplicate labels while keeping a deterministic order
    return outliers_dict, list(dict.fromkeys(outliers_index))

# Detect outliers across the whole dataset:
#   outliers_dict  -> outlying values per column
#   outliers_index -> unique index labels of rows holding at least one outlier
outliers_dict, outliers_index = detect_outliers_iqr(data=df)

# Report how many outliers were found in each column
for col in outliers_dict:
    outliers_in_col = outliers_dict[col]
    print(f"Number of outliers in {col}: {len(outliers_in_col)}")

In [None]:
# Remove every row that contains an outlier (identified by its unique index label);
# `df` itself is left untouched and the result is stored in `df_cleaned`
df_cleaned = df.drop(labels=outliers_index, axis=0)

# Report the dataset size before and after outlier removal,
# followed by the total number of rows removed
print(f"Original dataset size: {df.shape[0]}")
print(f"Cleaned dataset size: {df_cleaned.shape[0]}")
print(f"Total rows removed: {df.shape[0] - df_cleaned.shape[0]}")

In [None]:
# Side-by-side comparison of one attribute before and after outlier removal
def plot_comparison_before_after(df_before, df_after, column):
    """Show two boxplots of ``column`` next to each other.

    The left panel is drawn from ``df_before`` (outliers still present), the
    right panel from ``df_after`` (outliers removed). The figure is displayed
    immediately; nothing is returned.
    """
    fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: data before outlier removal
    sns.boxplot(data=df_before[column], ax=ax_before)
    ax_before.set_title(f'Before Removing Outliers: {column}')

    # Right panel: data after outlier removal
    sns.boxplot(data=df_after[column], ax=ax_after)
    ax_after.set_title(f'After Removing Outliers: {column}')

    fig.tight_layout()
    plt.show()

# Choose the column to visualise, e.g. 'No. of Bedrooms'
plot_comparison_before_after(
    df_before=df,
    df_after=df_cleaned,
    column='No. of Bedrooms',
)


In [None]:
# Build the correlation matrix of the outlier-free data (pandas' default method)
correlation_matrix = df_cleaned.corr()
print(correlation_matrix)

# Visualise the correlation matrix as an annotated heatmap
plt.figure(figsize=(16, 12))
sns.heatmap(
    data=correlation_matrix,
    annot=True,          # write the value inside every cell
    fmt='.2f',           # two decimal places
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Correlation Matrix Heatmap')
# Slant the x tick labels so long column names do not overlap
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Build the covariance matrix of the outlier-free data.
# The data is not standardised at this point, so each entry is expressed in the
# product of the units of the two columns involved.
covariance_matrix = df_cleaned.cov()
print(covariance_matrix)

# Visualise the covariance matrix as an annotated heatmap
plt.figure(figsize=(16, 12))
sns.heatmap(
    data=covariance_matrix,
    annot=True,          # write the value inside every cell
    fmt='.2f',           # two decimal places
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Covariance Matrix Heatmap')
# Slant the x tick labels so long column names do not overlap
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Standardise the data so every column contributes on a comparable scale
features = df_cleaned.columns
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_cleaned)

# Run PCA with all components kept, only to inspect how the variance is distributed
pca = PCA()
pca_result = pca.fit_transform(df_scaled)

# Fraction of the total variance explained by each principal component
explained_variance = pca.explained_variance_ratio_

# Plot the cumulative explained variance against the number of components
plt.figure(figsize=(10, 7))
plt.plot(np.arange(1, len(explained_variance) + 1), np.cumsum(explained_variance))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Cumulative Explained Variance')
plt.grid(True)
plt.show()

# Refit keeping only the first 4 principal components. `pca` is rebound here, so
# later cells see the 4-component model. Despite its name, `pca_result_2d` has
# one column per retained component (4), not 2.
pca = PCA(n_components=4)
pca_result_2d = pca.fit_transform(df_scaled)

# Store the PCA scores in a dataframe.
# - The index of df_cleaned is reused so each row of scores stays aligned with
#   its source row (df_cleaned's index has gaps after outlier removal; without
#   `index=` the rows would be renumbered 0..n-1 and could not be joined back).
# - Column labels are derived from the fitted model so they always match the
#   number of retained components.
pca_df = pd.DataFrame(
    data=pca_result_2d,
    columns=[f'PC{i + 1}' for i in range(pca.n_components_)],
    index=df_cleaned.index,
)

# Show the first 5 rows of the PCA result
print(pca_df.head())

In [None]:
# Build a dataframe of the PCA loadings: one row per principal component,
# one column per original attribute of df_cleaned
pca_components = pd.DataFrame(
    data=pca.components_,
    index=[f'PC{i+1}' for i in range(pca.n_components_)],
    columns=df_cleaned.columns,
)
print(pca_components)

# Heatmap showing how much each attribute contributes to each component
plt.figure(figsize=(10, 6))
sns.heatmap(data=pca_components, annot=True, cmap='coolwarm')
# Slant the x tick labels so long attribute names do not overlap
plt.xticks(rotation=45, ha='right')
plt.title('PCA Loading Heatmap')
plt.show()