In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder

def read_file(file_name):
    """Read an Excel workbook named by the user into a DataFrame.

    Args:
        file_name: Path to the Excel file to load.

    Returns:
        The DataFrame produced by ``pd.read_excel``, or ``None`` when
        ``file_name`` does not refer to an existing regular file.
    """
    # isfile() rather than exists(): a directory path "exists" but cannot be
    # parsed as a workbook, so report it as missing instead of crashing.
    if not os.path.isfile(file_name):
        print("File tidak ditemukan. Periksa nama file!")
        return None

    df = pd.read_excel(file_name)
    print("\nFile berhasil dibaca!")
    return df

def preprocess_data(df):
    """Clean the air-quality DataFrame in place and return it.

    Steps:
      1. Print the per-column count of missing values.
      2. In 'Air Quality', replace missing or unrecognised labels with the
         most frequent *valid* label, then convert labels to integers
         (Good=3, Moderate=2, Unhealthy=1, Hazardous=0).
      3. Drop duplicate rows.
      4. Pass numeric columns through ``pd.to_numeric``.

    The 'Air Quality' step is skipped when the column is absent or already
    numeric, so calling this function twice does not destroy the encoding.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, after cleaning.
    """
    print("\n--- Proses Preprocessing ---")

    # 1. Report missing values before any cleaning.
    print("\nMissing Values sebelum diatasi:")
    print(df.isnull().sum())

    column = 'Air Quality'
    quality_mapping = {
        'Good': 3,
        'Moderate': 2,
        'Unhealthy': 1,
        'Hazardous': 0
    }

    # 2. Impute and encode the 'Air Quality' labels.
    if column not in df.columns:
        print("\nKolom 'Air Quality' tidak ditemukan.")
    elif pd.api.types.is_numeric_dtype(df[column]):
        # Already encoded (e.g. preprocessing was run before): re-mapping
        # numeric codes through the label dictionary would turn them into NaN.
        print("\nKolom 'Air Quality' sudah numerik; konversi dilewati.")
    else:
        # Blank out unrecognised labels first so the mode is computed over
        # valid categories only; otherwise a frequent invalid value could
        # become the fill value and map to NaN.
        labels = df[column].where(df[column].isin(list(quality_mapping)))
        modes = labels.mode()
        if modes.empty:
            # No valid label at all: nothing sensible to impute with.
            print("\nTidak ada kategori valid pada 'Air Quality'; nilai tidak dapat diisi.")
        else:
            labels = labels.fillna(modes.iloc[0])
            print("\nMissing values dan error pada 'Air Quality' telah diisi dengan modus.")

        df[column] = labels.map(quality_mapping)
        print("\nKolom 'Air Quality' telah dikonversi ke numerik.")

    # 3. Remove duplicate rows.
    df.drop_duplicates(inplace=True)
    print("\nDuplikat telah dihapus.")

    # 4. Normalise numeric columns; invalid entries are coerced to NaN.
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    print("\nData setelah preprocessing:")
    print(df.head())

    return df


def analyze_data(df):
    """Print descriptive statistics for the given DataFrame."""
    print("\n--- Analisis Data ---\nStatistik Deskriptif Data:")
    summary = df.describe()
    print(summary.to_string())

def linear_regression_analysis(df):
    """Fit a linear regression predicting 'Air Quality' and print its MSE.

    The function validates that the target and every feature column exist and
    that the target is numeric (i.e. preprocessing has been run). Rows with a
    missing target are excluded. Missing feature values are replaced with the
    column mean learned from the training split only.

    Args:
        df: DataFrame containing the feature columns and 'Air Quality'.
            It is not modified.

    Returns:
        None. Results are printed; on invalid input a message is printed and
        the function returns early.
    """
    print("\n--- Linear Regression Analysis ---")

    # Features (X) and target (y).
    features = ['Temperature', 'Humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO',
                'Proximity_to_Industrial_Areas', 'Population_Density']
    target = 'Air Quality'

    if target not in df.columns:
        print("Kolom target 'Air Quality' tidak ditemukan!")
        return

    missing_features = [col for col in features if col not in df.columns]
    if missing_features:
        print(f"Kolom berikut tidak ditemukan di data: {missing_features}")
        return

    # Raw string labels cannot be regressed on; they must be encoded first.
    if not pd.api.types.is_numeric_dtype(df[target]):
        print("Kolom 'Air Quality' belum numerik. Lakukan preprocessing terlebih dahulu!")
        return

    # The estimator rejects NaN in the target, so drop those rows.
    data = df.dropna(subset=[target])
    if len(data) < 2:
        print("Data tidak cukup untuk analisis Linear Regression.")
        return

    X = data[features]
    y = data[target]

    # Split before imputing so test-set statistics do not leak into training.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fill missing feature values with the training-set mean.
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Evaluate on the held-out split.
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print("Koefisien Linear Regression:", model.coef_)

def logistic_regression_analysis(df):
    """Classify air quality as 'Baik' vs 'Buruk' with logistic regression.

    A row is labelled 'Baik' when its numeric 'Air Quality' equals 3 and
    'Buruk' otherwise. Rows with a missing 'Air Quality' are excluded rather
    than being labelled 'Buruk'. Imputation and scaling parameters are learned
    from the training split only. The classification report and accuracy are
    printed.

    Args:
        df: DataFrame containing the feature columns and a numeric
            'Air Quality' column. It is not modified.

    Returns:
        None. On invalid input a message is printed and the function returns
        early.
    """
    print("\n--- Logistic Regression Analysis ---")

    features = ['Temperature', 'Humidity', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO',
                'Proximity_to_Industrial_Areas', 'Population_Density']
    source = 'Air Quality'

    # Validate required columns.
    required_columns = [source] + features
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Kolom berikut tidak ditemukan di data: {missing_columns}")
        return

    # With raw string labels the comparison `== 3` is never true, which would
    # silently put every row in the 'Buruk' class.
    if not pd.api.types.is_numeric_dtype(df[source]):
        print("Kolom 'Air Quality' belum numerik. Lakukan preprocessing terlebih dahulu!")
        return

    data = df.dropna(subset=[source])

    # Build the binary target as a local Series so the caller's DataFrame does
    # not gain an extra column.
    y = data[source].map(lambda value: 'Baik' if value == 3 else 'Buruk')
    if y.nunique() < 2:
        print("Data memiliki kurang dari dua kelas; Logistic Regression tidak dapat dilatih.")
        return

    X = data[features]

    # LabelEncoder orders classes alphabetically: 'Baik' -> 0, 'Buruk' -> 1.
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Split before imputing/scaling so test-set statistics do not leak into
    # training.
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    if len(np.unique(y_train)) < 2:
        print("Data latih memiliki kurang dari dua kelas; Logistic Regression tidak dapat dilatih.")
        return

    # Fill missing feature values with the training-set mean.
    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Standardise features using training-set mean and variance.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Pass explicit labels so the report still lines up with target_names when
    # the test split happens to contain only one class.
    class_ids = list(range(len(label_encoder.classes_)))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=class_ids,
                                target_names=label_encoder.classes_))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

def main():
    """Run the interactive text menu for the air-quality workflow.

    Repeatedly prints the menu, reads a choice from standard input and
    dispatches to the matching step (load, preprocess, describe, save,
    linear regression, logistic regression) until the user selects option 5.
    The currently loaded DataFrame is kept in a local variable between
    iterations; it is ``None`` until a file has been read successfully.
    """
    # Explicit initial state instead of probing locals() for the name.
    df = None

    while True:
        print("\n===== PROGRAM DATA AIR QUALITY =====")
        print("1. Input Nama File")
        print("2. Lakukan Preprocessing")
        print("3. Analisis Data")
        print("4. Simpan Hasil ke File Baru")
        print("5. Keluar")
        print("6. Analisis Linear Regression")
        print("7. Analisis Logistic Regression")

        # Tolerate stray whitespace around the typed choice.
        choice = input("Pilih menu (1-7): ").strip()

        if choice == '1':
            file_name = input("Masukkan nama file (.xlsx): ")
            df = read_file(file_name)

        elif choice == '2':
            if df is not None:
                df = preprocess_data(df)
            else:
                print("Baca file terlebih dahulu!")

        elif choice == '3':
            if df is not None:
                analyze_data(df)
            else:
                print("Baca dan preprocessing data terlebih dahulu!")

        elif choice == '4':
            if df is not None:
                output_file = input("Masukkan nama file output (.xlsx): ")
                # A bad path or unsupported extension should not terminate
                # the session and discard the loaded data.
                try:
                    df.to_excel(output_file, index=False)
                except (OSError, ValueError) as err:
                    print(f"Gagal menyimpan data ke {output_file}: {err}")
                else:
                    print(f"Data berhasil disimpan ke {output_file}")
            else:
                print("Baca dan preprocessing data terlebih dahulu!")

        elif choice == '5':
            print("Program selesai. Terima kasih!")
            break

        elif choice == '6':
            if df is not None:
                linear_regression_analysis(df)
            else:
                print("Baca dan preprocessing data terlebih dahulu!")

        elif choice == '7':
            if df is not None:
                logistic_regression_analysis(df)
            else:
                print("Baca dan preprocessing data terlebih dahulu!")

        else:
            print("Pilihan tidak valid! Coba lagi.")

# Start the interactive menu only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



===== PROGRAM DATA AIR QUALITY =====
1. Input Nama File
2. Lakukan Preprocessing
3. Analisis Data
4. Simpan Hasil ke File Baru
5. Keluar
6. Analisis Linear Regression
7. Analisis Logistic Regression

File berhasil dibaca!

===== PROGRAM DATA AIR QUALITY =====
1. Input Nama File
2. Lakukan Preprocessing
3. Analisis Data
4. Simpan Hasil ke File Baru
5. Keluar
6. Analisis Linear Regression
7. Analisis Logistic Regression

--- Proses Preprocessing ---

Missing Values sebelum diatasi:
Temperature                      25
Humidity                         44
PM2.5                            36
PM10                             33
NO2                              49
SO2                              24
CO                               42
Proximity_to_Industrial_Areas    22
Population_Density               50
Air Quality                      45
dtype: int64

Missing values dan error pada 'Air Quality' telah diisi dengan modus.

Kolom 'Air Quality' telah dikonversi ke numerik.

Duplikat telah dihapus.

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values