In [1]:

# Import of relevant packages
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, roc_auc_score, roc_curve, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier

In [None]:
# Load the training data and, in the same expression, discard the row
# identifier, the date and the additional target_* columns.
df = pd.read_csv('data/Train.csv').drop(
    columns=[
        "Place_ID X Date",
        "Date",
        "target_min",
        "target_max",
        "target_variance",
        "target_count",
    ]
)

# Quick overview: dimensions first, then dtypes and non-null counts.
print(df.shape)
df.info()

In [32]:
# Partition the column names by dtype: object columns are treated as
# categorical, everything else as numerical.
categorical_cols = list(df.select_dtypes(include=['object']).columns)
numerical_cols = list(df.select_dtypes(exclude=['object']).columns)

# One-hot encode Place_ID (first level dropped) and append the indicator
# columns to a copy of the frame. The original Place_ID column is kept.
one_hot_encoded = pd.get_dummies(df["Place_ID"], prefix="Place_ID", drop_first=True)
df_encoded = pd.concat([df.copy(), one_hot_encoded], axis=1)

In [None]:
# Numeric columns are the ones that get mean-imputed below
# (nothing is dropped in this cell).
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Work on a copy so the raw frame keeps its missing values for comparison.
df_filled = df.copy()

# Replace every NaN in a numeric column with that column's overall mean.
imputer = SimpleImputer(strategy='mean')
df_filled[numeric_columns] = imputer.fit_transform(df[numeric_columns])

# Report the number of imputed columns and the NaN totals before/after.
print(
    f"Numerische Spalten: {len(numeric_columns)}",
    f"NaN values before: {df.isna().sum().sum()}",
    f"NaN values after: {df_filled.isna().sum().sum()}",
    sep="\n",
)

Numerische Spalten: 75
NaN values before: 388262
NaN values after: 0


In [None]:
# NOTE: despite its name, `categorical_columns` holds the *numeric* columns
# (the name is kept so that other cells referring to it keep working).
# The drop of 'Place_ID' is only a safeguard in case it is ever read as numeric.
categorical_columns = df.select_dtypes(include=[np.number]).columns.drop(['Place_ID'], errors='ignore')

# Start again from the raw frame; this replaces any earlier `df_filled`.
df_filled = df.copy()

# Per-city mean of every numeric column, broadcast back to row level.
# A single grouped transform replaces one groupby per column.
city_means = df_filled.groupby('Place_ID')[categorical_columns].transform('mean')

# A city with no observation at all for a column has a NaN mean there;
# fall back to that column's global mean.
city_means = city_means.fillna(df_filled[categorical_columns].mean())

# Fill the gaps with the (city-specific, else global) means. Existing values
# are left untouched because fillna only writes where the frame is NaN.
df_filled[categorical_columns] = df_filled[categorical_columns].fillna(city_means)

print(f"Numerische Spalten: {len(categorical_columns)}")
print(f"NaN values before: {df.isna().sum().sum()}")
print(f"NaN values after: {df_filled.isna().sum().sum()}")

Numerische Spalten: 79
NaN values before: 388262
NaN values after: 0


In [None]:
# calculate the sum and percentage of NaN values in each column
def nans_sum_perc(x):
    """Summarise missing values per column of a DataFrame.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``x`` that contains at least one NaN, indexed
        by column name, with the columns ``'NaN Count'`` (number of missing
        entries) and ``'NaN %'`` (share of missing entries in percent,
        rounded to two decimals). Columns without NaNs are omitted, so the
        result is empty when nothing is missing.
    """
    n_rows = x.shape[0]

    # Count the missing entries of every column once.
    missing_by_col = {name: x[name].isna().sum() for name in x.columns}

    # Keep only the columns that actually have gaps.
    rows = {
        name: [missing, round((missing / n_rows) * 100, 2)]
        for name, missing in missing_by_col.items()
        if missing > 0
    }
    return pd.DataFrame.from_dict(rows, orient='index', columns=['NaN Count', 'NaN %'])

# Display the per-column NaN summary of the imputed frame.
nans_sum_perc(df_filled)

In [None]:
# Columns that must not be used as features: identifiers, the date, the
# target itself and the target-derived statistics.
non_feature_columns = [
    "Place_ID X Date",
    "Place_ID",
    "Date",
    "target",
    "target_min",
    "target_max",
    "target_variance",
    "target_count",
]

# Several of these columns are already removed when the CSV is loaded, so a
# strict drop raises KeyError on a fresh Restart & Run All.
# errors="ignore" drops whichever of them are still present.
X = df_filled.drop(columns=non_feature_columns, errors="ignore")

# The target is taken from the raw frame, not from the imputed copy.
y = df["target"]

In [None]:
# Feature selection using mutual information regression.
# The estimator adds a small amount of random noise to continuous features,
# so a fixed random_state is needed for the ranking to be reproducible.
mi_scores = mutual_info_regression(X, y, discrete_features="auto", random_state=42)

# One score per feature, labelled with the feature name, best first.
feature_ranking = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
feature_ranking

L3_CH4_CH4_column_volume_mixing_ratio_dry_air    0.410507
L3_CH4_solar_azimuth_angle                       0.409547
L3_CH4_sensor_zenith_angle                       0.398144
L3_CH4_sensor_azimuth_angle                      0.394324
L3_CH4_aerosol_optical_depth                     0.394247
                                                   ...   
L3_CLOUD_cloud_top_height                        0.022952
precipitable_water_entire_atmosphere             0.018861
L3_CLOUD_cloud_top_pressure                      0.018292
relative_humidity_2m_above_ground                0.013643
L3_AER_AI_absorbing_aerosol_index                0.011962
Length: 74, dtype: float64

In [33]:
# Exclude the 10 lowest-ranked features from the ranking.
# The ranking has one entry per column of X, so the number of features to keep
# is derived from X instead of trimming relative to the current length.
# This makes the cell idempotent: re-running it does not strip another 10.
n_features_to_drop = 10
n_features_to_keep = max(X.shape[1] - n_features_to_drop, 0)

feature_ranking = feature_ranking.iloc[:n_features_to_keep]
filter_list_features_1 = feature_ranking.index.tolist()