In [3]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression

# Read the raw dataset from disk
df = pd.read_csv('data.csv')

# Report missingness: the full boolean mask first, then the per-column totals
print("Missing value matrix:\n", df.isna())
print("Missing values per column:\n", df.isna().sum())

# Deletion strategies, each producing a new frame and leaving df untouched:
# one without any incomplete column, one without any incomplete row
df_dropped_cols = df.dropna(axis='columns')
df_dropped_rows = df.dropna(axis='index')

# Replace gaps in the numerical column with that column's mean
if 'num_col' in df:
    num_col_mean = df['num_col'].mean()
    df['num_col'] = df['num_col'].fillna(num_col_mean)

# Mode imputation for categorical column.
# Series.mode() ignores NaN and returns every value tied for the highest
# count, so the result is empty when the column has no observed values.
if 'cat_col' in df.columns:
    mode_val = df['cat_col'].mode()
    if not mode_val.empty:
        # .iloc[0] is positional, so it does not rely on the result's index labels
        df['cat_col'] = df['cat_col'].fillna(mode_val.iloc[0])

# Replace gaps in the skewed column with that column's median
if 'skewed_col' in df:
    skewed_median = df['skewed_col'].median()
    df['skewed_col'] = df['skewed_col'].fillna(skewed_median)

# KNN imputation (numeric columns only).
# The numeric columns are selected once. Columns with no observed values are
# excluded up front: with its default settings KNNImputer omits such columns
# from its output, which would leave fewer result columns than labels.
numeric_df = df.select_dtypes(include='number').dropna(axis=1, how='all')
knn_imputer = KNNImputer(n_neighbors=3)
if numeric_df.empty:
    # No rows or no usable numeric columns: nothing to impute
    df_knn = numeric_df.copy()
else:
    df_knn = pd.DataFrame(
        knn_imputer.fit_transform(numeric_df),
        columns=numeric_df.columns,
        index=numeric_df.index,  # keep row labels aligned with df
    )

# Handling categorical with next frequent category.
# value_counts() skips NaN and orders categories by descending frequency, so
# position 1 is the second most common category. (Series.mode() only returns
# the values tied for the top count, so indexing it at 1 never reaches a less
# frequent category.)
# NOTE(review): if cat_col was already mode-imputed earlier in this script, no
# NaN remain and this fill is a no-op — confirm which strategy is intended.
if 'cat_col' in df.columns:
    category_counts = df['cat_col'].value_counts()
    if not category_counts.empty:
        # Fall back to the most frequent category when only one category exists
        next_mode = category_counts.index[1] if len(category_counts) > 1 else category_counts.index[0]
        df['cat_col'] = df['cat_col'].fillna(next_mode)

# Predictive modeling imputation: fit a linear model on the rows where
# 'target' is known and use it to fill the rows where 'target' is missing.
if 'target' in df.columns and df['target'].isnull().any():
    features = ['feature1', 'feature2']
    if all(col in df.columns for col in features):
        # Compute the mask once so the selection and the final assignment
        # are guaranteed to refer to the same rows
        missing_target = df['target'].isnull()

        # Train only on rows with a known target and complete features
        train = df[~missing_target].dropna(subset=features)

        # Rows to predict; copy so the fills below do not write into a slice of df
        test = df[missing_target].copy()

        # With no usable training rows the model cannot be fitted (and the
        # feature means would be NaN), so leave 'target' untouched in that case
        if not train.empty:
            # Fill missing feature values in the rows to predict with the
            # corresponding training-set mean
            for col in features:
                test[col] = test[col].fillna(train[col].mean())

            # Train model and predict
            model = LinearRegression()
            model.fit(train[features], train['target'])
            df.loc[missing_target, 'target'] = model.predict(test[features])


# Time series forward and backward fill
if 'date_column' in df.columns and 'value' in df.columns:
    # NOTE(review): this sorts on the raw column values; if date_column holds
    # strings in a non-ISO format the order may not be chronological — confirm.
    df = df.sort_values('date_column')
    # Forward fill carries the last observation forward; the backward fill then
    # covers any gap at the start. fillna(method=...) is deprecated in recent
    # pandas releases, so ffill()/bfill() are called directly.
    df['value'] = df['value'].ffill().bfill()


Missing value matrix:
       id  num_col  cat_col  skewed_col  feature1  feature2  target  \
0  False    False    False       False     False     False   False   
1  False     True    False       False     False     False    True   
2  False    False     True       False      True     False   False   
3  False    False    False        True     False     False   False   
4  False    False    False       False     False     False    True   
5  False    False     True       False     False     False   False   
6  False    False    False       False     False      True   False   
7  False    False    False       False     False     False   False   
8  False    False    False       False     False     False    True   
9  False     True    False        True     False     False   False   

   date_column  value  
0        False  False  
1        False  False  
2        False   True  
3        False  False  
4        False   True  
5        False  False  
6        False  False  
7        False

  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
