In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('DataPreprocessingGraded_dataset.csv')

In [3]:
df.shape

(748, 6)

input features = shape[1]-1

If target is non-numeric → classification
Else if numeric but has small number of unique labels (<= 20) → classification
Else → regression

In [8]:


# List of input features
features = ['V1', 'V2', 'V3', 'V4', 'V5']

# Convert columns to numeric, coerce errors to NaN
df[features] = df[features].apply(pd.to_numeric, errors='coerce')

# Now compute variance ignoring NaN
variances = df[features].var(skipna=True)

# Feature with least variance
least_variance_feature = variances.idxmin()

print("Variances:\n", variances)
print("\nFeature with least variance:", least_variance_feature)


Variances:
 V1    6.550517e+01
V2    3.363720e+01
V3    2.131094e+06
V4    5.942242e+02
V5             NaN
dtype: float64

Feature with least variance: V2


In [9]:
features = ['V1', 'V2', 'V3']

# Function to count outliers using IQR method
def count_outliers(series):
    s = pd.to_numeric(series, errors='coerce').dropna()
    Q1 = s.quantile(0.25)
    Q3 = s.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return ((s < lower) | (s > upper)).sum()

# Count outliers for each feature
outlier_counts = {f: count_outliers(df[f]) for f in features}
outlier_counts_series = pd.Series(outlier_counts)

# Feature with least number of outliers
least_outliers_feature = outlier_counts_series.idxmin()

print("Outlier counts per feature:\n", outlier_counts_series)
print("\nFeature with least number of outliers:", least_outliers_feature)

Outlier counts per feature:
 V1     7
V2    42
V3    45
dtype: int64

Feature with least number of outliers: V1


In [10]:
target_col = 'Target'  # replace with actual target column name
target_counts = df[target_col].value_counts()
most_common_target = target_counts.idxmax()

print("\nTarget variable counts:\n", target_counts)
print(f"\nMost frequent target value: {most_common_target}")


Target variable counts:
 Target
NO     570
YES    178
Name: count, dtype: int64

Most frequent target value: NO


In [12]:
missing_counts = (df == '?').sum()

# Number of features with at least one missing value
num_features_with_missing = (missing_counts > 0).sum()

# Total number of missing values
total_missing = missing_counts.sum()

print("Number of features with missing values:", num_features_with_missing)
print("Total number of missing values:", total_missing)

Number of features with missing values: 0
Total number of missing values: 0


In [13]:
from sklearn.impute import SimpleImputer, KNNImputer
numeric_cols = ['V1', 'V2', 'V3', 'V4', 'V5']  # update as per your dataset
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# --- 1. SimpleImputer (median) for V1 ---
imp_median = SimpleImputer(strategy='median')
V1_imputed_median = imp_median.fit_transform(df[['V1']])
avg_V1_median = V1_imputed_median.mean()
print("Average of V1 after SimpleImputer (median):", round(avg_V1_median, 3))

# --- 2. KNNImputer (n_neighbors=3) for V1 ---
knn_imp = KNNImputer(n_neighbors=3)
V1_imputed_knn = knn_imp.fit_transform(df[['V1']])
avg_V1_knn = V1_imputed_knn.mean()
print("Average of V1 after KNNImputer:", round(avg_V1_knn, 3))

# --- 3. SimpleImputer (mean) for V2 ---
imp_mean = SimpleImputer(strategy='mean')
V2_imputed_mean = imp_mean.fit_transform(df[['V2']])
avg_V2_mean = V2_imputed_mean.mean()
print("Average of V2 after SimpleImputer (mean):", round(avg_V2_mean, 3))

Average of V1 after SimpleImputer (median): 9.552
Average of V1 after KNNImputer: 9.563
Average of V2 after SimpleImputer (mean): 5.464


In [1]:
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

estimators = [
    ('simpleImputer', SimpleImputer()),
    ('pca', PCA()),
    ('regressor', LinearRegression())
]

pipe = Pipeline(steps=estimators)
print(len(pipe.steps))


3


In [2]:
import numpy as np
from sklearn.preprocessing import add_dummy_feature

X = np.array([[1,4], [4, 6]])
X_new = add_dummy_feature(X)
print(X_new)


[[1. 1. 4.]
 [1. 4. 6.]]
