<a href="https://colab.research.google.com/github/ashish78905/OPTICONNECT_CALLL_CENTER_ANALYSIS-ASSIGNMENT/blob/main/clean_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# HANDLING MISSING VALUES

In [None]:
# Template of common missing-value operations. `df` and the quoted column
# names are placeholders: substitute a real DataFrame / real columns.

# 1. Count the missing values in each column
df.isnull().sum()

# 2. Drop every row that contains a missing value (row-wise is the default).
#    The result is returned, not stored: assign it if it should be kept.
df.dropna()

# 3. Drop every column that contains a missing value
df.dropna(axis=1)

# 4. Impute missing values of a numerical column with its mean
#    (result is only displayed here; step 6 shows how to store it)
df['any_numerical_column'].fillna(df['any_numerical_column'].mean())

# 5. Fill missing values with a constant (e.g., 0 or a placeholder character)
df['any_column'].fillna(0)

# 6. Impute missing values of a numerical column with its median and store it
df['any_numerical_column'] = df['any_numerical_column'].fillna(df['any_numerical_column'].median())

# 7. Check overall dataframe info (datatypes, non-null counts, memory usage)
df.info()

# 8. Check the shape the dataframe would have after dropping rows with missing values
df.dropna(axis=0).shape

# 9. Visualize the distribution of a numerical column
#    Roughly normal distribution → impute with mean, skewed → impute with median
sns.histplot(df['any_numerical_column'], kde=True)

# 10. Visualize the distribution after imputation (mean/median).
#     histplot is used instead of the deprecated distplot, matching step 9.
sns.histplot(df['any_numerical_column_after_imputation'], kde=True)

# 11. For categorical data, check the mode of the column.
#     mode() ignores missing values by default, so no notna() filter is needed.
df['any_categorical_column'].mode()

# 12. Final check to see if any null values remain in the categorical column
df['any_categorical_column'].isna().sum()


# HANDLING IMBALANCED DATASET

In [None]:
# ================================================================
# Handling Imbalanced Dataset (Any Dataset) - Same Sequence Preserved
# ================================================================

import numpy as np
import pandas as pd

np.random.seed(1)  # fixed seed so the generated samples are reproducible

# 1. Define dataset size and imbalance ratio
no_samples = 1000
class_0_ratio = 0.9
no_class_0 = int(no_samples * class_0_ratio)  # majority class → 900 samples
# Derive the minority size from the totals so it stays consistent if
# no_samples or class_0_ratio is changed (still 100 with the values above).
no_class_1 = no_samples - no_class_0

no_class_0, no_class_1  # check number of samples per class

# NOTE: this draw is only displayed, but it advances the RNG state, so it is
# kept to leave the feature values generated below unchanged.
len(np.random.normal(0, 1, no_class_0))  # length of generated majority class samples

# 2. Create majority class (class 0): both features drawn from N(0, 1)
class_0 = {
    'feature1': np.random.normal(0, 1, no_class_0),
    'feature2': np.random.normal(0, 1, no_class_0),
    'target': [0] * no_class_0
}
class_0 = pd.DataFrame(class_0)
class_0  # check class 0 dataframe

# 3. Create minority class (class 1): both features drawn from N(3, 1)
class_1 = pd.DataFrame({
    'feature1': np.random.normal(3, 1, no_class_1),
    'feature2': np.random.normal(3, 1, no_class_1),
    'target': [1] * no_class_1
})
class_1  # check class 1 dataframe

# 4. Combine both classes; reset_index gives one continuous 0..n-1 index
df = pd.concat([class_0, class_1]).reset_index(drop=True)
df  # final dataset

# 5. Check imbalance in target
df.target.value_counts()  # shows 900 vs 100

# ------------------------------------------------
# UPSAMPLING
# ------------------------------------------------

from sklearn.utils import resample  # resampling utility

# 6. Split the dataset by class label
minority_mask = df['target'] == 1
majority_mask = df['target'] == 0
df_minority = df.loc[minority_mask]
df_majority = df.loc[majority_mask]

# 7. Grow the minority class by sampling its rows with replacement until it
#    has as many rows as the majority class
majority_size = df_majority.shape[0]
df_minority_upsampled = resample(
    df_minority, replace=True, n_samples=majority_size, random_state=1
)

df_minority_upsampled.shape  # minority shape after upsampling

# 8. Stack the untouched majority rows on top of the upsampled minority rows
upsampled_parts = [df_majority, df_minority_upsampled]
df_upsampled = pd.concat(upsampled_parts)
df_upsampled  # final upsampled dataframe

df_upsampled['target'].value_counts()  # both classes now have equal counts

# ------------------------------------------------
# DOWNSAMPLING
# ------------------------------------------------

# 9. Shrink the majority class: draw rows without replacement until it has
#    as many rows as the minority class
minority_size = df_minority.shape[0]
df_majority_downsampled = resample(
    df_majority, replace=False, n_samples=minority_size, random_state=1
)

downsampled_parts = [df_majority_downsampled, df_minority]
df_downsampled = pd.concat(downsampled_parts)
df_downsampled  # downsampled dataset

df_downsampled['target'].value_counts()  # both classes now have equal counts

# ------------------------------------------------
# SMOTE
# ------------------------------------------------

from sklearn.datasets import make_classification

# 10. Create imbalanced dataset for SMOTE demonstration
X, y = make_classification(
    n_samples=1000,
    n_redundant=0,
    n_features=2,
    n_clusters_per_class=1,
    weights=[0.90],   # ~90% of samples go to class 0; the rest to class 1
    random_state=1
)

X  # generated features
y  # generated target

len(y[y == 0])  # count of class 0

# 11. Convert to DataFrame
df1 = pd.DataFrame(X, columns=['f1', 'f2'])
df2 = pd.DataFrame(y, columns=['target'])
final_df = pd.concat([df1, df2], axis=1)
final_df  # dataset before SMOTE

final_df.target.value_counts()  # imbalance in target

import matplotlib.pyplot as plt

# Separate figure so the "before" and "after" scatters do not share axes
plt.figure()
plt.scatter(final_df['f1'], final_df['f2'], c=final_df['target'])
plt.title("Before SMOTE")
# visualization before SMOTE

# 12. Apply SMOTE
from imblearn.over_sampling import SMOTE

# random_state fixed so the synthetic samples are reproducible, in line with
# the seeded resample / make_classification calls elsewhere in this notebook
oversample = SMOTE(random_state=1)  # create SMOTE object

# NOTE: X and y are rebound here to the resampled features / target
X, y = oversample.fit_resample(final_df[['f1', 'f2']], final_df['target'])

X.shape  # resampled features shape
y.shape  # resampled target shape

len(y[y == 0])  # new class 0 count
len(y[y == 1])  # new class 1 count

# 13. Convert back to DataFrame
df1 = pd.DataFrame(X, columns=['f1', 'f2'])
df2 = pd.DataFrame(y, columns=['target'])
oversample_df = pd.concat([df1, df2], axis=1)
oversample_df  # dataset after SMOTE

plt.figure()
plt.scatter(oversample_df['f1'], oversample_df['f2'], c=oversample_df['target'])
plt.title("After SMOTE")
# visualization after SMOTE

oversample_df[oversample_df.target == 1]  # check minority samples


# INTERPOLATION

In [None]:
# ================================================================
# Interpolation Techniques (Any Dataset) - Same Sequence Preserved
# ================================================================

import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d

# ------------------------------------------------
# 1. Linear Interpolation
# ------------------------------------------------

# Known sample points (y = 2x - 1)
x = np.arange(1, 6)
y = 2 * x - 1

# Ten evenly spaced query positions across the sampled range, and their
# piecewise-linear interpolated values
x_new = np.linspace(x[0], x[-1], num=10)
y_interp = np.interp(x_new, xp=x, fp=y)

plt.scatter(x, y)              # original data points
plt.scatter(x_new, y_interp)   # interpolated points

# ------------------------------------------------
# 2. Cubic Interpolation
# ------------------------------------------------

x = np.array([1, 2, 3, 4, 5])
y = np.array([1, 8, 27, 64, 125])  # y = x**3

# cubic interpolation function built from the sample points
f = interp1d(x, y, kind='cubic')

x_new = np.linspace(1, 5, 10)  # new x values inside the sampled range
y_interp = f(x_new)

y_interp  # interpolated values

# New figure so these points are not drawn over the previous section's plot;
# the original points are plotted once (they were previously scattered twice).
plt.figure()
plt.scatter(x, y)             # original points
plt.scatter(x_new, y_interp)  # cubic interpolated points

# ------------------------------------------------
# 3. Polynomial Interpolation
# ------------------------------------------------

x = np.array([1, 2, 3, 4, 5])
y = np.array([1, 4, 9, 16, 25])  # y = x**2 (4th value was mistyped as 1)

# Fit a degree-2 polynomial. np.polyfit is a least-squares fit; because the
# data here is exactly quadratic, the fitted curve passes through every point.
p = np.polyfit(x, y, 2)

x_new = np.linspace(1, 5, 10)
y_interp = np.polyval(p, x_new)  # evaluate the fitted polynomial at x_new

# New figure so these points are not drawn over the previous section's plot
plt.figure()
plt.scatter(x, y)             # original points
plt.scatter(x_new, y_interp)  # polynomial interpolated points


# HANDLING OUTLIERS

In [None]:
# ================================================================
# Outlier Detection and Handling Techniques (Any Dataset)
# ================================================================

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ------------------------------------------------
# 1. Create Data
# ------------------------------------------------
# Sample salaries; 11 and 140 sit far from the bulk of the values
salary = [
    11, 40, 45, 68, 65, 68, 78, 90, 57, 74,
    91, 92, 88, 68, 57, 48, 99, 101, 68, 77,
    110, 140,
]

# Single-column frame holding the salaries, followed by summary statistics
df = pd.DataFrame({'Salary': salary})
df.describe()

# ------------------------------------------------
# 2. Check Outliers → distplot & boxplot
# ------------------------------------------------
# One figure, two side-by-side panels: distribution on the left, box plot on
# the right
fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

sns.histplot(df['Salary'], kde=True, ax=hist_ax)
hist_ax.set_title("Dist plot")

sns.boxplot(data=df, x='Salary', ax=box_ax)
box_ax.set_title("Box plot")

# ------------------------------------------------
# 3. Dropping the Outlier using IQR Method
# ------------------------------------------------
# Quartiles and inter-quartile range of the salary column
Q1, Q3 = df['Salary'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences sit 1.5 * IQR beyond the quartiles
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

lower_fence   # lowest side outlier threshold
upper_fence   # highest side outlier threshold

# Keep only rows whose salary lies within the fences (both ends inclusive)
within_fences = df['Salary'].between(lower_fence, upper_fence)
df_filtered = df[within_fences]
df_filtered.shape  # shape after removing outliers

# confirm there is no outlier after filtering
fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

sns.histplot(df_filtered['Salary'], kde=True, ax=hist_ax)
hist_ax.set_title("Dist plot after dropping outliers")

sns.boxplot(data=df_filtered, x='Salary', ax=box_ax)
box_ax.set_title("Box plot after dropping outliers")

# ------------------------------------------------
# 4. Imputation with Mean
# ------------------------------------------------
# Outliers are values strictly outside the IQR fences. Strict comparisons keep
# this consistent with the drop-based filter, which retains values lying
# exactly on a fence.
is_outlier = (df['Salary'] > upper_fence) | (df['Salary'] < lower_fence)

# Replace each outlier with the mean of the Salary column; other values stay
df['Salary_imputed_mean'] = np.where(
    is_outlier,
    df['Salary'].mean(),
    df['Salary']
)

# check after mean imputation
# (new figure so these plots are not drawn over the previous figure's axes)
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
sns.histplot(df['Salary_imputed_mean'], kde=True)
plt.title("Dist plot after mean imputation")

plt.subplot(1, 2, 2)
sns.boxplot(data=df, x='Salary_imputed_mean')
plt.title("Box plot after mean imputation")

# ------------------------------------------------
# 5. Imputation with Median
# ------------------------------------------------
# Outliers are values strictly outside the IQR fences. Strict comparisons keep
# this consistent with the drop-based filter, which retains values lying
# exactly on a fence.
is_outlier = (df['Salary'] > upper_fence) | (df['Salary'] < lower_fence)

# Replace each outlier with the median of the Salary column; other values stay
df['Salary_imputed_median'] = np.where(
    is_outlier,
    df['Salary'].median(),
    df['Salary']
)

# check after median imputation
# (new figure so these plots are not drawn over the previous figure's axes)
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
sns.histplot(df['Salary_imputed_median'], kde=True)
plt.title("Dist plot after median imputation")

plt.subplot(1, 2, 2)
sns.boxplot(data=df, x='Salary_imputed_median')
plt.title("Box plot after median imputation")

plt.show()

# ------------------------------------------------
# 6. Capping Technique (Replace Outliers with 5th & 95th Percentiles)
# ------------------------------------------------
lower_cap = df['Salary'].quantile(0.05)  # lower cap (5th percentile)
upper_cap = df['Salary'].quantile(0.95)  # upper cap (95th percentile)

lower_cap, upper_cap

# Values below lower_cap become lower_cap, values above upper_cap become
# upper_cap, everything in between is kept. Series.clip does this in one call
# instead of nested np.where.
df['Salary_capped'] = df['Salary'].clip(lower=lower_cap, upper=upper_cap)

# confirm after capping (explicit figure so the layout matches earlier plots)
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
sns.histplot(df['Salary_capped'], kde=True)
plt.title("Dist plot after capping")

plt.subplot(1, 2, 2)
sns.boxplot(data=df, x='Salary_capped')
plt.title("Box plot after capping")

plt.show()


# FEATURE SCALING

In [None]:
# ================================================================
# Feature Scaling Techniques (Any Dataset)
# ================================================================

import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize

# ------------------------------------------------
# 1. Load Data
# ------------------------------------------------
df = sns.load_dataset('tips')
df

# Note:
# Scaling only shifts and rescales the values; the shape of each feature's
# distribution is the same before and after scaling.

# ------------------------------------------------
# 2. Standardization (mu=0, sigma=1)
# ------------------------------------------------
scaler = StandardScaler()   # create object of StandardScaler
scaler   # object created

numeric_cols = ['total_bill', 'tip']

# fit on the training data and transform it (done once and reused below)
scaled = scaler.fit_transform(df[numeric_cols])
scaled

# show standardized data as DataFrame
pd.DataFrame(scaled, columns=numeric_cols)

# transform a new value (NOTE: we do NOT fit on test data).
# Passed as a DataFrame with the same column names the scaler was fitted on,
# which avoids scikit-learn's "X does not have valid feature names" warning.
scaler.transform(pd.DataFrame([[13, 1]], columns=numeric_cols))

# ------------------------------------------------
# 3. Min-Max Scaling (Normalization → values between 0 and 1)
# ------------------------------------------------
min_max = MinMaxScaler()   # create object of MinMaxScaler

# apply fit_transform on training data
min_max.fit_transform(df[['total_bill', 'tip']])

# transform a new value (again: only transform on test data).
# Passed as a DataFrame with the fitted column names to avoid scikit-learn's
# "X does not have valid feature names" warning.
min_max.transform(pd.DataFrame([[10, 2]], columns=['total_bill', 'tip']))

# ------------------------------------------------
# 4. Unit Vector Scaling
# ------------------------------------------------
# (normalize data vectors → length = 1)
# normalize() rescales each ROW to unit length. With a single column every
# non-zero value collapses to 1, so two features are used to make the
# demonstration meaningful.
uv = normalize(df[['total_bill', 'tip']])

uv   # each row (total_bill, tip) now has length 1


# ENCODING

In [None]:
# ================================================================
# Data Encoding Techniques (Any Dataset)
# ================================================================

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

# ------------------------------------------------
# 1. One Hot Encoding (Nominal Data)
# ------------------------------------------------
df = pd.DataFrame({'status': ['single', 'married', 'separated',
                              'single', 'single', 'married', 'married']})
df   # original dataframe

encoder = OneHotEncoder()   # create OneHotEncoder object
encoder   # confirm object created

# fit_transform returns a sparse matrix; toarray() makes it a dense array
encoded = encoder.fit_transform(df[['status']]).toarray()   # encoded array
encoder.get_feature_names_out()   # column names after encoding

encoder_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out())
encoder_df   # encoded DataFrame

# new data transformation: passed as a DataFrame with the fitted column name
# to avoid scikit-learn's "X does not have valid feature names" warning
encoder.transform(pd.DataFrame({'status': ['single']})).toarray()

# combine encoded columns with original
pd.concat([df, encoder_df], axis=1)

# Note: In practice, we delete the original column and keep n-1 encoded columns.

# ------------------------------------------------
# 2. Label Encoding
# ------------------------------------------------
label_encoder = LabelEncoder()

# LabelEncoder expects 1-D input: pass the column as a Series and new values
# as a flat list (2-D input triggers a DataConversionWarning).
label_encoder.fit_transform(df['status'])   # encodes status column
label_encoder.transform(['single'])
label_encoder.transform(['separated'])

# ------------------------------------------------
# 3. Ordinal Encoding
# ------------------------------------------------
# Example: High School < Graduate < Post Graduate < PhD

df = pd.DataFrame({"qualification": ["HS", "GR", "PG", "PhD",
                                     "HS", "HS", "PhD", "PG"]})
df   # original dataframe

# categories lists the levels from lowest to highest rank
encoder = OrdinalEncoder(categories=[["HS", "GR", "PG", "PhD"]])   # define order
encoder.fit_transform(df[['qualification']])   # apply encoding

# new data transformation: passed as a DataFrame with the fitted column name
# to avoid scikit-learn's "X does not have valid feature names" warning
encoder.transform(pd.DataFrame({"qualification": ["PG"]}))

# ------------------------------------------------
# 4. Target Guided Ordinal Encoding
# ------------------------------------------------
# Replace category with mean (or median) of target variable

# Three meal times repeated three times, paired with the bill for each visit
meal_times = ['lunch', 'breakfast', 'dinner'] * 3
bills = [120, 130, 90, 125, 150, 190, 160, 180, 189]

df = pd.DataFrame({'time': meal_times, 'total_bill': bills})
df   # original dataframe

# Average bill per meal time, as a {category: mean} lookup table
bill_by_time = df['total_bill'].groupby(df['time'])
mean_price = bill_by_time.mean().to_dict()
mean_price

# Swap each category for its average bill
encoded_time = df['time'].map(mean_price)
df['time_encoded'] = encoded_time
df
