In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [5]:
# 1. Load Dataset
# Read the raw email CSV into the working frame used by every later step
csv_path = 'emails (1).csv'
df = pd.read_csv(csv_path)

In [6]:
# Pre-step: Create numerical features for analysis
# Use the vectorised `.str` accessor instead of `apply(len)` / `apply(split)`:
# it yields the same character and whitespace-separated word counts, but a
# missing `text` value propagates as NaN rather than raising, so the
# missing-value check in TASK 1 can still run on a file with blank rows.
df['text_len'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()

In [7]:
# --- TASK 1: Identify Data Quality Issues ---
# Report the number of nulls per column, then the dtype of each column
null_counts = df.isna().sum()
print("Missing Values:\n", null_counts)
column_types = df.dtypes
print("\nData Types:\n", column_types)

Missing Values:
 text          0
spam          0
text_len      0
word_count    0
dtype: int64

Data Types:
 text            str
spam          int64
text_len      int64
word_count    int64
dtype: object


In [8]:
# --- TASK 2: Missing Value Strategy ---
# Simulate missing values on a copy of the data (the original `df` is left
# untouched); they are then imputed using the MEAN.
df_missing = df.copy()
# `text_len` is an integer column and NaN is a float: cast explicitly so the
# assignment below does not rely on pandas' deprecated silent upcasting.
df_missing['text_len'] = df_missing['text_len'].astype('float64')
# `.loc` slices by label and includes both ends, so labels 0 through 5 are blanked.
df_missing.loc[0:5, 'text_len'] = np.nan  # Simulate missing data

In [9]:
# Strategy: Imputation with Mean
# `.mean()` skips NaN by default, so the fill value comes from the observed rows only
text_len_with_gaps = df_missing['text_len']
mean_value = text_len_with_gaps.mean()
df_missing['text_len'] = text_len_with_gaps.fillna(mean_value)
print(f"\nImputed missing values with mean: {mean_value}")


Imputed missing values with mean: 1557.746941628801


In [10]:
# --- TASK 3: Detect and Handle Outliers (IQR) ---
# Both quartiles of text_len in one call; their difference is the interquartile range
Q1, Q3 = df['text_len'].quantile([0.25, 0.75])
IQR = Q3 - Q1

In [11]:
# Outlier cut-offs sit 1.5 * IQR below the first and above the third quartile
fence_margin = 1.5 * IQR
lower_bound = Q1 - fence_margin
upper_bound = Q3 + fence_margin

In [12]:
# Filtering outliers
# Keep rows whose text_len lies within [lower_bound, upper_bound], both ends included
within_bounds = df['text_len'].between(lower_bound, upper_bound)
df_clean = df.loc[within_bounds].copy()
print(f"\nOutliers removed. Rows reduced from {len(df)} to {len(df_clean)}")


Outliers removed. Rows reduced from 5728 to 5355


In [13]:
# --- TASK 4: Normalize Numerical Features ---
# Min-Max Normalization (Scaling to 0-1)
# Shift by the column minimum, then divide by the column's range
len_floor = df_clean['text_len'].min()
len_span = df_clean['text_len'].max() - len_floor
df_clean['len_minmax'] = (df_clean['text_len'] - len_floor) / len_span

In [14]:
# Z-score Normalization (Standardization)
# Centre on the column mean and scale by the column standard deviation
# (pandas' `.std()` defaults to the sample estimate, ddof=1)
len_mean = df_clean['text_len'].mean()
len_std = df_clean['text_len'].std()
df_clean['len_zscore'] = df_clean['text_len'].sub(len_mean).div(len_std)

In [16]:
# --- TASK 5: PCA and Explained Variance ---
# Standardize the selected columns so both enter PCA on a common scale
features = ['text_len', 'word_count']
scaler = StandardScaler()
x = scaler.fit_transform(df_clean[features])

In [17]:
# Fit PCA once on the standardized matrix `x`. Two components are requested,
# matching the two input features, so the explained-variance split between
# them can be inspected afterwards.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)

In [19]:
# Share of total variance captured by each principal component, in component order
variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", variance_ratio)

Explained Variance Ratio: [0.99151578 0.00848422]
