In [1]:
import pandas as pd
import numpy as np

# Read the salary records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="salaries.csv")

# Preview the top five rows as the cell's rich output
df.head(5)


Unnamed: 0,company,job,degree,salary_more_than_100k
0,google,sales executive,bachelors,0
1,google,sales executive,masters,0
2,google,business manager,bachelors,1
3,google,business manager,masters,1
4,google,computer programmer,bachelors,0


In [2]:
# Select every numeric column regardless of bit width (int32, float32, ...),
# not just int64/float64. In this dataset the only numeric column is the
# binary target salary_more_than_100k.
numerical_columns = df.select_dtypes(include="number")

# Column-wise spread statistics using the pandas defaults
std_deviation = numerical_columns.std()
variance = numerical_columns.var()

print("Standard Deviation:\n", std_deviation)
print("\nVariance:\n", variance)


Standard Deviation:
 salary_more_than_100k    0.5
dtype: float64

Variance:
 salary_more_than_100k    0.25
dtype: float64


In [3]:
# Step 3: Covariance and Correlation Coefficient

In [4]:
# Pairwise covariance between the numeric columns
covariance_matrix = numerical_columns.cov()
print("Covariance Matrix:\n", covariance_matrix)

# Pairwise correlation coefficients between the numeric columns
correlation_matrix = numerical_columns.corr()
print("\nCorrelation Matrix:\n", correlation_matrix)


Covariance Matrix:
                        salary_more_than_100k
salary_more_than_100k                   0.25

Correlation Matrix:
                        salary_more_than_100k
salary_more_than_100k                    1.0


In [5]:
# Step 4: Identify Independent Features

In [6]:
# Separate the predictors from the target by removing the target column
X = df.drop("salary_more_than_100k", axis=1)

# Report how many predictors remain and how many distinct values each one has
n_features = X.shape[1]
print("Total independent features:", n_features)
print("\nUnique values in each feature:\n", X.nunique())


Total independent features: 3

Unique values in each feature:
 company    3
job        3
degree     2
dtype: int64


In [8]:
# Step 5: Identify Unwanted Features (Redundant/Low Value)

# Report any feature that holds exactly one distinct value
for col in X.columns:
    distinct_values = X[col].unique()
    if len(distinct_values) == 1:
        print(f"'{col}' is constant and may be unwanted.")

# Optional: one-hot encode the features, then rank every dummy column by its
# correlation with the target (highest first)
X_encoded = pd.get_dummies(X)
target_column = df['salary_more_than_100k']
encoded_with_target = pd.concat([X_encoded, target_column], axis=1)
target_correlations = encoded_with_target.corr()['salary_more_than_100k']
correlation_with_target = target_correlations.sort_values(ascending=False)
print("\nCorrelation with Target:\n", correlation_with_target)



Correlation with Target:
 salary_more_than_100k      1.000000
company_facebook           0.600000
job_business manager       0.333333
degree_masters             0.258199
job_computer programmer   -0.034816
company_google            -0.200000
degree_bachelors          -0.258199
job_sales executive       -0.313340
company_abc pharma        -0.447214
Name: salary_more_than_100k, dtype: float64


In [9]:
 #Step 6: Data Discretization (Equi-Frequency Binning on Dummy 'Age')

In [10]:
# Fabricate an 'age' column for demo purposes: one integer in [22, 60) per row.
# Seeding the global NumPy generator keeps the draw repeatable.
np.random.seed(0)
simulated_ages = np.random.randint(22, 60, size=len(df))
df['age'] = simulated_ages

# Equi-frequency binning: split the ages at the quartiles into four labelled bins
age_labels = ['Low', 'Medium', 'High', 'Very High']
df['age_binned'] = pd.qcut(df['age'], q=4, labels=age_labels)

# Show the raw age next to its bin
df[['age', 'age_binned']]


Unnamed: 0,age,age_binned
0,22,Low
1,25,Low
2,25,Low
3,31,Medium
4,41,High
5,43,High
6,58,Very High
7,45,High
8,28,Medium
9,46,Very High


In [11]:
# Step 7: Normalize age using three methods

In [12]:
# Min-Max Normalization: shift by the minimum, then divide by the range
age_min = df['age'].min()
age_range = df['age'].max() - age_min
df['age_minmax'] = (df['age'] - age_min) / age_range

# Z-Score Normalization: centre on the mean, then divide by the standard deviation
age_centered = df['age'] - df['age'].mean()
df['age_zscore'] = age_centered / df['age'].std()

# Decimal Scaling
def decimal_scaling(x):
    """Scale a numeric Series by a power of ten so that max(|x|) drops below 1.

    The exponent ``j`` is the number of digits in the integer part of the
    largest absolute value (e.g. a maximum of 58 gives j = 2, so every value
    is divided by 100). When the largest absolute value is already below 1,
    ``j`` is 0 and the values are returned unscaled.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to scale.

    Returns
    -------
    pandas.Series
        ``x / 10 ** j``.
    """
    largest = x.abs().max()
    # Data already inside (-1, 1) needs no shift; without this guard int()
    # truncates to 0, len("0") is 1, and the data is needlessly divided by 10.
    # The comparison is also False for NaN (empty or all-missing input), which
    # avoids int(NaN) raising.
    if largest >= 1:
        j = len(str(int(largest)))
    else:
        j = 0
    return x / (10 ** j)

# Store the decimal-scaled ages alongside the other normalised columns
df['age_decimal'] = df['age'].pipe(decimal_scaling)

# Show the raw ages next to the three normalised versions
df[['age', 'age_minmax', 'age_zscore', 'age_decimal']]


Unnamed: 0,age,age_minmax,age_zscore,age_decimal
0,22,0.0,-1.437751,0.22
1,25,0.083333,-1.156069,0.25
2,25,0.083333,-1.156069,0.25
3,31,0.25,-0.592706,0.31
4,41,0.527778,0.346234,0.41
5,43,0.583333,0.534022,0.43
6,58,1.0,1.942431,0.58
7,45,0.638889,0.72181,0.45
8,28,0.166667,-0.874387,0.28
9,46,0.666667,0.815704,0.46


# Custom Variance Function (Without Using .var())



In [13]:
import pandas as pd

# Re-read the salaries CSV; rebinding df here replaces the earlier frame,
# including the derived age columns that were added to it
df = pd.read_csv(filepath_or_buffer="salaries.csv")

# Custom mean function
def mean_cal(col, data=None):
    """Return the arithmetic mean of one column, computed without ``.mean()``.

    Parameters
    ----------
    col : str
        Label of the column to average.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level ``df``.

    Returns
    -------
    float
        Sum of the values divided by their count.

    Raises
    ------
    ValueError
        If the column has no rows.
    TypeError
        If the values cannot be added to a number (e.g. text columns).
    """
    frame = df if data is None else data
    # Iterate by position: indexing with frame[col][i] is a *label* lookup and
    # breaks as soon as the index is not the default 0..N-1 range.
    values = frame[col].tolist()
    if not values:
        raise ValueError(f"cannot compute the mean of empty column '{col}'")
    total = 0
    for value in values:
        total += value
    return total / len(values)

# Custom variance function
def var_cal(col, data=None, ddof=0):
    """Return the variance of one column, computed without ``.var()``.

    Parameters
    ----------
    col : str
        Label of the column.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level ``df``.
    ddof : int, optional
        Delta degrees of freedom; the divisor is ``N - ddof``. The default 0
        gives the population variance, 1 gives the sample variance.

    Returns
    -------
    float
        Sum of squared deviations from the mean divided by ``N - ddof``.

    Raises
    ------
    ValueError
        If ``N - ddof`` is not positive (e.g. an empty column).
    TypeError
        If the values are not numeric.
    """
    frame = df if data is None else data
    values = frame[col].tolist()
    divisor = len(values) - ddof
    if divisor <= 0:
        raise ValueError(
            f"column '{col}' needs more than {ddof} value(s) to compute a variance"
        )
    mean_c = mean_cal(col, frame)
    squared_dev = 0
    for value in values:
        squared_dev += (value - mean_c) ** 2
    return squared_dev / divisor

# Run the hand-written variance on the target column and report it
variance = var_cal(col='salary_more_than_100k')
print("Calculated Variance (custom function):", variance)


Calculated Variance (custom function): 0.234375


In [14]:
# Compare the custom variance with pandas for every feature column in X
for feature in X.columns:
    try:
        custom_var = var_cal(feature)
        pandas_var = df[feature].var()
    except TypeError:
        # Text values cannot be summed with numbers, so var_cal raises
        # TypeError for them. Only that error is caught, so genuine problems
        # (missing names, interrupts, ...) still surface instead of being
        # silently reported as "non-numeric".
        print(f"Cannot calculate variance for '{feature}' (maybe non-numeric)")
    else:
        print(f"Variance of '{feature}': Custom = {custom_var:.4f} | Pandas = {pandas_var:.4f}")

Cannot calculate variance for 'company' (maybe non-numeric)
Cannot calculate variance for 'job' (maybe non-numeric)
Cannot calculate variance for 'degree' (maybe non-numeric)


In [15]:
# Custom standard deviation function (population)
def std_cal(col):
    """Return the population standard deviation of column ``col`` of ``df``.

    This is the square root of ``var_cal(col)``, so the sum-of-squared-
    deviations logic lives in one place instead of being copied here.

    Parameters
    ----------
    col : str
        Label of the column.

    Returns
    -------
    float
        ``var_cal(col) ** 0.5``.
    """
    return var_cal(col) ** 0.5

# Evaluate both estimators on the target column
target_name = 'salary_more_than_100k'
custom_std = std_cal(target_name)
pandas_std = df[target_name].std()

# Print the two results on aligned lines for easy comparison
print(f"Custom Std Dev (Population): {custom_std}")
print(f"Pandas Std Dev (Sample):     {pandas_std}")


Custom Std Dev (Population): 0.4841229182759271
Pandas Std Dev (Sample):     0.5
