In [8]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.decomposition import PCA

# Read the training CSV and keep only rows without any missing value.
data = pd.read_csv('train.csv').dropna()

# Split the frame: `y` is the 'target' column, `X` is every other column.
y = data['target']
X = data.drop(columns=['target'])

# Replace each object/category column of X with its ordinal-encoded values.
encoder = OrdinalEncoder()
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
X[categorical_columns] = encoder.fit_transform(X[categorical_columns])

# Standardise every column of the encoded feature frame.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit PCA with default arguments and project the scaled data onto its components.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Show the per-component ratios, their running total, and a preview of the projection.
explained_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio by each Principal Component:\n", explained_ratio)
print("Cumulative Explained Variance:\n", explained_ratio.cumsum())
print("Transformed Dataset (First 5 rows):\n", X_pca[:5])


Explained Variance Ratio by each Principal Component:
 [9.41279049e-02 7.70828774e-02 4.83420265e-02 4.76585205e-02
 4.53976277e-02 3.97132264e-02 3.47596681e-02 3.03668795e-02
 2.97979154e-02 2.71767810e-02 2.44059766e-02 2.32424681e-02
 2.26285439e-02 2.22246679e-02 2.05649154e-02 2.02878554e-02
 1.92480823e-02 1.62418309e-02 1.59513731e-02 1.55323496e-02
 1.47627057e-02 1.42461464e-02 1.41214873e-02 1.41034661e-02
 1.40595074e-02 1.40095927e-02 1.38500060e-02 1.37765456e-02
 1.32815229e-02 1.26782891e-02 1.24837515e-02 1.24003124e-02
 1.21071965e-02 1.16934854e-02 1.06621157e-02 1.05046091e-02
 9.38127120e-03 8.66796433e-03 8.33834774e-03 8.04261568e-03
 7.60543927e-03 7.05046489e-03 6.03255655e-03 5.48231456e-03
 5.23510420e-03 5.07453409e-03 4.91889086e-03 4.64268598e-03
 3.93945628e-03 3.76286361e-03 3.58155303e-03 3.19051728e-03
 3.12335010e-03 2.64671087e-03 2.19796215e-03 2.15053265e-03
 1.95978661e-03 1.84169504e-03 1.80899229e-03 1.34115176e-03
 1.13088582e-03 8.91979544e-04

In [9]:
import numpy as np
from sklearn.decomposition import PCA

# Fit a default PCA on `X_scaled` (the scaled feature matrix built in an earlier cell).
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Running total of the explained-variance ratios, one entry per component.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# argmax over a boolean array yields the position of the first True;
# adding 1 converts that zero-based position into a component count.
reaches_70_percent = cumulative_variance >= 0.70
num_components = reaches_70_percent.argmax() + 1

print("Number of components required to explain at least 70% of the variance:", num_components)


Number of components required to explain at least 70% of the variance: 22


In [10]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

# Component count at which the cumulative explained variance of the
# already-fitted `pca` first reaches 70%.
cumulative_ratio = pca.explained_variance_ratio_.cumsum()
num_components = (cumulative_ratio >= 0.70).argmax() + 1

# Fit a PCA restricted to that many components and project the scaled data.
pca_70 = PCA(n_components=num_components)
X_pca_70 = pca_70.fit_transform(X_scaled)

# Map the reduced representation back into the original feature space.
X_reconstructed = pca_70.inverse_transform(X_pca_70)

# Mean squared difference between the scaled data and its reconstruction.
mse = mean_squared_error(X_scaled, X_reconstructed)

print(f"Number of components used: {num_components}")
print(f"Mean Squared Error (MSE): {mse}")


Number of components used: 22
Mean Squared Error (MSE): 0.28044020908124623


In [11]:
import numpy as np
from sklearn.decomposition import PCA

# Fit a default PCA on `X_scaled` (the scaled feature matrix built in an earlier cell).
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Running total of the explained-variance ratios, one entry per component.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# The running total after 40 components sits at zero-based position 40 - 1.
variance_explained_40 = cumulative_variance[40 - 1]

print(f"Variance explained by the first 40 components: {variance_explained_40:.2f}")


Variance explained by the first 40 components: 0.92


In [13]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

# Reload the training CSV and keep only rows without any missing value.
data = pd.read_csv('train.csv').dropna()

# Split the frame: `y` is the 'target' column, `X` is every other column.
y = data['target']
X = data.drop(columns=['target'])

# Replace each object/category column of X with its ordinal-encoded values.
# OrdinalEncoder and StandardScaler are not imported in this cell; they must
# already be in the kernel namespace from an earlier cell.
encoder = OrdinalEncoder()
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
X[categorical_columns] = encoder.fit_transform(X[categorical_columns])

# Standardise every column of the encoded feature frame.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep the 15 features with the highest f_classif score.
# The selector is fitted on the encoded frame `X`, not on `X_scaled`.
# NOTE(review): the recorded run printed a warning from `f = msb / msw`;
# presumably some columns produce an undefined score — confirm.
selector = SelectKBest(score_func=f_classif, k=15)
X_selected = selector.fit_transform(X, y)

# Translate the selector's boolean support mask into column names.
selected_features = X.columns[selector.get_support()]

print("Selected Features:\n", selected_features)
print("Shape of dataset after feature selection:", X_selected.shape)


Selected Features:
 Index(['EngineVersion', 'RealTimeProtectionState', 'AntivirusConfigID',
       'NumAntivirusProductsInstalled', 'Processor', 'IsSystemProtected',
       'ProcessorCoreCount', 'PrimaryDiskCapacityMB', 'TotalPhysicalRAMMB',
       'PowerPlatformRole', 'OSArchitecture', 'IsTouchEnabled',
       'IsAlwaysOnAlwaysConnectedCapable', 'IsGamer', 'DateOS'],
      dtype='object')
Shape of dataset after feature selection: (96695, 15)


  f = msb / msw


In [14]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

# Reload the training CSV and keep only rows without any missing value.
data = pd.read_csv('train.csv')
data = data.dropna()

# Split the frame: `X` is every column except 'target', `y` is the 'target' column.
X = data.drop(columns=['target'])
y = data['target']

# Replace each object/category column of X with its ordinal-encoded values.
# OrdinalEncoder and StandardScaler are not imported in this cell; they must
# already be in the kernel namespace from an earlier cell.
categorical_columns = X.select_dtypes(include=['object', 'category']).columns
encoder = OrdinalEncoder()
X[categorical_columns] = encoder.fit_transform(X[categorical_columns])

# Standardise every column of the encoded feature frame.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Score every feature with f_classif (the selector is fitted on `X`, not `X_scaled`).
selector = SelectKBest(score_func=f_classif, k=15)
selector.fit(X, y)

# One score per column of X, in column order.
scores = selector.scores_

# The recorded run warned about `f = msb / msw`, so some scores may be NaN.
# The built-in max() is order-dependent in that case (a leading NaN is
# returned because every comparison against NaN is False), so use nanmax,
# which ignores NaN entries.
best_feature_score = np.nanmax(scores)

print(f"Score of the best feature: {best_feature_score:.2f}")


Score of the best feature: 2286.23


  f = msb / msw
