In [None]:
# Importing necessary libraries
from sklearn.feature_selection import VarianceThreshold
import pandas as pd

# Step 1: Load the dataset
# Provide the file path to your dataset
file_path = "/content/RegresiUTSTelkom.csv"
try:
    data = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Fail with a real exception instead of exit(): exit() is meant for
    # interactive shells and, inside a notebook, takes the kernel down rather
    # than reporting an ordinary error for this cell.
    raise FileNotFoundError(
        f"File not found at {file_path}. Please check the file path and try again."
    ) from err
print("Dataset successfully loaded!")

# Step 2: Rename columns to x1, x2, ..., xn
# NOTE(review): the file is read with pandas' default header handling, so its
# first row was used as the original column labels. Confirm the CSV really has
# a header row; if it does not, that first record is discarded by this rename.
column_names = [f"x{position}" for position in range(1, data.shape[1] + 1)]
data.columns = column_names
print("Columns renamed successfully to:", column_names)

# Show basic info and first few rows
print("Dataset Information:")
# info() writes its report itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the report.
data.info()
print("\nFirst five rows:")
print(data.head())

# Descriptive statistics
print("\nDescriptive statistics:")
print(data.describe())

# Step 3: Drop duplicate rows (if not done already)
data = data.drop_duplicates()
print(f"Duplicates removed. Data now has {data.shape[0]} rows and {data.shape[1]} columns.")

# Step 4: Remove Target Column (Assuming target is 'x1') for Feature Selection
# Stop with an exception rather than exit(): the following steps cannot run
# without `target`, and exit() would shut the notebook kernel down.
if 'x1' not in data.columns:
    raise KeyError("Target column 'x1' not found in the dataset.")
target = data['x1']
data = data.drop(columns=['x1'])
print("Target column 'x1' successfully separated.")

# Step 5: Identifying and Removing Low Correlation Features with Target
correlation_threshold = 0.1  # Minimum absolute correlation a feature must exceed to be kept
# Absolute value: strongly negative correlations are as informative as positive ones.
correlation_with_target = data.corrwith(target).abs()
selected_features = correlation_with_target[correlation_with_target > correlation_threshold].index

# Raise instead of exit(): nothing downstream can work with zero features, and
# exit() would shut the notebook kernel down instead of reporting an error.
if selected_features.empty:
    raise ValueError("No features found with correlation above the threshold.")

data_selected = data[selected_features]
print(f"Selected features based on correlation threshold: {selected_features.tolist()}")

# Step 6: Variance Threshold to Remove Low Variance Features
variance_threshold = 0.1  # Threshold for variance
selector = VarianceThreshold(threshold=variance_threshold)
try:
    data_high_variance = selector.fit_transform(data_selected)
except ValueError as err:
    # Re-raise (chained) instead of exit(): the underlying ValueError stays
    # visible in the traceback, and the notebook kernel is not shut down.
    raise ValueError(
        "Error: No feature met the variance threshold. Adjust the threshold and try again."
    ) from err
print("Low variance features removed.")

# Convert data_high_variance back to DataFrame after Variance Threshold
# get_support() is a boolean mask over the input columns; use it to recover
# the labels of the columns that survived.
kept_columns = data_selected.columns[selector.get_support()]
data_final = pd.DataFrame(data_high_variance, columns=kept_columns)
print(f"Data after variance thresholding has {data_final.shape[1]} features.")

# Step 7: Add the target column back to the processed data
# The target is attached purely by position, so both sides must have the same
# number of rows. Slicing the target to fit would hide a mismatch and pair
# feature rows with the wrong target values, so fail loudly instead.
if len(target) != data_final.shape[0]:
    raise ValueError(
        f"Row count mismatch: {data_final.shape[0]} feature rows vs {len(target)} target values."
    )
data_final['x1'] = target.values
print("Target column added back to the processed data.")

# Step 8: Write the processed frame to disk (row index omitted)
processed_file_path = "/content/RegresiUTSTelkomNEW.csv"
try:
    data_final.to_csv(processed_file_path, index=False)
except Exception as save_error:
    # Best effort: report the failure and carry on to the summary line below.
    print(f"Error saving the processed dataset: {save_error}")
else:
    print("Processed dataset saved successfully!")
    print("Processed dataset saved as:", processed_file_path)

# Closing summary of the resulting frame
final_shape = data_final.shape
print(f"Final dataset shape: {final_shape}")

Dataset successfully loaded!
Columns renamed successfully to: ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25', 'x26', 'x27', 'x28', 'x29', 'x30', 'x31', 'x32', 'x33', 'x34', 'x35', 'x36', 'x37', 'x38', 'x39', 'x40', 'x41', 'x42', 'x43', 'x44', 'x45', 'x46', 'x47', 'x48', 'x49', 'x50', 'x51', 'x52', 'x53', 'x54', 'x55', 'x56', 'x57', 'x58', 'x59', 'x60', 'x61', 'x62', 'x63', 'x64', 'x65', 'x66', 'x67', 'x68', 'x69', 'x70', 'x71', 'x72', 'x73', 'x74', 'x75', 'x76', 'x77', 'x78', 'x79', 'x80', 'x81', 'x82', 'x83', 'x84', 'x85', 'x86', 'x87', 'x88', 'x89', 'x90', 'x91']
Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7313 entries, 0 to 7312
Data columns (total 91 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x1      7313 non-null   int64  
 1   x2      7313 non-null   float64
 2   x3      7312 non-null   floa

In [None]:
# Importing necessary libraries
from sklearn.feature_selection import VarianceThreshold
import pandas as pd

# Step 1: Load the dataset
# Provide the file path to your dataset
file_path = "/content/RegresiUTSTelkom.csv"
try:
    data = pd.read_csv(file_path)
except FileNotFoundError:
    # `data` was never assigned on this path, so the handler must not reference
    # it and the remaining steps must not run. Report the problem and re-raise.
    print(f"File not found at {file_path}. Please check the file path and try again.")
    raise
except Exception as e:
    # The file exists but could not be loaded (e.g. a type or encoding problem):
    # show the reason, then stop so later steps do not run without `data`.
    print(f"An unexpected error occurred: {e}")
    raise
else:
    print("Dataset successfully loaded!")


# Step 2: Rename columns to x1, x2, ..., xn
# NOTE(review): the file is read with pandas' default header handling, so its
# first row was used as the original column labels. Confirm the CSV really has
# a header row; if it does not, that first record is discarded by this rename.
column_names = [f"x{position}" for position in range(1, data.shape[1] + 1)]
data.columns = column_names
print("Columns renamed successfully to:", column_names)

# Show basic info and first few rows
print("Dataset Information:")
# info() writes its report itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the report.
data.info()
print("\nFirst five rows:")
print(data.head())

# Descriptive statistics
print("\nDescriptive statistics:")
print(data.describe())

# Step 3: Drop duplicate rows (if not done already)
data = data.drop_duplicates()
print(f"Duplicates removed. Data now has {data.shape[0]} rows and {data.shape[1]} columns.")

# Step 4: Remove Target Column (Assuming target is 'x1') for Feature Selection
# Stop with an exception rather than exit(): the following steps cannot run
# without `target`, and exit() would shut the notebook kernel down.
if 'x1' not in data.columns:
    raise KeyError("Target column 'x1' not found in the dataset.")
target = data['x1']
data = data.drop(columns=['x1'])
print("Target column 'x1' successfully separated.")

# Step 5: Identifying and Removing Low Correlation Features with Target
correlation_threshold = 0.1  # Minimum absolute correlation a feature must exceed to be kept
# Absolute value: strongly negative correlations are as informative as positive ones.
correlation_with_target = data.corrwith(target).abs()
selected_features = correlation_with_target[correlation_with_target > correlation_threshold].index

# Raise instead of exit(): nothing downstream can work with zero features, and
# exit() would shut the notebook kernel down instead of reporting an error.
if selected_features.empty:
    raise ValueError("No features found with correlation above the threshold.")

data_selected = data[selected_features]
print(f"Selected features based on correlation threshold: {selected_features.tolist()}")

# Step 6: Variance Threshold to Remove Low Variance Features
variance_threshold = 0.1  # Threshold for variance
selector = VarianceThreshold(threshold=variance_threshold)
try:
    data_high_variance = selector.fit_transform(data_selected)
except ValueError as err:
    # Re-raise (chained) instead of exit(): the underlying ValueError stays
    # visible in the traceback, and the notebook kernel is not shut down.
    raise ValueError(
        "Error: No feature met the variance threshold. Adjust the threshold and try again."
    ) from err
print("Low variance features removed.")

# Convert data_high_variance back to DataFrame after Variance Threshold
# get_support() is a boolean mask over the input columns; use it to recover
# the labels of the columns that survived.
kept_columns = data_selected.columns[selector.get_support()]
data_final = pd.DataFrame(data_high_variance, columns=kept_columns)
print(f"Data after variance thresholding has {data_final.shape[1]} features.")

# Step 7: Add the target column back to the processed data
# The target is attached purely by position, so both sides must have the same
# number of rows. Slicing the target to fit would hide a mismatch and pair
# feature rows with the wrong target values, so fail loudly instead.
if len(target) != data_final.shape[0]:
    raise ValueError(
        f"Row count mismatch: {data_final.shape[0]} feature rows vs {len(target)} target values."
    )
data_final['x1'] = target.values
print("Target column added back to the processed data.")

# Step 8: Save the processed dataset
processed

File not found at /content/RegresiUTSTelkom.csv. Please check the file path and try again.


NameError: name 'data' is not defined