In [None]:
import numpy as np 
import pandas as pd

In [None]:
# Show every column when a frame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Read the train and test splits from CSV files in the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [None]:
# A column with exactly one distinct non-null value is treated as redundant.
# nunique() ignores NaN, so a constant column with gaps is still reported.
distinct_counts = df_train.nunique()
redundant_columns = distinct_counts[distinct_counts == 1].index.tolist()

print("Redundant Columns:", redundant_columns)

In [None]:
# Convert DateAS to a numeric POSIX timestamp (seconds) and SignatureVersion to
# an integer so that both can be passed to Series.corr.
# Both conversions overwrite df_train in place, so each is guarded by a dtype
# check to keep the cell safe to re-run: pd.to_datetime would read the already
# converted floats as nanosecond offsets, and integers have no .replace().
if not pd.api.types.is_numeric_dtype(df_train['DateAS']):
    df_train['DateAS'] = pd.to_datetime(df_train['DateAS']).apply(lambda ts: ts.timestamp())
if not pd.api.types.is_numeric_dtype(df_train['SignatureVersion']):
    # The dots are dropped and the remaining digits read as one integer,
    # e.g. '1.2.30' -> 1230.
    df_train['SignatureVersion'] = (
        df_train['SignatureVersion'].str.replace('.', '', regex=False).astype('int64')
    )

# Calculate correlations (Series.corr, Pearson by default) for the given column pairs.
# NOTE(review): Series.corr needs numeric data; if any of the OS* columns hold
# strings they must be encoded first -- confirm the dtypes in train.csv.
column_pairs = [
    ('DateAS', 'SignatureVersion'),
    ('OSBuildLab', 'NumericOSVersion'),
    ('OSEdition', 'OSSkuFriendlyName'),
    ('OSProductSuite', 'OSSkuFriendlyName'),
]
correlations = {
    f"{left} and {right}": df_train[left].corr(df_train[right])
    for left, right in column_pairs
}

# Find the pair with the highest positive correlation.
# NaN results (e.g. from a constant column) are excluded first: NaN compares
# False against everything, so leaving it in makes max() depend on dict order.
valid_correlations = {
    pair: value for pair, value in correlations.items() if pd.notna(value)
}
# None signals that no pair produced a usable correlation.
highest_corr_pair = (
    max(valid_correlations, key=valid_correlations.get) if valid_correlations else None
)

print("Correlations:", correlations)
print("Highest Positive Correlation:", highest_corr_pair)

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Separate the feature columns from the label column.
X = df_train.drop(columns=['target'])
y = df_train['target']

# Step 1: Fill missing values with the most frequent strategy.
# On a mixed-dtype frame SimpleImputer returns an object array, so a DataFrame
# rebuilt from it has dtype object in every column. infer_objects() restores
# the numeric dtypes; without it Step 2 would ordinal-encode numeric features too.
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = pd.DataFrame(
    imputer.fit_transform(X), columns=X.columns, index=X.index
).infer_objects()

# Step 2: Encode all object datatype columns in X using OrdinalEncoder.
# The column list is taken from X (the original dtypes), not from the imputed
# frame, so exactly the columns that were categorical on input get encoded.
object_columns = X.select_dtypes(include=['object']).columns
encoder = OrdinalEncoder()
if len(object_columns) > 0:
    X_imputed[object_columns] = encoder.fit_transform(X_imputed[object_columns])

# Step 3: Split the dataset into training and testing sets with test_size=0.2 and random_state=42
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Step 4: Train SGDClassifier with default parameters and random_state=42
clf = SGDClassifier(random_state=42)
clf.fit(X_train, y_train)

# Step 5: Make predictions and calculate accuracy score on the test data
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy Score:", accuracy)


In [9]:
from sklearn.preprocessing import OneHotEncoder

# Step 1: Keep only the columns whose dtype is 'object'.
cat_df = df_train.select_dtypes(include=['object'])

# Step 2: One-hot encode the low-cardinality columns.
# A column qualifies when it has at most 10 distinct non-null values.
distinct_per_column = cat_df.nunique()
columns_to_encode = distinct_per_column[distinct_per_column <= 10].index.tolist()

# Dense output so the result drops straight into a DataFrame; drop='first'
# omits the indicator for the first category of each feature.
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_data = encoder.fit_transform(cat_df[columns_to_encode])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(columns_to_encode),
)

# Swap the encoded source columns for their indicator columns.
untouched_df = cat_df.drop(columns=columns_to_encode)
final_cat_df = pd.concat([untouched_df, encoded_df], axis=1)

# Step 3: Report how many columns the combined frame has.
new_column_count = len(final_cat_df.columns)
print("New number of columns in cat_df:", new_column_count)

New number of columns in cat_df: 80


In [10]:
from sklearn.preprocessing import MinMaxScaler


# Step 1: Keep only the int64 and float64 columns of the training frame.
num_df = df_train.select_dtypes(include=['int64', 'float64'])

# Step 2: Rescale every column with MinMaxScaler (default settings).
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(num_df)
num_df_scaled = pd.DataFrame(scaled_values, columns=num_df.columns)

# Step 3: Add up each column, then add the column totals together.
column_totals = num_df_scaled.sum()
total_sum = column_totals.sum()

print("Sum of all values in num_df:", total_sum)

Sum of all values in num_df: 1637476.0102631948
