In [2]:
import numpy as np 
import pandas as pd
import seaborn as sns

In [117]:
# Lift the column display limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)

# Load the training and test tables from CSV files in the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df_train.describe().T

In [None]:
# Total number of missing cells across the whole training frame
# (per-column counts summed into a single scalar).
df_train.isnull().sum().sum()

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# Column labels of the training frame.
df_train.columns

In [None]:
# Per-column dtype and non-null count summary.
df_train.info()

In [45]:

# Split the training frame by dtype: object (text-like) columns on one side,
# 64-bit integer/float columns on the other.
object_col = df_train.select_dtypes(include='object')
intcol = df_train.select_dtypes(include=('int64', 'float64'))


In [130]:
# Fill gaps in the object columns with each column's most frequent value
# (first row of .mode()).
cat_cols = object_col.columns
cat_fill = df_train[cat_cols].mode().iloc[0]
df_train[cat_cols] = df_train[cat_cols].fillna(cat_fill)

# Fill gaps in the numeric columns with each column's mean.
num_cols = intcol.columns
num_fill = df_train[num_cols].mean()
df_train[num_cols] = df_train[num_cols].fillna(num_fill)

In [195]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Scale numerical features (currently disabled: the models below are fit on unscaled features)
# numerical_columns = df_train.select_dtypes(include=['int64', 'float64']).columns
# scaler = StandardScaler()
# df_train[numerical_columns] = scaler.fit_transform(df_train[numerical_columns])

In [146]:
# Integer-encode every remaining object column, keeping one fitted encoder per
# column in `label_encoders` so the string <-> code mapping stays available.
categorical_columns = df_train.select_dtypes(include='object').columns
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, encoder in label_encoders.items():
    df_train[col] = encoder.fit_transform(df_train[col])

In [132]:
def _is_multivalued_integer(series):
    """Return True for an int64 column with more than two distinct, whole-number values."""
    if not (series.dtype == 'int64'):
        return False
    distinct = series.unique()
    return len(distinct) > 2 and all(v == int(v) for v in distinct)


# Step 1: binary features -- columns with exactly two distinct non-null values.
binary_features = [name for name in df_train.columns if df_train[name].nunique() == 2]

# Step 2: candidate ordinal features -- int64 columns with more than two distinct values.
ordinal_features = [
    name for name in df_train.columns if _is_multivalued_integer(df_train[name])
]


In [125]:
# Columns to be cast to integer dtype (adjust to the actual column names in use).
float_columns = [
    'RealTimeProtectionState',
    'AntivirusConfigID',
    'NumAntivirusProductsInstalled',
    'CityID',
    'GeoRegionID',
    'IsSystemProtected',
    'SMode',
    'IEVersionID',
    'FirewallEnabled',
    'EnableLUA',
    'OEMNameID',
    'OEMModelID',
    'ProcessorCoreCount',
    'ProcessorManufacturerID',
    'ProcessorModelID',
    'PrimaryDiskCapacityMB',
    'SystemVolumeCapacityMB',
    'TotalPhysicalRAMMB',
    'PrimaryDisplayDiagonalInches',
    'PrimaryDisplayResolutionHorizontal',
    'PrimaryDisplayResolutionVertical',
    'OSInstallLanguageID',
    'IsFlightsDisabled',
    'FirmwareManufacturerID',
    'NumAntivirusProductsEnabled',
    'IsAlwaysOnAlwaysConnectedCapable',
    'IsGamer',
    'RegionIdentifier',
    'IsVirtualDevice',
    'FirmwareVersionID',
]

# The cast is all-or-nothing: it runs only when every listed column is present;
# otherwise the absent names are reported and the frame is left untouched.
missing_columns = [name for name in float_columns if name not in df_train.columns]
if not missing_columns:
    # Any remaining NaN becomes -1 before the integer cast.
    df_train[float_columns] = df_train[float_columns].fillna(-1).astype(int)
else:
    print(f"Missing columns: {missing_columns}")


In [149]:
# Separate the label column from the predictor columns.
y = df_train['target']
X = df_train.drop('target', axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation (test_size=0.1, i.e. a 90:10 split).
# stratify=y keeps the class proportions of the target the same in both parts,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

print(f"\nTraining Set Shape: {X_train.shape}")
print(f"Testing Set Shape: {X_test.shape}")


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Build the gradient boosting classifier with a fixed seed and fit it on the
# training split in one step (fit returns the estimator itself).
classifier = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)

print("\nModel Training Completed.")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the gradient boosting model on the hold-out split.
y_pred = classifier.predict(X_test)

# Print each metric under its own heading.
for heading, metric in (
    ("\nClassification Report:", classification_report),
    ("\nConfusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fit on the training split and
# used to predict the hold-out split.
model1 = LogisticRegression().fit(X_train, y_train)
y_pred1 = model1.predict(X_test)

In [207]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier, fit on the training split and used to
# predict the hold-out split.
model2 = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred2 = model2.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Hold-out metrics for the KNN model (model2).
# The metrics must be computed from y_pred2; scoring `y_pred` here would report
# the gradient boosting model's results under the KNN heading.
y_pred2 = model2.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred2))
print("Classification Report:\n", classification_report(y_test, y_pred2))

In [None]:
# Hold-out metrics for the logistic regression model (model1).
# The metrics must be computed from y_pred1; scoring `y_pred` here would report
# the gradient boosting model's results instead.
y_pred1 = model1.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred1))
print("Classification Report:\n", classification_report(y_test, y_pred1))

In [None]:
# Hold-out metrics for the gradient boosting model (classifier).
y_pred = classifier.predict(X_test)
gbm_accuracy = accuracy_score(y_test, y_pred)
gbm_report = classification_report(y_test, y_pred)
print("Accuracy:", gbm_accuracy)
print("Classification Report:\n", gbm_report)

# Test Data

In [167]:
# Split the test frame by dtype: object (text-like) columns on one side,
# 64-bit integer/float columns on the other.
object_col_test = df_test.select_dtypes(include='object')
intcol_test = df_test.select_dtypes(include=('int64', 'float64'))

In [169]:
# NOTE(review): the fill values below are computed from the test frame itself,
# whereas the training frame was filled with its own mode/mean -- confirm that
# this difference is acceptable.

# Fill gaps in the object columns with each column's most frequent value.
cat_cols_test = object_col_test.columns
cat_fill_test = df_test[cat_cols_test].mode().iloc[0]
df_test[cat_cols_test] = df_test[cat_cols_test].fillna(cat_fill_test)

# Fill gaps in the numeric columns with each column's mean.
num_cols_test = intcol_test.columns
num_fill_test = df_test[num_cols_test].mean()
df_test[num_cols_test] = df_test[num_cols_test].fillna(num_fill_test)

In [189]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Numerical feature columns of the test frame.
numerical_columns_test = df_test.select_dtypes(include=['int64', 'float64']).columns

# Deliberately NOT scaled. The training-side StandardScaler step is commented
# out, so every model in this notebook was fit on raw, unscaled features.
# Fitting a fresh scaler on the test frame would feed the models z-scores they
# never saw during training, and the later integer cast of the ID/count columns
# would then truncate those z-scores to a handful of small integers.
# If scaling is re-enabled for training, apply that SAME fitted scaler here with
# `.transform(...)` -- never `fit_transform` on the test frame.

In [191]:
# Encode the test frame's object columns with the encoders that were fitted on
# the training frame. Fitting new encoders here (and resetting `label_encoders`)
# would assign codes from the test set's own sorted labels, so the same string
# could map to a different integer than the one the models were trained on.
categorical_columns_test = df_test.select_dtypes(include=['object']).columns
for col_test in categorical_columns_test:
    if col_test not in label_encoders:
        # No training-time encoder exists for this column; fall back to fitting
        # one on the test values and make the inconsistency visible.
        print(f"No training encoder for '{col_test}'; fitting on test values.")
        label_encoders[col_test] = LabelEncoder()
        df_test[col_test] = label_encoders[col_test].fit_transform(df_test[col_test])
        continue

    # classes_ lists the training labels in code order: position == integer code.
    train_classes = label_encoders[col_test].classes_
    code_by_label = {label: code for code, label in enumerate(train_classes)}

    # Labels never seen in training have no code; mark them with -1 rather than
    # letting LabelEncoder.transform raise on them.
    df_test[col_test] = df_test[col_test].map(code_by_label).fillna(-1).astype(int)

In [171]:
# Columns to be cast to integer dtype in the test frame.
float_columns = [
    'RealTimeProtectionState',
    'AntivirusConfigID',
    'NumAntivirusProductsInstalled',
    'CityID',
    'GeoRegionID',
    'IsSystemProtected',
    'SMode',
    'IEVersionID',
    'FirewallEnabled',
    'EnableLUA',
    'OEMNameID',
    'OEMModelID',
    'ProcessorCoreCount',
    'ProcessorManufacturerID',
    'ProcessorModelID',
    'PrimaryDiskCapacityMB',
    'SystemVolumeCapacityMB',
    'TotalPhysicalRAMMB',
    'PrimaryDisplayDiagonalInches',
    'PrimaryDisplayResolutionHorizontal',
    'PrimaryDisplayResolutionVertical',
    'OSInstallLanguageID',
    'IsFlightsDisabled',
    'FirmwareManufacturerID',
    'NumAntivirusProductsEnabled',
    'IsAlwaysOnAlwaysConnectedCapable',
    'IsGamer',
    'RegionIdentifier',
    'IsVirtualDevice',
    'FirmwareVersionID',
]

# The cast is all-or-nothing: it runs only when every listed column is present;
# otherwise the absent names are reported and the frame is left untouched.
missing_columns = [name for name in float_columns if name not in df_test.columns]
if not missing_columns:
    # Any remaining NaN becomes -1 before the integer cast.
    df_test[float_columns] = df_test[float_columns].fillna(-1).astype(int)
else:
    print(f"Missing columns: {missing_columns}")

In [None]:
# Predict the test frame with the gradient boosting model.
# Columns are selected in the order of X so the features line up with what the
# model was fitted on (a missing column fails loudly with a KeyError).
y_pred_test = classifier.predict(df_test[X.columns])

# No accuracy/report here: y_test holds the labels of the hold-out split carved
# out of train.csv, not labels for df_test, so scoring y_pred_test against it
# would compare unrelated rows (and raises when the row counts differ).
# Show the distribution of predicted classes instead.
print("Predicted class counts:\n", pd.Series(y_pred_test).value_counts())


In [None]:
# Predict the test frame with the logistic regression model.
# Columns are selected in the order of X so the features line up with what the
# model was fitted on (a missing column fails loudly with a KeyError).
y_pred1_test = model1.predict(df_test[X.columns])

# No accuracy/report here: y_test holds the labels of the hold-out split carved
# out of train.csv, not labels for df_test, so scoring y_pred1_test against it
# would compare unrelated rows (and raises when the row counts differ).
# Show the distribution of predicted classes instead.
print("Predicted class counts:\n", pd.Series(y_pred1_test).value_counts())

In [None]:
# Predict the test frame with the KNN model. Predicting X_test here would only
# repeat the hold-out evaluation under a "*_test" name, while the sibling cells
# for the other two models predict df_test.
# Columns are selected in the order of X so the features line up with what the
# model was fitted on (a missing column fails loudly with a KeyError).
y_pred2_test = model2.predict(df_test[X.columns])

# No accuracy/report here: y_test holds the labels of the hold-out split carved
# out of train.csv, not labels for df_test. Show the distribution of predicted
# classes instead.
print("Predicted class counts:\n", pd.Series(y_pred2_test).value_counts())