In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


In [12]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier

# 1. Select K-Best Features using Chi-Square
def selectkbest(indep_x, dep_y, k):
    # MinMaxScaler is used because Chi-Square requires non-negative data
    scaler = MinMaxScaler()
    indep_x_scaled = scaler.fit_transform(indep_x)
    test = SelectKBest(score_func=chi2, k=k)
    fit1 = test.fit(indep_x_scaled, dep_y)
    selected_features = fit1.transform(indep_x_scaled)
    return selected_features

# 2. Split and Scale the Data
def split_scalar(indep_x, dep_y):
    x_train, x_test, y_train, y_test = train_test_split(indep_x, dep_y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    return x_train, x_test, y_train, y_test

# 3. Prediction, Confusion Matrix, Accuracy, and Report
def cm_prediction(classifier, x_test, y_test):
    y_pred = classifier.predict(x_test)
    cm = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return accuracy, report, cm

# 4. Define Classification Algorithms
def logistic(x_train, y_train, x_test, y_test):
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(random_state=0)
    classifier.fit(x_train, y_train)
    return cm_prediction(classifier, x_test, y_test)

def svm_linear(x_train, y_train, x_test, y_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel='linear', random_state=0)
    classifier.fit(x_train, y_train)
    return cm_prediction(classifier, x_test, y_test)

def svm_nl(x_train, y_train, x_test, y_test):
    from sklearn.svm import SVC
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(x_train, y_train)
    return cm_prediction(classifier, x_test, y_test)

def knn(x_train, y_train, x_test, y_test):
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(x_train, y_train)
    return cm_prediction(classifier, x_test, y_test)

def naive(x_train, y_train, x_test, y_test):
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    return cm_prediction(classifier, x_test, y_test)

def decision_tree(x_train, y_train, x_test, y_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(x_train, y_train)
    return cm_prediction(classifier, x_test, y_test)

def random_forest(x_train, y_train, x_test, y_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    classifier.fit(x_train, y_train)
    return cm_prediction(classifier, x_test, y_test)

# 5. Create Results DataFrame
def selectk_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    dataframe = pd.DataFrame(index=["ChiSquare"], columns=['Logistic', 'SVMl', 'SVMnl', 'KNN', 'Navie', 'Decision', 'Random'])
    dataframe['Logistic']['ChiSquare'] = acclog
    dataframe['SVMl']['ChiSquare'] = accsvml
    dataframe['SVMnl']['ChiSquare'] = accsvmnl
    dataframe['KNN']['ChiSquare'] = accknn
    dataframe['Navie']['ChiSquare'] = accnav
    dataframe['Decision']['ChiSquare'] = accdes
    dataframe['Random']['ChiSquare'] = accrf
    return dataframe

# 6. Load Dataset
df = pd.read_csv("creditcard.csv")
df = df.drop(['Unnamed: 0'], axis=1, errors='ignore')  # Drop irrelevant columns
df = pd.get_dummies(df, drop_first=True)  # One-hot encoding if needed

indep_x = df.drop('Class', axis=1)
dep_y = df['Class']

# 7. Select K-Best Features
kbest = selectkbest(indep_x, dep_y, 6)

# 8. Initialize Accuracy Lists
acclog = []
accsvml = []
accsvmnl = []
accknn = []
accnav = []
accdes = []
accrf = []

# 9. Split Data
x_train, x_test, y_train, y_test = split_scalar(kbest, dep_y)

# 10. Call Classifiers and Append Results
accuracy, _, _ = logistic(x_train, y_train, x_test, y_test)
acclog.append(accuracy)

accuracy, _, _ = svm_linear(x_train, y_train, x_test, y_test)
accsvml.append(accuracy)

accuracy, _, _ = svm_nl(x_train, y_train, x_test, y_test)
accsvmnl.append(accuracy)

accuracy, _, _ = knn(x_train, y_train, x_test, y_test)
accknn.append(accuracy)

accuracy, _, _ = naive(x_train, y_train, x_test, y_test)
accnav.append(accuracy)

accuracy, _, _ = decision_tree(x_train, y_train, x_test, y_test)
accdes.append(accuracy)

accuracy, _, _ = random_forest(x_train, y_train, x_test, y_test)
accrf.append(accuracy)

# 11. Create Results DataFrame
result = selectk_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)

# 12. Display Results
print(result)


                       Logistic                  SVMl                SVMnl  \
ChiSquare  [0.9991854161399961]  [0.9992556388865481]  [0.999522485323446]   

                            KNN                 Navie              Decision  \
ChiSquare  [0.9995786635206876]  [0.9948175613044578]  [0.9993399061824106]   

                         Random  
ChiSquare  [0.9995084407741356]  


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dataframe['Logistic']['ChiSquare'] = acclog
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFram

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming 'df' is already loaded and preprocessed
indep_x = df.drop('Class', axis=1)
dep_y = df['Class']

# Splitting and scaling the dataset
x_train, x_test, y_train, y_test = train_test_split(indep_x, dep_y, test_size=0.25, random_state=0)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# KNN Classifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(x_train, y_train)

# Now the classifier is defined and can be saved


In [15]:
import pickle

# Save the KNN model
with open("knn_model.pkl", 'wb') as file:
    pickle.dump(classifier, file)

print("KNN model saved as knn_model.pkl")


KNN model saved as knn_model.pkl


In [20]:
# Load the KNN model
with open("knn_model.pkl", 'rb') as file:
    loaded_model = pickle.load(file)

print("KNN model loaded successfully!")

# You can now use knn_model to make predictions


KNN model loaded successfully!


In [21]:
import numpy as np

# Assuming your original dataset had N features
num_features = 6

# Generate random values for new data
new_data = np.random.rand(1, num_features)  # Generating random data for one sample

# You can also manually specify values for new data
# new_data = [[value_1, value_2, ..., value_N]]

print("New data:")
print(new_data)

New data:
[[0.84047934 0.62520247 0.83227621 0.52469339 0.29734975 0.55030241]]


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Example dataset
import numpy as np
X = np.random.rand(100, 6)  # Replace this with your actual feature data
y = np.random.randint(0, 2, 100)  # Replace this with your actual target data

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and transform the scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Scale the training data
X_test_scaled = scaler.transform(X_test)  # Scale the test data


In [31]:
import pickle

# Save the scaler
with open("scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)

print("Scaler saved as scaler.pkl")


Scaler saved as scaler.pkl


In [33]:
from sklearn.feature_selection import SelectKBest, chi2

# Example training data (30 features)
X = np.random.rand(100, 30)
y = np.random.randint(0, 2, 100)

# Feature selection
selector = SelectKBest(score_func=chi2, k=6)  # Select top 6 features
X_new = selector.fit_transform(X, y)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

# Train the model
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)

# Save the model and selector
import pickle
with open("knn_model.pkl", "wb") as file:
    pickle.dump(model, file)
with open("feature_selector.pkl", "wb") as file:
    pickle.dump(selector, file)


In [34]:
# Load the model and feature selector
with open("knn_model.pkl", "rb") as file:
    loaded_model = pickle.load(file)
with open("feature_selector.pkl", "rb") as file:
    loaded_selector = pickle.load(file)

# New data with 30 features
new_data = np.random.rand(1, 30)

# Apply feature selection
new_data_selected = loaded_selector.transform(new_data)

# Predict
predictions = loaded_model.predict(new_data_selected)
print("Predictions:", predictions)


Predictions: [1]
