<a href="https://colab.research.google.com/github/agamduggal/Predictive-Analysis/blob/main/Sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Load the data
data = pd.read_csv('/content/Creditcard_data.csv')

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772 entries, 0 to 771
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    772 non-null    int64  
 1   V1      772 non-null    float64
 2   V2      772 non-null    float64
 3   V3      772 non-null    float64
 4   V4      772 non-null    float64
 5   V5      772 non-null    float64
 6   V6      772 non-null    float64
 7   V7      772 non-null    float64
 8   V8      772 non-null    float64
 9   V9      772 non-null    float64
 10  V10     772 non-null    float64
 11  V11     772 non-null    float64
 12  V12     772 non-null    float64
 13  V13     772 non-null    float64
 14  V14     772 non-null    float64
 15  V15     772 non-null    float64
 16  V16     772 non-null    float64
 17  V17     772 non-null    float64
 18  V18     772 non-null    float64
 19  V19     772 non-null    float64
 20  V20     772 non-null    float64
 21  V21     772 non-null    float64
 22  V2

In [17]:
# Prepare features and target
X = data.drop("Class", axis=1)
y = data["Class"]

# Balance the dataset using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)



In [22]:
print(y.value_counts())


Class
0    763
1      9
Name: count, dtype: int64


In [23]:
print(y_resampled.value_counts())

Class
0    763
1    763
Name: count, dtype: int64


In [18]:
# Calculate sample size using confidence interval formula
# Using 95% confidence level (Z=1.96), margin of error=0.05
Z = 1.96
p = 0.5
E = 0.05
N = len(X_resampled)
sample_size = int((Z**2 * p * (1-p) * N) / (E**2 * (N-1) + Z**2 * p * (1-p)))

# Define sampling techniques
samples = {
    "Sampling1": X_resampled.sample(n=sample_size, random_state=42),
    "Sampling2": X_resampled.sample(n=sample_size, random_state=21),
    "Sampling3": X_resampled.iloc[::len(X_resampled)//sample_size],  # Systematic sampling
    "Sampling4": X_resampled.sample(n=sample_size, random_state=56),
    "Sampling5": X_resampled.sample(n=sample_size, random_state=99)
}



In [20]:
# Create sample datasets with corresponding target values
sample_datasets = {
    name: (sample, y_resampled.loc[sample.index])
    for name, sample in samples.items()
}

# Define models
models = {
    "M1": LogisticRegression(max_iter=1000, random_state=42),
    "M2": DecisionTreeClassifier(random_state=42),
    "M3": RandomForestClassifier(random_state=42),
    "M4": SVC(random_state=42),
    "M5": KNeighborsClassifier()
}



In [21]:
# Initialize results dictionary
results = {}

# Train and evaluate models
for sample_name, (X_sample, y_sample) in sample_datasets.items():
    print(f"Evaluating {sample_name}...")
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample, test_size=0.3, random_state=42
    )

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        if sample_name not in results:
            results[sample_name] = {}
        results[sample_name][model_name] = accuracy * 100  # Convert to percentage



Evaluating Sampling1...
Evaluating Sampling2...
Evaluating Sampling3...
Evaluating Sampling4...
Evaluating Sampling5...


In [24]:
# Create results matrix
results_matrix = pd.DataFrame(
    [[results[sample][model] for model in models.keys()]
     for sample in samples.keys()],
    index=samples.keys(),
    columns=models.keys()
)



In [25]:
# Print results
print("\nAccuracy Matrix (%):")
print(results_matrix)

# Find best sampling technique for each model
best_combinations = results_matrix.idxmax()
print("\nBest Sampling Technique for Each Model:")
print(best_combinations)

# Save results
results_matrix.to_csv("sampling_results.csv")


Accuracy Matrix (%):
                  M1         M2          M3         M4         M5
Sampling1  86.021505  87.096774   93.548387  65.591398  66.666667
Sampling2  92.473118  89.247312   96.774194  74.193548  80.645161
Sampling3  85.217391  91.304348   98.260870  64.347826  79.130435
Sampling4  89.247312  91.397849   92.473118  72.043011  77.419355
Sampling5  93.548387  95.698925  100.000000  73.118280  77.419355

Best Sampling Technique for Each Model:
M1    Sampling5
M2    Sampling5
M3    Sampling5
M4    Sampling2
M5    Sampling2
dtype: object
