In [30]:
import pandas as pd
import numpy as np
import math
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, mean_squared_error

data=pd.read_csv("/content/sample_data/Creditcard_data.csv")
data.head()
data.info()
len(data)

def calculate_sample_size(population_size, z_score, margin_of_error, p=0.5):
    n = (z_score**2 * p * (1 - p)) / (margin_of_error**2)
    return int(n)

population_size = len(data)
z_score = 1.96  #95% confidence level
margin_of_error = 0.05  #5% margin of error
p = 0.5  #Proportion of population

sample_size=calculate_sample_size(population_size, z_score, margin_of_error, p)
print(sample_size)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772 entries, 0 to 771
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    772 non-null    int64  
 1   V1      772 non-null    float64
 2   V2      772 non-null    float64
 3   V3      772 non-null    float64
 4   V4      772 non-null    float64
 5   V5      772 non-null    float64
 6   V6      772 non-null    float64
 7   V7      772 non-null    float64
 8   V8      772 non-null    float64
 9   V9      772 non-null    float64
 10  V10     772 non-null    float64
 11  V11     772 non-null    float64
 12  V12     772 non-null    float64
 13  V13     772 non-null    float64
 14  V14     772 non-null    float64
 15  V15     772 non-null    float64
 16  V16     772 non-null    float64
 17  V17     772 non-null    float64
 18  V18     772 non-null    float64
 19  V19     772 non-null    float64
 20  V20     772 non-null    float64
 21  V21     772 non-null    float64
 22  V2

In [31]:
#Simple Random Sampling
from sklearn.model_selection import train_test_split

simple_random_samples=data.sample(n=int(sample_size),random_state=42)
print(simple_random_samples)

     Time        V1        V2        V3        V4        V5        V6  \
538   404  0.638806  1.772451 -1.748258  1.297700  1.785872 -1.050197   
213   140  1.007947 -1.289492  0.666741 -0.741321 -1.290523  0.302172   
361   265  0.073631  1.051207 -0.281223  0.853749  1.065966  1.219197   
417   302 -0.986171  1.732934  0.857587  0.178950 -0.794223 -0.088469   
582   434 -0.679293  1.120837  1.319394  1.249827  1.147786 -0.086534   
..    ...       ...       ...       ...       ...       ...       ...   
771   581  1.153758  0.132273  0.382969  1.405063 -0.224287 -0.197295   
649   491  1.196288  0.217099  0.402596  0.921664 -0.252045 -0.392240   
107    71  1.331897 -0.579962  0.422606 -0.897752 -0.746254 -0.056273   
59     41  1.154312  0.265462  0.384871  0.575007 -0.217475 -0.391520   
581   433 -0.599759  1.383107  2.381261  2.686183  0.317047  0.801453   

           V7        V8        V9  ...       V21       V22       V23  \
538  1.485730 -0.780631  0.416991  ... -0.359613 -0

In [32]:
#Systematic Sampling

size=len(data)
k=int(math.sqrt(size))

#Selecting every "k"-row starting from a random index in the dataset
systematic_samples=data.iloc[::k]
print(systematic_samples)

     Time        V1        V2        V3        V4        V5        V6  \
0       0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388   
27     23  1.322707 -0.174041  0.434555  0.576038 -0.836758 -0.831083   
54     37  1.295668  0.341483  0.081505  0.566746 -0.110459 -0.766325   
81     52  1.147369  0.059035  0.263632  1.211023 -0.044096  0.301067   
108    73  1.162281  1.248178 -1.581317  1.475024  1.138357 -1.020373   
135    84  1.119272 -0.669639  0.803807 -0.651693 -1.395666 -0.800698   
162   103 -0.940893  1.074155  1.759398 -0.601446  0.101693 -0.188520   
189   124 -1.710935 -1.366799  2.217311  0.404714 -0.114375 -0.075942   
216   142  1.288256  0.085828 -1.179482  0.064357  2.195225  3.383363   
243   164 -0.433211  1.020835  2.019730  3.003261  0.031308  0.187063   
270   190 -0.549414  0.676861  2.151950  1.014523 -0.620012  0.076154   
297   211 -0.247827 -0.282682  1.653354 -1.014865 -0.680433  0.886364   
324   237  1.260248 -0.020172 -1.164387  0.266251  

In [33]:
#Stratified Sampling

z=1.96 #for a 95% confidence interval
e=0.05 #5% error
st_sample_size=pow(z,2)*p*(1-p)/(pow(e,2)*size)
print(st_sample_size)

stratified_sample = pd.DataFrame()
strata = data['Class'].value_counts(normalize=True)

for stratum, proportion in strata.items():
  stratum_data = data[data['Class'] == stratum]
  stratum_sample_size = int(np.round(proportion * sample_size))
  stratum_sample = stratum_data.sample(stratum_sample_size, random_state=42)
  stratified_sample = pd.concat([stratified_sample, stratum_sample])
print(stratified_sample)

0.4976165803108807
     Time        V1        V2        V3        V4        V5        V6  \
360   265 -0.491003  0.906953  1.645423 -0.083531 -0.195560 -0.710165   
262   184 -0.143256  0.743649  1.534072  1.062170  0.208187 -0.838623   
757   566  1.166360  0.005061  0.497768  0.798920 -0.365524 -0.233421   
195   128  1.239495 -0.182609  0.155058 -0.928892 -0.746227 -1.235608   
336   246 -1.069200  1.239963  0.545157  1.005354 -0.025696 -0.910673   
..    ...       ...       ...       ...       ...       ...       ...   
562   421 -1.397776  0.004573  2.528964  0.648279 -0.196625  0.163190   
717   539 -1.738582  0.052740  1.187057 -0.656652  0.920623 -0.291788   
182   118  1.254914  0.350287  0.302488  0.693114 -0.371470 -1.070256   
639   484 -0.928088  0.398194  1.741131  0.182673  0.966387 -0.901004   
1       0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361   

           V7        V8        V9  ...       V21       V22       V23  \
360  0.559119  0.116340 -0.53819

In [34]:
#Cluster Sampling

clusters = data['Class'].unique()
sampled_clusters = np.random.choice(clusters, 1, replace=False)
cluster_sample = data[data['Class'].isin(sampled_clusters)]

print(cluster_sample)

     Time        V1        V2        V3        V4        V5        V6  \
0       0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388   
2       1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499   
3       1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203   
4       2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921   
5       2 -0.425966  0.960523  1.141109 -0.168252  0.420987 -0.029728   
..    ...       ...       ...       ...       ...       ...       ...   
767   575 -0.572263  0.731748  1.541254  0.150506  1.108974  0.372152   
768   579 -1.296845 -0.511605  2.404726 -0.310762 -0.319551 -0.542842   
769   579  1.214170  0.210481  0.484651  0.479768 -0.261955 -0.527039   
770   580  1.267030 -0.071114  0.037680  0.512683  0.242392  0.705212   
771   581  1.153758  0.132273  0.382969  1.405063 -0.224287 -0.197295   

           V7        V8        V9  ...       V21       V22       V23  \
0    0.239599  0.098698  0.363787  ... -0.018307  0

In [36]:

models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM Classifier": SVC(probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

# Sampling
sample_methods = {
    "Simple Random": simple_random_samples,
    "Stratified": stratified_sample,
    "Systematic": systematic_samples,
    "Cluster": cluster_sample
}

X=np.array(data.drop('Class',axis=1))
y=np.array(data['Class'])

# Evaluate models on different samples
results = []
sample_size = 200  # Adjust sample size as needed
for method_name, sampling_func in sample_methods.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results.append({
            "Sampling Method": method_name,
            "Model": model_name,
            "Accuracy": accuracy
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print(results_df)

print(max(results_df))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

   Sampling Method                Model  Accuracy
0    Simple Random  Logistic Regression  0.993548
1    Simple Random        Random Forest  0.993548
2    Simple Random       SVM Classifier  0.993548
3    Simple Random        Decision Tree  0.974194
4    Simple Random                  KNN  0.993548
5       Stratified  Logistic Regression  0.993548
6       Stratified        Random Forest  0.993548
7       Stratified       SVM Classifier  0.993548
8       Stratified        Decision Tree  0.974194
9       Stratified                  KNN  0.993548
10      Systematic  Logistic Regression  0.993548
11      Systematic        Random Forest  0.993548
12      Systematic       SVM Classifier  0.993548
13      Systematic        Decision Tree  0.980645
14      Systematic                  KNN  0.993548
15         Cluster  Logistic Regression  0.993548
16         Cluster        Random Forest  0.993548
17         Cluster       SVM Classifier  0.993548
18         Cluster        Decision Tree  0.980645
