<a href="https://colab.research.google.com/github/abhinav12345678901121221212/Sampling-Assignment/blob/main/sampling_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import accuracy_score

# Table of Contents
1. <a href="#1">Reading the data</a>
2. <a href="#2">Checking the class distribution</a>
3. <a href="#3">Balancing the dataset</a>
4. <a href="#4">Applying different sampling methods</a>
    + <a href="#5">Simple random sampling</a>
    + <a href="#6">Systematic sampling</a>
    + <a href="#7">Stratified sampling</a>
    + <a href="#8">Cluster sampling</a>
    + <a href="#9">Bootstrap sampling</a>
5. <a href="#10">Training on different models</a>    

<a id="1"><h1>Reading the data</h1></a>

In [None]:
# Load the credit-card dataset into a DataFrame.
# NOTE(review): the path is absolute (filesystem root) — confirm the CSV is placed there.
df = pd.read_csv(filepath_or_buffer='/Creditcard_data.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

<a id="2"><h1>Checking the Class Distribution</h1></a>

In [None]:
# Bar chart of how many rows fall into each value of the 'Class' column.
sns.countplot(x='Class', data=df)

**It can be clearly observed that the dataset is unbalanced.**

In [None]:
# Exact row count per class in the raw data.
df['Class'].value_counts()

<a id="3"><h1>Balancing the Dataset</h1></a>

In [None]:
# Separate the target column from the feature columns.
y = df['Class']
X = df.drop(columns='Class')

# Balance the dataset through random oversampling (seeded for reproducibility).
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Reassemble features and target into one frame used by all sampling steps.
features_balanced = pd.DataFrame(X_resampled, columns=X.columns)
target_balanced = pd.Series(y_resampled, name='Class')
resampled_df = pd.concat([features_balanced, target_balanced], axis=1)

# Class distribution after oversampling.
sns.countplot(x='Class', data=resampled_df)

In [None]:
# Exact row count per class after oversampling.
resampled_df['Class'].value_counts()

<a id="4"><h1>Applying Different Sampling Methods</h1></a>

<a id="5"><h3>Simple Random Sampling</h3></a>

In [None]:
# Inputs for the simple-random-sampling sample-size calculation:
#   z - z-score, assuming a confidence level of 0.95
#   p - estimated proportion of the population with the characteristic of
#       interest; proportion of the smaller class, i.e. 763/(763+763)
#   E - margin of error (tolerated half-width of the estimate)
z, p, E = 1.96, 0.5, 0.05

In [None]:
# Sample size for estimating a proportion: z^2 * p * (1 - p) / E^2,
# rounded to the nearest integer.
numerator = z**2 * p * (1 - p)
sample_size = int(np.round(numerator / (E**2)))
sample_size

In [None]:
# Simple random sample: draw `sample_size` distinct rows from the balanced frame.
# The seed is fixed (same value as the oversampler) so that a fresh
# Restart & Run All reproduces the same sample.
random_sample = (
    resampled_df
    .sample(n=sample_size, replace=False, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the simple random sample.
random_sample.head()

<a id="6"><h3>Systematic Sampling</h3></a>

In [None]:
# Sampling interval: one row is selected out of every k rows.
k= 5
# Alternative considered: derive the interval from the size of the data.
#k= int(np.sqrt(len(resampled_df)))

In [None]:
# Number of rows obtained when one row is taken out of every k (integer division).
sample_size= len(resampled_df)//k
sample_size

In [None]:
# Systematic sample: take every k-th row starting at row 0 and keep exactly
# len(resampled_df) // k rows.
# Positional slicing (.iloc) is used instead of feeding positions to the
# label-based .loc, so the result does not depend on the frame's index labels.
# The row count is computed here rather than read from the global
# `sample_size`, which other cells reassign.
n_systematic = len(resampled_df) // k
systematic_sample = resampled_df.iloc[::k].head(n_systematic).reset_index(drop=True)

In [None]:
# Preview the systematic sample.
systematic_sample.head()

<a id="7"><h3>Stratified Sampling</h3></a>

In [None]:
# Inputs for the stratified-sampling sample-size calculation:
#   z - z-score, assuming a confidence level of 0.95
#   p - estimated proportion of the population with the characteristic of
#       interest; proportion of the smaller class, i.e. 763/(763+763)
#   E - margin of error (tolerated half-width of the estimate)
#   S - number of strata, i.e. the number of classes
z, p, E, S = 1.96, 0.5, 0.05, 2

In [None]:
# Total stratified sample size: z^2 * p * (1 - p) / (E / S)^2,
# i.e. the margin of error is divided across the S strata.
per_stratum_error = E / S
sample_size = int(np.round((z**2 * p * (1 - p)) / per_stratum_error**2))
sample_size

In [None]:
# Size of the balanced frame, for comparison with the computed sample size.
len(resampled_df)

In [None]:
#sample_size*100/len(resampled_df)

In [None]:
# Stratified sample: draw the same number of rows from every class.
# - GroupBy.sample draws within each group directly, replacing the
#   groupby(...).apply(lambda ...) detour.
# - The per-class size uses the number of strata S instead of a literal 2.
# - Sampling is with replacement, as in the original cell.
# - The seed is fixed so the sample is reproducible.
stratified_sample = (
    resampled_df
    .groupby('Class')
    .sample(n=sample_size // S, replace=True, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the stratified sample.
stratified_sample.head()

**Stratified sampling can also be done more simply with `train_test_split(..., stratify=...)`, which additionally gives us a held-out test set on which the accuracy score of each model is computed.**

In [None]:
# Stratified 70/30 split of the balanced frame: both parts keep the class
# proportions of resampled_df. The seed is fixed so that the held-out test set
# (and therefore every accuracy reported later) is the same on each run.
train_stratified_sample, test_stratified_sample = train_test_split(
    resampled_df,
    test_size=0.3,
    stratify=resampled_df['Class'],
    random_state=42,
)

In [None]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index
# on both parts.
train_stratified_sample, test_stratified_sample = (
    part.reset_index(drop=True)
    for part in (train_stratified_sample, test_stratified_sample)
)

In [None]:
# Display the training part of the stratified split.
train_stratified_sample

In [None]:
# Display the held-out test part of the stratified split.
test_stratified_sample

In [None]:
# Held-out evaluation data: target column and feature columns of the test split.
y_test = test_stratified_sample['Class']
X_test = test_stratified_sample.drop(columns='Class')

<a id="8"><h3>Cluster Sampling</h3></a>

In [None]:
# # z score
# z= 1.96                 # assuming confidence value of 0.95

# # estimated proportion of the population with the characteristic of interest
# p= 0.5                  # using proportion of smaller class as estimate, i.e. 763/(763+763)

# # margin of error
# E= 0.05                 # 1 - (confidence value)

# # average size of clusters
# C= 50

In [None]:
# sample_size= int(np.round((z**2 * p * (1-p)) / (E/C)**2))
# sample_size

In [None]:
# Each class value is treated as one "cluster"; `sample_size` rows are drawn
# (without replacement) from each of them.
clusters = [0, 1]
sample_size = 500

# Build all per-cluster draws first and concatenate once, instead of growing a
# DataFrame inside a loop starting from an empty frame. The seed is fixed so
# the sample is reproducible.
cluster_sample = pd.concat(
    [
        resampled_df[resampled_df['Class'] == cluster].sample(n=sample_size, random_state=42)
        for cluster in clusters
    ]
)

In [None]:
# Replace the original row labels with a fresh 0..n-1 index.
cluster_sample= cluster_sample.reset_index(drop= True)

In [None]:
# Display the cluster sample.
cluster_sample

<a id="9"><h3>Bootstrap Sampling</h3></a>

In [None]:
num_bootstrap_samples = 1

# Each bootstrap sample has as many rows as the balanced frame and is drawn
# with replacement. Every draw gets its own fixed seed so results are
# reproducible and, if more than one draw is requested, the draws differ.
bootstrap_samples = [
    resampled_df.sample(n=len(resampled_df), replace=True, random_state=42 + draw)
    for draw in range(num_bootstrap_samples)
]

# As before, the last generated draw is the one used downstream.
bootstrap_sample = bootstrap_samples[-1]

In [None]:
# Replace the (repeated) original row labels with a fresh 0..n-1 index.
bootstrap_sample= bootstrap_sample.reset_index(drop= True)

In [None]:
# Display the bootstrap sample.
bootstrap_sample

<a id="10"><h1>Training on Different Models</h1></a>

In [None]:
# Classifiers to compare. The estimators with stochastic training (SAG-solved
# logistic regression, random forest, decision tree) get a fixed seed so that
# re-running the notebook reproduces the same accuracy table.
lr_model = LogisticRegression(max_iter=10000, solver='sag', random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=3)
rf_model = RandomForestClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
nb_model = GaussianNB()

In [None]:
# Models and their display names, in matching order; the names become the row
# labels of the accuracy table. lr_model is a LogisticRegression, so its label
# is 'Logistic Regression' (it was previously mislabelled 'Linear Regression').
models = [lr_model, knn_model, rf_model, dt_model, nb_model]
model_names = ['Logistic Regression', 'KNN', 'Random Forest', 'Decision Tree', 'Naive Bayes']

In [None]:
# Training sets produced by each sampling method, keyed by the label that
# becomes the matching column of the accuracy table.
named_samples = {
    'Simple Random Sample': random_sample,
    'Systematic Sample': systematic_sample,
    'Stratified Sample': train_stratified_sample,
    'Cluster Sample': cluster_sample,
    'Bootstrap Sample': bootstrap_sample,
}
samples = list(named_samples.values())
sample_names = list(named_samples.keys())

In [None]:
test_samples= []

In [None]:
for sample in samples:
    scaler= StandardScaler()
    X_train= scaler.fit_transform(sample.drop('Class',axis=1))
    X_train= pd.DataFrame(X_train, columns= df.columns[:-1])
    sample= pd.concat([X_train,sample['Class']],axis=1)
    X_test= scaler.fit_transform(X_test)
    X_test= pd.DataFrame(X_test, columns= df.columns[:-1])
    test_samples.append(X_test)

In [None]:
accuracy_table= pd.DataFrame(index= model_names, columns= sample_names)

for i,model in enumerate(models):
    for j,sample in enumerate(samples):
        model.fit(sample.drop('Class',axis=1), sample['Class'])
        pred= model.predict(test_samples[j])
        accuracy_table.loc[model_names[i], sample_names[j]]= accuracy_score(y_test, pred)

In [None]:
# Display the model-by-sampling-method accuracy table.
accuracy_table

<h1>Abhinav Maheshwai (102103284)</h1>