In [0]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from scipy.sparse import coo_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Mount Google Drive in the Colab runtime; the dataset paths used below live under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

In [0]:
# Normalize the feature columns of the initial dataset, shuffle the rows, and save the result to a CSV file

def normalize(X, Y, output_path="/content/drive/My Drive/PA_3/Data/normalizedData.csv"):
  """Normalize the features, attach the labels, shuffle the rows and save to CSV.

  Args:
    X: feature table; passed to ``sklearn.preprocessing.normalize`` with its
      default settings.
    Y: label column(s) with one row per row of ``X``; appended after the
      normalized feature columns.
    output_path: where the CSV is written. Defaults to the location that
      ``create_10_folds`` is pointed at in this notebook.

  Returns:
    None. The result is only written to ``output_path``.

  The file is written WITH a header row. ``create_10_folds`` reads it back with
  ``pd.read_csv`` defaults, which always consume the first line as column
  names; a header-less file would silently lose its first sample.
  """
  normalized_data = pd.DataFrame(preprocessing.normalize(X))

  # normalized_data carries a fresh 0..n-1 index. Drop Y's own index so the
  # labels are attached by row position; concat(axis=1) aligns on index labels
  # and would otherwise produce NaN-padded rows for a non-default index.
  labels = pd.DataFrame(Y).reset_index(drop=True)
  merged_df = pd.concat([normalized_data, labels], axis=1)

  # One seeded permutation of whole rows keeps features and labels together.
  final_df = shuffle(merged_df, random_state=0)

  # index=False: keep the shuffled index out of the file; the header is kept.
  final_df.to_csv(output_path, index=False)

In [0]:
# Apply 10-fold cross-validation splitting to create the training and test datasets for each fold

def create_10_folds(source_path, destination_path):
  """Split a CSV dataset into 10 folds and write a train/test CSV pair per fold.

  The file at ``source_path`` is read with pandas' default header handling, so
  its first line is consumed as column names. The last column is treated as
  ``y`` and every column before it as ``x``.

  For each fold k = 1..10 two files are written:
  ``destination_path + 'train<k>.csv'`` and ``destination_path + 'test<k>.csv'``.
  ``destination_path`` is used as a plain string prefix, so it has to end with
  a path separator when it names a directory.

  The output frames are rebuilt from raw arrays, so the written header holds
  positional column numbers rather than the source column names.
  """
  dataset = pd.read_csv(source_path)

  x = dataset.iloc[:, 0:-1]
  y = dataset.iloc[:, -1]
  features = np.array(x.values)
  targets = np.array(y.values)

  def write_split(row_idx, kind, fold):
    # Re-join the selected feature rows with their targets and dump them to CSV.
    split = pd.concat(
      [pd.DataFrame(features[row_idx]), pd.DataFrame(targets[row_idx])],
      axis=1,
    )
    split.to_csv(destination_path + kind + str(fold) + '.csv', index=False)

  splitter = KFold(n_splits=10)
  # Folds are numbered from 1 in the output file names.
  for fold, (train_idx, test_idx) in enumerate(splitter.split(x, y), start=1):
    write_split(train_idx, 'train', fold)
    write_split(test_idx, 'test', fold)

In [0]:
# Load the original dataset: x is every column except the last, y is the last column
dataframe = pd.read_csv('/content/drive/My Drive/PA_3/Data/dataset.csv')
x = dataframe.iloc[:, :-1]
y = dataframe.iloc[:, [-1]]  # list indexer keeps y as a one-column DataFrame

In [0]:
# Generate the normalized dataset first, then split that file into the 10 cross-validation folds
normalize(x, y)
create_10_folds(
  source_path='/content/drive/My Drive/PA_3/Data/normalizedData.csv',
  destination_path='/content/drive/My Drive/PA_3/FCV/',
)