In [0]:
import os
import math
import numpy as np
import pandas as pd
from sklearn import preprocessing
from scipy.sparse import coo_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from google.colab import drive
# Mount Google Drive at /content/drive/; every path used later in this notebook
# lives under this mount point, so this must run before any file is read or written.
drive.mount('/content/drive/')

In [0]:
# Load the raw data set from Drive. Every column except the last is treated as
# a feature; the last column is the target and is kept as a one-column DataFrame.
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/dataset.csv')
features = df.iloc[:, :-1]
y = df.iloc[:, [-1]]

In [0]:
def normalizeDataset(features,y,model):
  """Normalise, shuffle and save one model's data set as a CSV file.

  The feature table is passed through ``preprocessing.normalize`` (default
  settings), the target is appended as the last column(s), the rows are
  shuffled with a fixed seed, and the result is written to
  ``/content/drive/My Drive/Colab Notebooks/Processed Datasets/<model>.csv``
  with a header row and without the index column.

  Args:
    features: 2-D feature table, one row per sample.
    y: pandas Series/DataFrame holding the target; it is joined to the
      normalised features on the index, so it is expected to carry the same
      0..n-1 index as the rows of ``features``.
    model: file name stem of the output CSV (without the ``.csv`` suffix).

  Returns:
    None. The only effect is the file written to disk.
  """
  # NOTE(review): preprocessing.normalize defaults to axis=1, i.e. each sample
  # (row) is rescaled to unit L2 norm rather than each feature column. If
  # per-feature scaling was intended this needs axis=0 or a scaler -- confirm.
  normalized_features = pd.DataFrame(preprocessing.normalize(features))
  FCV_data = pd.concat([normalized_features, y], axis = 1)

  # Shuffle the rows with a fixed seed so the ordering is random but
  # reproducible. Features and target are shuffled together as one frame; the
  # permutation depends only on the number of rows and the seed, so this gives
  # the same ordering as shuffling the pieces side by side.
  dataframe = shuffle(FCV_data, random_state=0)

  output_path = "/content/drive/My Drive/Colab Notebooks/Processed Datasets/" + model + ".csv"

  # Write a header row. The file is later loaded with pd.read_csv using its
  # default header handling (see create_10Folds), which treats the first line
  # as column names; without a header the first sample was silently dropped.
  dataframe.to_csv(output_path, index=False)

In [0]:
def create_10Folds(source_path, destination_path):
  """Split a CSV data set into 10 train/test folds and write them to disk.

  The file at ``source_path`` is loaded with ``pd.read_csv`` defaults, so its
  first line is taken as the header row. The last column is treated as the
  target and all preceding columns as features. Rows are split with
  ``KFold(n_splits=10)`` in its default configuration, and for fold numbers
  1 through 10 the files ``train<k>.csv`` and ``test<k>.csv`` are written.

  Args:
    source_path: path of the CSV file to split.
    destination_path: prefix the output file names are appended to verbatim;
      include the trailing separator when it is a directory.

  Returns:
    None. The only effect is the fold files written to disk.
  """
  data = pd.read_csv(source_path)

  feature_frame = data.iloc[:, :-1]
  target_series = data.iloc[:, -1]

  # Plain arrays so the integer index arrays produced by KFold select rows by
  # position.
  feature_values = np.array(feature_frame.values)
  target_values = np.array(target_series.values)

  splitter = KFold(n_splits=10)
  fold_splits = splitter.split(feature_frame, target_series)

  for fold_number, (train_rows, test_rows) in enumerate(fold_splits, start=1):
    # Train file first, then test file, for each fold.
    for subset_name, rows in (('train', train_rows), ('test', test_rows)):
      output_file = destination_path + subset_name + str(fold_number) + '.csv'

      # Features and target are wrapped separately so each keeps its own
      # default 0-based column labels in the written header.
      subset = pd.concat(
          [pd.DataFrame(feature_values[rows]), pd.DataFrame(target_values[rows])],
          axis = 1)
      subset.to_csv(output_file, index=False)
 

In [0]:
#Preparing data for different models
# Every variant starts from the unchanged feature table. M2/M4/M6/M5 append one
# extra column holding a single feature raised to a power. Each variant is
# saved as a processed data set and then split into 10 train/test folds.

PROCESSED_DIR = '/content/drive/My Drive/Colab Notebooks/Processed Datasets/'
FCV_DIR = '/content/drive/My Drive/Colab Notebooks/FCV/'


def _export_model(model_features, model_id):
  """Save the processed data set for one model variant, then write its folds."""
  tag = 'Model' + model_id + '_random'
  normalizeDataset(model_features, y, tag)
  create_10Folds(PROCESSED_DIR + tag + '.csv', FCV_DIR + 'Model ' + model_id + '/')


# M1: the features as loaded, nothing added
M1 = features
_export_model(M1, '1')

# M2: adds the square of column 8 (temp)
temp = features.iloc[:, 8]
tempSquare = temp ** 2
M2 = pd.concat([features, tempSquare], axis = 1)
_export_model(M2, '2')

# M4: adds the 4th power of column 5 (DMC)
dmc = features.iloc[:, 5]
dmcQuad = dmc ** 4
M4 = pd.concat([features, dmcQuad], axis = 1)
_export_model(M4, '4')

# M6: adds the 6th power of column 0 (X)
X = features.iloc[:, 0]
XHex = X ** 6
M6 = pd.concat([features, XHex], axis = 1)
_export_model(M6, '6')

# M5: adds the 5th power of column 6 (DC)
dc = features.iloc[:, 6]
dcPent = dc ** 5
M5 = pd.concat([features, dcPent], axis = 1)
_export_model(M5, '5')

