#Importing mathematical libraries

In [None]:
import numpy as np
import pandas as pd
import statistics as sts
import math
from operator import itemgetter

#Preprocessing class

In [6]:
class Preprocessing:
  def __init__(self,fileName : str):
    self.data=pd.read_csv(fileName , header = None)  
    self.y=self.data.iloc[:,-1].values
    self.x=self.data.iloc[:,:-1].values
  
  def dataInfo(self):
    '''function to describe our dataset'''
    print(self.data.describe())
    print(self.data.info())

  def handleMissingValues(self,index : int,strategy : str
                            , missingValue  = float("nan") ) -> None :
      '''function to handle missing values using mean startegy.'''
      if strategy == 'mean' or strategy == 'Mean' or strategy == 'MEAN' :

        sum=0
        for i in self.x:
          if (i[index] != missingValue):
            sum+=float(i[index])
        meanX=sum/len(self.x)
        i=0
        while i<len(self.x):
          if self.x[i][index] == missingValue:
            self.x[i][index]=meanX
          i+=1
        self.x[:,index] = self.x[:,index].astype("float64") 
      else:
        discreteValues = dict()
        for i in self.x :
          if i[index] not in discreteValues.keys() :
            discreteValues[i[index]] = 1
          else:
            discreteValues[i[index]] += 1
        myMode = max(discreteValues,key = itemgetter(0))[0]
        i=0
        while i<len(self.x):
          if self.x[i][index] == missingValue:
            self.x[i][index]=myMode
          i+=1

  def encoding(self,index : int) -> None:
      '''function to convert the categorical data into binary form.'''
      distinctValues=list()
      for i in range(len(self.x)):
        if self.x[i][index] not in distinctValues:
          distinctValues.append(self.x[i][index])

      newList=[[0  for j in range(len(self.x))] for i in range (len(distinctValues))]
      for i in range(len(self.x)):
          for j in range(len(distinctValues)):
            if self.x[i][index]==distinctValues[j]: 
              newList[j][i]=1
      self.x = np.delete(self.x,index,axis=1)
      last_col = self.x[ : , -1]
      last_col = np.reshape(last_col,(len(last_col),1))
      self.x = np.delete(self.x,-1,axis = 1)
      for i in range(len(newList)):
        col= np.reshape(newList[i],(len(self.x),1))
        self.x = np.append(self.x,col,axis=1)
      self.x = np.append(self.x,last_col,axis = 1)

  def encodingBinaryData(self, index : int ) -> None :
      '''function to convert the binary categorical values  into binary bits.'''
      binData = list()
      while len(binData) < 2 :
        for i in range(len(self.x)):
          if self.x[i][index] not in binData :
            binData.append(self.x[i][index])
      for i in range(len(self.x)):
        if self.x[i][index] == binData[0] :
          self.x[i][index] = 1
        else :
          self.x[i][index] = 0

  def featureScaling(self ) -> None:
      '''function to normalizes the values.'''
      col = 0
      while col < len(self.x[0]):
        std = sts.stdev(self.x[:,col])
        avg = sts.mean(self.x[:,col])
        for i in range(len(self.x)) :
          self.x[i][col]= (self.x[i][col] - avg) / std

        col+=1

  def split(self, per : int) -> list :
      '''function to split the data into training and test set respectively'''
      train_len = int((per/100) * len(self.x))
      x_train = self.x[: train_len][ : ]
      y_train = self.x[train_len :][ : ]
      x_test = self.y[:train_len]
      y_test = self.y[train_len :]
      return [x_train,x_test,y_train,y_test]

#Testing our preprocessing class on crx.data dataset

[Click here to download crx.data dataset](https://drive.google.com/file/d/1oea34uxl_PlRLo35l-Zt9cFHnBn4AdRQ/view?usp=sharing)

In [7]:
fileObj = Preprocessing("crx.data")

In [9]:
fileObj.dataInfo()

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-

In [11]:
fileObj.data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [12]:
col_1 = [0,3,4,5,6,8,9,11,12]
for i in col_1 :
  fileObj.handleMissingValues(i,'Mode','?')
col_2 = [1,2,7,10,13,14]
for j in col_2 :
  fileObj.handleMissingValues(j,'Mean','?')

In [14]:
col_3 = [0,3,4,6,8,9,11,12]
for i in col_3 :
  fileObj.encodingBinaryData(i)

In [15]:
fileObj.data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [16]:
fileObj.encoding(5)

In [17]:
fileObj.data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [18]:
fileObj.featureScaling()

In [19]:
fileObj.data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [20]:
X_train, X_test, y_train, y_test = fileObj.split(70)

In [21]:
print(X_train)

[[0.6609583512060239 -0.061469072854104856 -0.9559197639988517 ...
  -0.28823935960160496 -0.12117990542976133 -0.19527168382016377]
 [-1.5107619456137686 2.2872062252679166 -0.06000699912150476 ...
  -0.28823935960160496 -0.12117990542976133 -0.08778819584001676]
 [-1.5107619456137686 -0.5954889951641763 -0.8554811132278487 ...
  -0.28823935960160496 -0.12117990542976133 -0.03711740864937603]
 ...
 [0.6609583512060239 -0.4267623214485139 -0.4115422768200154 ...
  -0.28823935960160496 -0.12117990542976133 -0.19527168382016377]
 [-1.5107619456137686 -1.2349630885465368 -0.8554811132278487 ...
  -0.28823935960160496 -0.12117990542976133 -0.18855396582140457]
 [0.6609583512060239 -0.6798523320220076 -0.32014310461840256 ...
  -0.28823935960160496 -0.12117990542976133 -0.1799168998229999]]


In [22]:
print(X_test)

['+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+

In [24]:
print(y_train)

[[-1.5107619456137686 -1.2003741204348264 0.9524146006502058 ...
  -0.28823935960160496 -0.12117990542976133 -0.19335233582051828]
 [0.6609583512060239 -0.6587614978075498 -0.8725556838589192 ...
  -0.28823935960160496 -0.12117990542976133 -0.19412007502037648]
 [0.6609583512060239 0.26248614067996734 -0.7389722783334852 ...
  -0.28823935960160496 -0.12117990542976133 -0.19527168382016377]
 ...
 [-1.5107619456137686 -0.5322164925208029 1.7559238068182301 ...
  3.4643107937023085 -0.12117990542976133 -0.19507974902019923]
 [0.6609583512060239 -1.1505997516887057 -0.9147399171827405 ...
  -0.28823935960160496 -0.12117990542976133 -0.0513205838467526]
 [0.6609583512060239 0.2903260418430515 -0.2779588712945813 ...
  -0.28823935960160496 -0.12117990542976133 -0.19527168382016377]]


In [25]:
print(y_test)

['-' '-' '-' '-' '-' '-' '-' '-' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+'
 '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '+' '-' '-' '-' '-' '-' '+' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '+' '+' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' '-' '-' '-']
