In [24]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import IsolationForest

In [33]:
# Load the raw CSV, encode categorical columns as numbers, shuffle the rows,
# and drop multivariate outliers flagged by an Isolation Forest.
# Result: `numericData_clean`, an all-numeric frame with a fresh 0..n-1 index.

# One seed for both the shuffle and the Isolation Forest, so that
# Restart Kernel -> Run All reproduces the same cleaned dataset.
RANDOM_SEED = 42

# Columns holding 'yes'/'no' strings that are encoded as 1/0 below.
BINARY_COLUMNS = ['mainroad', 'guestroom', 'basement',
                  'hotwaterheating', 'airconditioning', 'prefarea']

data = pd.read_csv('Data.csv')  # import the data

# Check for NULL values; if there are any, print the per-column counts.
cnt = data.isnull().sum()
if cnt.sum() == 0:
    print('No NULL values')
else:
    print('Number of NULL values: ' + str(cnt))

# Encode on an independent copy so the raw frame in `data` stays untouched
# (pd.DataFrame(data) does not make a deep copy).
encodedData = data.copy()
for column in BINARY_COLUMNS:
    # Any value other than 'yes'/'no' becomes NaN here and the row is
    # removed by the dropna() further down.
    encodedData[column] = encodedData[column].map({'yes': 1, 'no': 0})

# One-hot encode the remaining string columns. dtype=int is required:
# pandas >= 2.0 emits bool dummies by default, and bool columns are NOT
# matched by select_dtypes(include=[np.number]) below, so they would be
# silently dropped from the dataset.
encodedData = pd.get_dummies(encodedData, dtype=int)

# Shuffle data (seeded for reproducibility)
shuffledData = (
    encodedData
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

# Ensure all data is numeric and there are no missing values
numericData = shuffledData.select_dtypes(include=[np.number]).dropna()
numericData_np = numericData.to_numpy()

# Multivariate Outlier Detection using Isolation Forest.
# contamination=0.01 asks the model to flag roughly 1% of rows as outliers.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    random_state=RANDOM_SEED,
)
outliers = iso_forest.fit_predict(numericData_np)
outlier_indices = (outliers == -1)  # fit_predict labels outliers as -1
print(f'Number of identified outliers: {outlier_indices.sum()}/{len(numericData)}')

# Remove outliers from the dataset. Build a new frame instead of resetting
# the index in place on a boolean-indexed slice.
numericData_clean = numericData.loc[~outlier_indices].reset_index(drop=True)
print(f'Dataset size after outlier removal: {numericData_clean.shape[0]}')

# Optionally, view the cleaned data
print(numericData_clean)

No NULL values
Number of identified outliers: 6/545
Dataset size after outlier removal: 539
       price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0    4200000  5500         3          1        2         1          0   
1    6720000  5020         3          1        4         1          0   
2    2660000  2800         3          1        1         1          0   
3    4830000  4815         2          1        1         1          0   
4    3360000  4120         2          1        2         1          0   
..       ...   ...       ...        ...      ...       ...        ...   
534  7560000  6000         3          2        3         1          0   
535  5250000  8520         3          1        1         1          0   
536  3150000  1650         3          1        2         0          0   
537  3990000  4100         4          1        1         0          0   
538  7350000  6000         3          1        2         1          0   

     basement  hotwaterheating 

In [34]:
# Ten independent random 90/10 train/test splits of the cleaned data.
# Each split draws its own random sample, so the ten test sets may overlap;
# these are repeated hold-out splits, not ten disjoint folds.
TRAIN_FRACTION = 0.9
SPLIT_SEEDS = (90, 80, 70, 60, 50, 40, 30, 20, 10, 0)  # seed for split 1..10


def split_train_test(frame, seed, train_frac=TRAIN_FRACTION):
    """Randomly split ``frame`` into a train part and a test part.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to split. Its index labels must be unique, because the test
        part is obtained by dropping the sampled labels.
    seed : int
        ``random_state`` passed to ``DataFrame.sample``.
    train_frac : float, default TRAIN_FRACTION
        Fraction of rows sampled into the train part.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``; ``test`` holds every row of ``frame`` whose index
        label was not sampled into ``train``.
    """
    train = frame.sample(frac=train_frac, random_state=seed)
    test = frame.drop(train.index)
    return train, test


splits = [split_train_test(numericData_clean, seed) for seed in SPLIT_SEEDS]

# Every split must partition the cleaned data exactly.
assert all(
    len(train) + len(test) == len(numericData_clean) for train, test in splits
), 'train/test split does not cover the dataset exactly once'

# Keep the individual names that later cells refer to.
(
    (train1, test1), (train2, test2), (train3, test3), (train4, test4),
    (train5, test5), (train6, test6), (train7, test7), (train8, test8),
    (train9, test9), (train10, test10),
) = splits

In [35]:
# Write every train/test split to data/train<k>.csv and data/test<k>.csv.
OUTPUT_DIR = Path('data')
# to_csv does not create missing directories; without this the first write
# fails with OSError on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

split_pairs = [
    (train1, test1), (train2, test2), (train3, test3), (train4, test4),
    (train5, test5), (train6, test6), (train7, test7), (train8, test8),
    (train9, test9), (train10, test10),
]

for split_no, (train_part, test_part) in enumerate(split_pairs, start=1):
    # The index is written as the first (unnamed) CSV column, as before;
    # it holds each row's position in the cleaned dataset.
    train_part.to_csv(OUTPUT_DIR / f'train{split_no}.csv')
    test_part.to_csv(OUTPUT_DIR / f'test{split_no}.csv')
