In [1]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
#read csv file
df = pd.read_csv('ParisHousing.csv')
df

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1726,89,0,1,5,73133,7,6,2009,0,1,9311,1698,218,0,4,176425.9
9996,44403,29,1,1,12,34606,9,4,1990,0,1,9061,1742,230,0,0,4448474.0
9997,83841,3,0,0,69,80933,10,10,2005,1,1,8304,7730,345,1,9,8390030.5
9998,59036,70,0,0,96,55856,1,3,2010,0,1,2590,6174,339,1,4,5905107.0


In [3]:
#show columns of the dataset
index = df.columns
print('Columns of ParisHousing.csv:\n----------------------------')
for _ in index:
    print(_)

Columns of ParisHousing.csv:
----------------------------
squareMeters
numberOfRooms
hasYard
hasPool
floors
cityCode
cityPartRange
numPrevOwners
made
isNewBuilt
hasStormProtector
basement
attic
garage
hasStorageRoom
hasGuestRoom
price


In [4]:
#dimension of dataset
print(f'Rows: {df.shape[0]}')
print(f'Columns: {df.shape[1]}')

Rows: 10000
Columns: 17


In [5]:
#data types
df.dtypes

squareMeters           int64
numberOfRooms          int64
hasYard                int64
hasPool                int64
floors                 int64
cityCode               int64
cityPartRange          int64
numPrevOwners          int64
made                   int64
isNewBuilt             int64
hasStormProtector      int64
basement               int64
attic                  int64
garage                 int64
hasStorageRoom         int64
hasGuestRoom           int64
price                float64
dtype: object

In [6]:
df.index

RangeIndex(start=0, stop=10000, step=1)

In [7]:
#check for null values
df.isnull().sum()

squareMeters         0
numberOfRooms        0
hasYard              0
hasPool              0
floors               0
cityCode             0
cityPartRange        0
numPrevOwners        0
made                 0
isNewBuilt           0
hasStormProtector    0
basement             0
attic                0
garage               0
hasStorageRoom       0
hasGuestRoom         0
price                0
dtype: int64

In [8]:
#missing values
df.isna().sum()

squareMeters         0
numberOfRooms        0
hasYard              0
hasPool              0
floors               0
cityCode             0
cityPartRange        0
numPrevOwners        0
made                 0
isNewBuilt           0
hasStormProtector    0
basement             0
attic                0
garage               0
hasStorageRoom       0
hasGuestRoom         0
price                0
dtype: int64

In [9]:
#duplicate values
df.duplicated().sum()

0

In [10]:
#to drop null values
df.dropna(inplace=True) #drop null values
df.drop_duplicates(inplace=True) #remove duplicates

df

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1726,89,0,1,5,73133,7,6,2009,0,1,9311,1698,218,0,4,176425.9
9996,44403,29,1,1,12,34606,9,4,1990,0,1,9061,1742,230,0,0,4448474.0
9997,83841,3,0,0,69,80933,10,10,2005,1,1,8304,7730,345,1,9,8390030.5
9998,59036,70,0,0,96,55856,1,3,2010,0,1,2590,6174,339,1,4,5905107.0


In [11]:
x = df.drop(['price'], axis=1)
y = df['price']
y

0       7559081.5
1       8085989.5
2       5574642.1
3       3232561.2
4       7055052.0
          ...    
9995     176425.9
9996    4448474.0
9997    8390030.5
9998    5905107.0
9999     146708.4
Name: price, Length: 10000, dtype: float64

In [12]:
x

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1726,89,0,1,5,73133,7,6,2009,0,1,9311,1698,218,0,4
9996,44403,29,1,1,12,34606,9,4,1990,0,1,9061,1742,230,0,0
9997,83841,3,0,0,69,80933,10,10,2005,1,1,8304,7730,345,1,9
9998,59036,70,0,0,96,55856,1,3,2010,0,1,2590,6174,339,1,4


In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=23, shuffle=True)

In [14]:
train_frame = [x_train, y_train]

In [15]:
df_train = pd.concat(train_frame, axis=1, ignore_index=True)

In [16]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
9391,66289,69,1,1,14,96579,2,5,2000,1,0,9744,2695,890,0,2,6635346.0
2730,55391,27,0,1,7,82603,4,1,1991,0,1,566,2845,777,0,2,5544879.1
1189,31810,81,0,0,20,48530,6,4,1994,0,0,1413,5512,137,0,5,3183159.4
7066,96134,2,0,0,61,97926,8,9,2016,0,1,8390,6136,400,1,1,9616397.6
8194,85490,75,0,1,63,64191,2,1,1992,1,1,392,3256,103,1,10,8554823.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,34666,75,1,1,14,28826,5,2,2002,1,1,9405,9154,186,1,6,3477313.2
6175,98767,58,1,0,80,4181,5,7,1997,1,0,7301,7232,452,0,10,9881688.7
9704,37717,79,1,1,34,94775,5,3,1998,0,1,1872,7805,620,0,7,3779961.8
9256,68738,8,0,1,74,40326,8,4,2010,0,0,8272,3502,852,0,4,6884205.0


In [19]:
#split dataset into multiple dataset
df

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1726,89,0,1,5,73133,7,6,2009,0,1,9311,1698,218,0,4,176425.9
9996,44403,29,1,1,12,34606,9,4,1990,0,1,9061,1742,230,0,0,4448474.0
9997,83841,3,0,0,69,80933,10,10,2005,1,1,8304,7730,345,1,9,8390030.5
9998,59036,70,0,0,96,55856,1,3,2010,0,1,2590,6174,339,1,4,5905107.0


In [23]:
df = df.sample(frac=1) #shuffle dataset

In [24]:
##split dataset into multiple dataset
df_train = df[:7000]
df_test = df[7000:]

In [25]:
df_train

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
3894,89671,71,0,0,15,19554,6,5,1994,0,0,9289,2312,583,0,0,8967749.4
3660,26872,88,1,0,24,11539,5,2,2001,1,1,511,3737,718,1,9,2692017.1
5889,69399,77,0,0,1,23261,2,6,2003,0,0,9456,8699,604,0,8,6940277.3
8519,78934,44,0,0,22,57594,4,3,1996,0,0,6556,7690,870,0,3,7895339.6
2726,51585,70,0,1,8,89259,8,7,2010,0,1,268,1459,831,1,4,5163838.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3908,67825,8,0,1,9,78465,9,5,2021,1,0,3764,1002,234,0,4,6787554.1
353,533,76,1,1,55,94923,6,8,2021,1,1,654,3585,301,0,9,65465.1
3993,49274,96,0,0,44,25305,5,5,2006,1,0,1721,8291,201,0,5,4932274.6
5092,33267,84,0,0,68,35693,1,6,2018,1,0,2950,8565,631,0,0,3329395.8


In [26]:
df_test

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
9357,69665,77,1,1,3,43581,4,6,2009,1,0,4784,6906,329,1,0,6972179.9
4664,846,51,0,0,51,48599,2,8,1995,0,0,302,1509,946,0,2,88673.5
4758,16847,47,1,0,66,34799,2,2,2009,0,0,8115,9158,212,1,7,1687484.9
4448,72147,38,1,0,13,48522,5,2,2013,0,0,285,3733,882,0,9,7219271.3
4741,25124,97,0,1,25,83534,6,9,2014,0,1,397,8988,962,0,3,2517156.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9169,39661,59,0,0,83,97719,1,9,2000,0,1,6114,6989,386,0,3,3974103.0
8134,84004,89,1,1,32,84934,9,1,2004,0,1,9296,4463,352,0,2,8409632.4
3639,12584,20,1,1,14,47048,7,3,1992,1,0,7808,8350,519,1,10,1266802.2
7031,63017,23,0,1,33,21265,10,10,1999,0,1,8223,5786,404,1,5,6307594.9


In [27]:
df_train.to_csv('train.csv', index=False)
df_test.to_csv('test.csv', index=False)