In [247]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [248]:
# Read rent_data.csv (path is relative to the working directory) into the
# DataFrame that every later cell works on.
df = pd.read_csv(filepath_or_buffer='rent_data.csv')

In [249]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5928 entries, 0 to 5927
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   district       5928 non-null   object
 1   neighborhood   5928 non-null   object
 2   room           5928 non-null   int64 
 3   living_room    5928 non-null   int64 
 4   area           5928 non-null   int64 
 5   age            5928 non-null   int64 
 6   floor          5928 non-null   int64 
 7   price          5928 non-null   int64 
 8   area_per_room  5928 non-null   int64 
dtypes: int64(7), object(2)
memory usage: 416.9+ KB
None


In [250]:
# Set every dtype in a single astype() call, with explicit integer widths.
# A bare 'int' maps to NumPy's platform default integer (32-bit on Windows
# with NumPy < 2, 64-bit elsewhere), so the resulting dtypes differed from
# machine to machine; 'int32' pins the same width everywhere.
# NOTE(review): int32 tops out at 2_147_483_647 -- confirm that no column
# (price in particular) can exceed that in the raw data.
# area_per_room is not listed, so it keeps the dtype it was read with.
df = df.astype({
    'district': 'category',
    'neighborhood': 'category',
    'room': 'int32',
    'living_room': 'int32',
    'area': 'int32',
    'age': 'int32',
    'floor': 'int32',
    'price': 'int32',
})

In [251]:
# Check the dtypes after the cast. info() prints on its own and returns None,
# so it is not wrapped in print() (that would add a stray "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5928 entries, 0 to 5927
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   district       5928 non-null   category
 1   neighborhood   5928 non-null   category
 2   room           5928 non-null   int32   
 3   living_room    5928 non-null   int32   
 4   area           5928 non-null   int32   
 5   age            5928 non-null   int32   
 6   floor          5928 non-null   int32   
 7   price          5928 non-null   int32   
 8   area_per_room  5928 non-null   int64   
dtypes: category(2), int32(6), int64(1)
memory usage: 224.5 KB
None


In [252]:
# Tukey fences for every numeric column:
#   lower = Q1 - 1.5 * IQR,  upper = Q3 + 1.5 * IQR
# `columns`, `min_values` and `max_values` are positionally aligned and are
# read by the filtering cell that follows.
#
# NOTE(review): when a column's IQR is 0 (Q1 == Q3) both fences equal that
# quartile, so the later filter keeps only rows holding exactly that value.
# Confirm this is intended for low-cardinality columns.
columns = df.select_dtypes(include=[np.number]).columns

# Both quartiles for all numeric columns at once (rows: 0.25 and 0.75).
quartiles = df[columns].quantile([0.25, 0.75])
first_quartile = quartiles.loc[0.25]
third_quartile = quartiles.loc[0.75]
spread = third_quartile - first_quartile

min_values = list(first_quartile - 1.5 * spread)
max_values = list(third_quartile + 1.5 * spread)

for column, min_value, max_value in zip(columns, min_values, max_values):
    print(f"Column: {column}, min: {min_value}, max: {max_value}")

Column: room, min: -2.0, max: 6.0
Column: living_room, min: 1.0, max: 1.0
Column: area, min: 0.0, max: 200.0
Column: age, min: -25.0, max: 63.0
Column: floor, min: -3.5, max: 8.5
Column: price, min: -25000.0, max: 95000.0
Column: area_per_room, min: 13.0, max: 53.0


In [253]:
# Keep only the rows that lie inside every column's [min, max] fence (both
# ends inclusive). The fences were computed before any row was dropped, so a
# single combined mask selects the same rows as filtering column by column.
inside_fences = pd.Series(True, index=df.index)
for column, lower, upper in zip(columns, min_values, max_values):
    inside_fences &= (df[column] >= lower) & (df[column] <= upper)
df = df[inside_fences]

In [254]:
# Row count and dtypes after the fence filter. info() prints on its own and
# returns None, so it is not wrapped in print() (that would add a stray
# "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4501 entries, 0 to 5926
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   district       4501 non-null   category
 1   neighborhood   4501 non-null   category
 2   room           4501 non-null   int32   
 3   living_room    4501 non-null   int32   
 4   area           4501 non-null   int32   
 5   age            4501 non-null   int32   
 6   floor          4501 non-null   int32   
 7   price          4501 non-null   int32   
 8   area_per_room  4501 non-null   int64   
dtypes: category(2), int32(6), int64(1)
memory usage: 210.8 KB
None


In [255]:
# Summary statistics of the numeric columns after the fence filter. The
# percentiles are spelled out; they are the ones describe() uses by default.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,room,living_room,area,age,floor,price,area_per_room
count,4501.0,4501.0,4501.0,4501.0,4501.0,4501.0,4501.0
mean,2.013997,1.0,96.252166,21.361697,2.179071,31988.976894,31.723395
std,0.767698,0.0,30.004551,13.917257,2.329696,17308.437236,5.9248
min,1.0,1.0,27.0,0.0,-3.0,1300.0,13.0
25%,1.0,1.0,75.0,8.0,1.0,20000.0,28.0
50%,2.0,1.0,90.0,25.0,2.0,27000.0,31.0
75%,3.0,1.0,115.0,30.0,4.0,40000.0,35.0
max,6.0,1.0,200.0,63.0,8.0,95000.0,53.0


In [256]:
# Hard lower cut on price: keep rows priced at 10000 or more.
at_or_above_floor = df['price'] >= 10000
df = df.loc[at_or_above_floor]

In [257]:
# Hard upper cut on price: keep rows priced at 70000 or less.
at_or_below_ceiling = df['price'] <= 70000
df = df.loc[at_or_below_ceiling]

In [258]:
# Row count and dtypes after the price cuts. info() prints on its own and
# returns None, so it is not wrapped in print() (that would add a stray
# "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4247 entries, 2 to 5926
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   district       4247 non-null   category
 1   neighborhood   4247 non-null   category
 2   room           4247 non-null   int32   
 3   living_room    4247 non-null   int32   
 4   area           4247 non-null   int32   
 5   age            4247 non-null   int32   
 6   floor          4247 non-null   int32   
 7   price          4247 non-null   int32   
 8   area_per_room  4247 non-null   int64   
dtypes: category(2), int32(6), int64(1)
memory usage: 200.1 KB
None


In [259]:
# Final summary statistics. Left as the cell's last expression so the notebook
# renders it as a table, the same way the earlier describe() cell does,
# instead of print()'s wrapped plain-text dump.
df.describe()

              room  living_room         area          age        floor  \
count  4247.000000       4247.0  4247.000000  4247.000000  4247.000000   
mean      1.984224          1.0    94.632917    21.525783     2.151872   
std       0.752440          0.0    28.763621    13.844801     2.320577   
min       1.000000          1.0    27.000000     0.000000    -3.000000   
25%       1.000000          1.0    75.000000     8.000000     1.000000   
50%       2.000000          1.0    90.000000    25.000000     2.000000   
75%       2.000000          1.0   110.000000    30.000000     4.000000   
max       5.000000          1.0   200.000000    63.000000     8.000000   

              price  area_per_room  
count   4247.000000    4247.000000  
mean   30012.723334      31.538733  
std    13709.090454       5.838165  
min    10000.000000      13.000000  
25%    20000.000000      28.000000  
50%    26000.000000      31.000000  
75%    37500.000000      35.000000  
max    70000.000000      53.000000  


In [260]:
# Write the filtered frame to data_cleaned.csv (path is relative to the
# working directory); index=False leaves the row index out of the file.
df.to_csv(path_or_buf='data_cleaned.csv', index=False)