In [1]:
# Mount Google Drive into the runtime at '/content/drive' so that the CSV
# files read by the later cells (under /content/drive/MyDrive/...) are reachable.
# This relies on the Colab-provided `google.colab` package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Importing required packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the person-heights CSV from the mounted drive into `data`
# and preview its first five rows.
data = pd.read_csv(
    '/content/drive/MyDrive/Datasets/Person_Heights_Data.csv'
)
data.head(n=5)

Unnamed: 0,name,height
0,mohan,5.9
1,maria,5.2
2,sakib,5.1
3,tao,5.5
4,virat,4.9


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric column(s).
# In the output, min (1.2) and max (14.5) lie far outside the quartile range
# (5.25 to 6.175), which hints at outliers on both ends.
data.describe()

Unnamed: 0,height
count,14.0
mean,6.05
std,2.779804
min,1.2
25%,5.25
50%,5.55
75%,6.175
max,14.5


In [6]:
# Upper cut-off for heights: the 95% quantile of the 'height' column.
# 95% of the 14 samples lie below this value (about 9.69), so any height
# above it is treated as an outlier.
max_limit = data.loc[:, 'height'].quantile(q=0.95)
max_limit

9.689999999999998

In [7]:
# Rows whose height exceeds the upper cut-off (the high-end outliers)
data.loc[data['height'].gt(max_limit)]

Unnamed: 0,name,height
9,imran,14.5


In [8]:
# Lower cut-off for heights: the 5% quantile of the 'height' column.
# Any height below this value (about 3.605) is treated as an outlier.
min_limit = data.loc[:, 'height'].quantile(q=0.05)
min_limit

3.6050000000000004

In [9]:
# Rows whose height falls under the lower cut-off (the low-end outliers)
data.loc[data['height'].lt(min_limit)]

Unnamed: 0,name,height
12,yoseph,1.2


In [11]:
# Obtain the outlier-free data set from the original data set.
# Outliers are the heights strictly above max_limit or strictly below
# min_limit, so a height exactly equal to a limit is NOT an outlier and must
# be kept. Series.between() is inclusive on both ends by default, which makes
# this filter the exact complement of the two outlier selections.
clean_data = data[data['height'].between(min_limit, max_limit)]
clean_data

Unnamed: 0,name,height
0,mohan,5.9
1,maria,5.2
2,sakib,5.1
3,tao,5.5
4,virat,4.9
5,khusbu,5.4
6,dmitry,6.2
7,selena,6.5
8,john,7.1
10,jose,6.1


In [12]:
# Load the Bangalore property-price CSV ('Bangalore_PropertyPrice_Data.csv')
# from the mounted drive into `df` and preview its first five rows.
df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/Bangalore_PropertyPrice_Data.csv'
)
df.head(n=5)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250


In [13]:
# Count the missing values in each column (isna is the same check as isnull)
df.isna().sum()

Unnamed: 0,0
location,0
size,0
total_sqft,0
bath,0
price,0
bhk,0
price_per_sqft,0


In [14]:
# Dimensions of the data frame as a (rows, columns) tuple
df.shape

(13200, 7)

In [15]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns.
# In the output, compare the min/max of total_sqft and price_per_sqft with
# their quartiles: the extremes are far outside the typical range, which
# points to outliers.
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13200.0,13200.0,13200.0,13200.0,13200.0
mean,1555.302783,2.691136,112.276178,2.800833,7920.337
std,1237.323445,1.338915,149.175995,1.292843,106727.2
min,1.0,1.0,8.0,1.0,267.0
25%,1100.0,2.0,50.0,2.0,4267.0
50%,1275.0,2.0,71.85,3.0,5438.0
75%,1672.0,3.0,120.0,3.0,7317.0
max,52272.0,40.0,3600.0,43.0,12000000.0


In [16]:
# Lower and upper cut-offs for 'price_per_sqft': the 0.001 (minimum) and
# 0.999 (maximum) quantiles, computed in one call and unpacked in that order.
# NOTE: this rebinds min_limit / max_limit, which the earlier cells set for
# the heights data.
min_limit, max_limit = df.loc[:, 'price_per_sqft'].quantile(q=[0.001, 0.999]).tolist()
min_limit, max_limit

(1366.184, 50959.36200000098)

In [17]:
# Entries whose price_per_sqft falls under the lower cut-off (min_limit)
df.loc[df['price_per_sqft'].lt(min_limit)]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
665,Yelahanka,3 BHK,35000.0,3.0,130.0,3,371
798,other,4 Bedroom,10961.0,4.0,80.0,4,729
1867,other,3 Bedroom,52272.0,2.0,140.0,3,267
2392,other,4 Bedroom,2000.0,3.0,25.0,4,1250
3934,other,1 BHK,1500.0,1.0,19.5,1,1300
5343,other,9 BHK,42000.0,8.0,175.0,9,416
5417,Ulsoor,4 BHK,36000.0,4.0,450.0,4,1250
5597,JP Nagar,2 BHK,1100.0,1.0,15.0,2,1363
7166,Yelahanka,1 Bedroom,26136.0,1.0,150.0,1,573
7862,JP Nagar,3 BHK,20000.0,3.0,175.0,3,875


In [18]:
# Entries whose price_per_sqft exceeds the upper cut-off (max_limit)
df.loc[df['price_per_sqft'].gt(max_limit)]

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
345,other,3 Bedroom,11.0,3.0,74.0,3,672727
1005,other,1 BHK,15.0,1.0,30.0,1,200000
1106,other,5 Bedroom,24.0,2.0,150.0,5,625000
4044,Sarjapur Road,4 Bedroom,1.0,4.0,120.0,4,12000000
4924,other,7 BHK,5.0,7.0,115.0,7,2300000
5911,Mysore Road,1 Bedroom,45.0,1.0,23.0,1,51111
6356,Bommenahalli,4 Bedroom,2940.0,3.0,2250.0,4,76530
7012,other,1 BHK,650.0,1.0,500.0,1,76923
7575,other,1 BHK,425.0,1.0,750.0,1,176470
7799,other,4 BHK,2000.0,3.0,1063.0,4,53150


In [19]:
# Create a new data frame that is free from outliers and store it in cleaned_df.
# Outliers are the rows with price_per_sqft strictly below min_limit or
# strictly above max_limit, so a value exactly equal to a limit is NOT an
# outlier and must be kept. Series.between() is inclusive on both ends by
# default, which makes this filter the exact complement of the two outlier
# selections.
cleaned_df = df[df['price_per_sqft'].between(min_limit, max_limit)]
cleaned_df

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250
...,...,...,...,...,...,...,...
13195,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689
13196,other,4 BHK,3600.0,5.0,400.00,4,11111
13197,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258
13198,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407


In [20]:
# DataFrame.sample(n) returns n randomly chosen rows.
# For example, draw 7 random rows from the cleaned data
# (the selection differs on every run):
cleaned_df.sample(n=7)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
9917,Chikka Tirupathi,3 Bedroom,2325.0,3.0,95.0,3,4086
9141,other,4 Bedroom,2400.0,4.0,89.0,4,3708
2476,Hosakerehalli,4 BHK,1500.0,3.0,70.0,4,4666
5885,Kanakpura Road,3 BHK,1450.0,3.0,62.4,3,4303
6784,other,2 Bedroom,1230.0,2.0,75.0,2,6097
12910,other,4 BHK,2710.0,5.0,142.0,4,5239
3650,Haralur Road,2 BHK,1300.0,2.0,79.0,2,6076
