# Importing libraries

In [None]:
import numpy as np
import pandas as pd # Data Manipulation, Data Cleaning, Data Exploration, Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Importing dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv("/content/Housing.csv")

In [None]:
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


# Data Analysis Part

In [None]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [None]:
#    total values (entries) - non-null values = null values
#    545 - 545 = 0,   so zero null values in dataset

In [None]:
# null data checking 
df.isnull()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,False,False,False,False,False,False,False,False,False,False,False,False,False
541,False,False,False,False,False,False,False,False,False,False,False,False,False
542,False,False,False,False,False,False,False,False,False,False,False,False,False
543,False,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
df.isnull().any()   # it rturn bool value


price               False
area                False
bedrooms            False
bathrooms           False
stories             False
mainroad            False
guestroom           False
basement            False
hotwaterheating     False
airconditioning     False
parking             False
prefarea            False
furnishingstatus    False
dtype: bool

In [None]:
df.head(10)


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
5,10850000,7500,3,3,1,yes,no,yes,no,yes,2,yes,semi-furnished
6,10150000,8580,4,3,4,yes,no,no,no,yes,2,yes,semi-furnished
7,10150000,16200,5,3,2,yes,no,no,no,no,0,no,unfurnished
8,9870000,8100,4,1,2,yes,yes,yes,no,yes,2,yes,furnished
9,9800000,5750,3,2,4,yes,yes,no,no,yes,1,yes,unfurnished


In [None]:
df.tail(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
535,2100000,3360,2,1,1,yes,no,no,no,no,1,no,unfurnished
536,1960000,3420,5,1,2,no,no,no,no,no,0,no,unfurnished
537,1890000,1700,3,1,2,yes,no,no,no,no,0,no,unfurnished
538,1890000,3649,2,1,1,yes,no,no,no,no,0,no,unfurnished
539,1855000,2990,2,1,1,no,no,no,no,no,1,no,unfurnished
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished
544,1750000,3850,3,1,2,yes,no,no,no,no,0,no,unfurnished


In [None]:
# creating mapping 
# converting string value to numerical value

dataMappingMainroad = {
    
    "yes": 1,
     "no": 0  
    
}

dataMappingGuestroom = {
    "yes": 1,
    "no": 0
    
}

dataMappingBasement = {
    "yes": 1,
    "no": 0
}

dataMappingHotwaterheating = {
    "yes": 1,
    "no": 0    
}

dataMappingAirconditioning = {
    "yes": 1,
    "no": 0   

    
}


dataMappingPrefarea = {
    "yes": 1,
    "no": 0

}

dataMappingFurnishingstatus = {
    
    "furnished": 3,
    "semi-furnished": 2,
    "unfurnished": 0
    
}




In [None]:
df['numMainroad'] = df['mainroad'].map(dataMappingMainroad)
df['numGuestroom'] = df['guestroom'].map(dataMappingGuestroom)
df['numBasement'] = df['basement'].map(dataMappingBasement)

df['numHotwaterheating'] = df['hotwaterheating'].map(dataMappingHotwaterheating)
df['numAirconditioning'] = df['airconditioning'].map(dataMappingAirconditioning)
df['numPrefarea'] = df['prefarea'].map(dataMappingPrefarea)

df['numFurnishingStatus'] = df['furnishingstatus'].map(dataMappingFurnishingstatus)

df


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,numMainroad,numGuestroom,numBasement,numHotwaterheating,numAirconditioning,numPrefarea,numFurnishingStatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished,1,0,0,0,1,1,3
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished,1,0,0,0,1,0,3
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished,1,0,1,0,0,1,2
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished,1,0,1,0,1,1,3
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished,1,1,1,0,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished,1,0,1,0,0,0,0
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished,0,0,0,0,0,0,2
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished,1,0,0,0,0,0,0
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished,0,0,0,0,0,0,3


In [None]:
df.shape      # rtuen no. of rows and columns

(545, 20)

In [None]:
df.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,numMainroad,numGuestroom,numBasement,numHotwaterheating,numAirconditioning,numPrefarea,numFurnishingStatus
count,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578,0.858716,0.177982,0.350459,0.045872,0.315596,0.234862,1.60367
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586,0.348635,0.382849,0.477552,0.209399,0.46518,0.424302,1.186933
min,1750000.0,1650.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0


In [None]:
df.columns    # it retuns colmns title name


Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus', 'numMainroad',
       'numGuestroom', 'numBasement', 'numHotwaterheating',
       'numAirconditioning', 'numPrefarea', 'numFurnishingStatus'],
      dtype='object')

In [None]:
df['price'].isnull().any()

False

In [None]:
df['area'].isnull().any()

False

In [None]:
df['bedrooms'].isnull().any()

False

In [None]:
df['bathrooms'].isnull().any()

False

In [None]:
df['stories'].isnull().any()

False

In [None]:
df['mainroad'].isnull().any()

False

In [None]:
df['guestroom'].isnull().any()


False

In [None]:
df['basement'].isnull().any()


False

In [None]:
df['hotwaterheating'].isnull().any()

False

In [None]:
df['airconditioning'].isnull().any()

False

In [None]:
df['parking'].isnull().any()


False

In [None]:
df['prefarea'].isnull().any()

False

In [None]:
df['furnishingstatus'].isnull().any()

False

In [None]:
df['numGuestroom'].isnull().any()


False

In [None]:
df['numBasement'].isnull().any()

False

In [None]:
df['numHotwaterheating'].isnull().any()

False

In [None]:
df['numAirconditioning'].isnull().any()

False

In [None]:
df['numPrefarea'].isnull().any()

False

In [None]:
df['numFurnishingStatus'].isnull().any()

False

# Machine Learning part

In [None]:
df.isnull().any() 

price                  False
area                   False
bedrooms               False
bathrooms              False
stories                False
mainroad               False
guestroom              False
basement               False
hotwaterheating        False
airconditioning        False
parking                False
prefarea               False
furnishingstatus       False
numMainroad            False
numGuestroom           False
numBasement            False
numHotwaterheating     False
numAirconditioning     False
numPrefarea            False
numFurnishingStatus    False
dtype: bool

In [None]:
# feature variable 
X = df[['area', 'bedrooms', 'bathrooms', 'stories','parking',  'numMainroad', 'numGuestroom', 'numBasement', 
        'numHotwaterheating', 'numAirconditioning', 'numPrefarea', 'numFurnishingStatus' ]]  
y = df.price # target variable


In [None]:
df.ndim

2

In [None]:
X.ndim

2

In [None]:
y.ndim

1

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20 , random_state = 0)

In [None]:
len(X_train)

436

In [None]:
len(X_test)

109

In [None]:
model = LinearRegression()

In [None]:
model.fit(X_train, y_train) # np.array([X_train]).reshape(-1,1)

LinearRegression()

In [None]:
y_pred = model.predict(X_train) # np.array([X_test]).reshape(-1,1)

In [None]:
y_pred      # model predicted

array([2602013.4997675 , 2698252.60406492, 2017788.87225577,
       2596948.28375185, 4648898.09889123, 3151703.66375143,
       4466777.94362656, 7021264.64305091, 2607078.71578315,
       4245007.93439818, 8418269.6009044 , 2446743.78277267,
       5213152.42487392, 2730925.36325382, 4864890.5763216 ,
       2679780.33551624, 3246055.10676112, 7412885.15232944,
       5086578.68664967, 5075274.33395007, 3739259.30839929,
       4666844.05611724, 3225979.22688705, 5525989.85616987,
       2607078.71578315, 4725672.17594089, 8587591.40114406,
       4474802.66445172, 2967458.29048441, 4272049.40606109,
       2007658.44022447, 4898741.01099204, 4976166.71719998,
       6273304.75315391, 4824839.8983315 , 6584283.4640267 ,
       7297922.53095594, 4822955.0998762 , 7318242.95911165,
       6363566.60298997, 3768634.3137576 , 6007918.3976062 ,
       3250898.47857085, 2656589.34303379, 6840805.79268926,
       5877117.53692747, 5718398.58271225, 9736018.19052191,
       7648362.55890612,

In [None]:
y_test    # actual value

239    4585000
113    6083000
325    4007500
66     6930000
479    2940000
        ...   
76     6650000
132    5810000
311    4123000
464    3080000
155    5530000
Name: price, Length: 109, dtype: int64

In [None]:
# ['area', 'bedrooms', 'bathrooms', 'stories',
#     'parking',  'numMainroad', 'numGuestroom', 'numBasement', 
#       'numHotwaterheating', 'numAirconditioning', 'numPrefarea', 'numFurnishingStatus' ]

model.predict([[  7850, 3, 2, 2, 
                2, 0, 1, 0, 
                0, 1, 1, 2
                ]])


  "X does not have valid feature names, but"


array([7525640.88173442])

In [None]:
model.score(X,y)

0.678410259073752