# Libraries

In [1]:
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import os
# Walk the Kaggle input directory and print the full path of every file
# found beneath it (nothing is printed if the directory does not exist).
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# How to extract the dataset/dataframe from a URL

* import the libraries needed to read the text from a URL
* then check the data by displaying it

Only a few lines are shown to avoid cluttering the notebook.

In [2]:
import requests

# Download the raw text of the dataset. A timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the notebook
# on an HTTP error instead of silently parsing an error page as data.
response = requests.get('http://lib.stat.cmu.edu/datasets/boston', timeout=30)
response.raise_for_status()
data = response.text

# Preview only the beginning (lines 0-23) and the end (lines after 1020) of
# the text, each prefixed with its line index.
for i, line in enumerate(data.split('\n')):
    if i < 24 or i > 1020:
        print(f'{i}   {line}')

0    The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
1    prices and the demand for clean air', J. Environ. Economics & Management,
2    vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
3    ...', Wiley, 1980.   N.B. Various transformations are used in the table on
4    pages 244-261 of the latter.
5   
6    Variables in order:
7    CRIM     per capita crime rate by town
8    ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
9    INDUS    proportion of non-retail business acres per town
10    CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
11    NOX      nitric oxides concentration (parts per 10 million)
12    RM       average number of rooms per dwelling
13    AGE      proportion of owner-occupied units built prior to 1940
14    DIS      weighted distances to five Boston employment centres
15    RAD      index of accessibility to radial highways
16    TAX      full-value proper

In [3]:
# enable the two lines below to view the entire text data
# for i, line in enumerate(data.split('\n')):
#     print(f'{i}   {line}' )

**Analysis:**
* the "Variables in order:" header is on line 6; the column names are listed on lines 7 to 20
* data runs from line 22 to the second-to-last line
* **Note**: the last line is empty, so consider only up to the last-but-one line

## Extract Columns
* Using regular expressions to extract the columns from the text

In [4]:
# Collect the column names from the variable-description lines (indices 7-20).
# Each of those lines starts with optional whitespace followed by an
# upper-case name; that leading run of capitals is captured as the column name.
# The pattern is a raw string so that `\s` reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape.
column_pattern = re.compile(r'^\s*([A-Z]+)')

columns = []
for i, line in enumerate(data.split('\n')[:-1]):
    if 6 < i < 21:
        # Match once and reuse the result rather than running the regex twice.
        match = column_pattern.match(line)
        if match:
            columns.append(match.group(1))
columns

['CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NOX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT',
 'MEDV']

## Extract data
* now extracting the data into a dataframe

In [5]:
# Parse the numeric section of the text into `l`, a dict that maps the index
# of the line where a record starts to the list of numeric tokens (kept as
# strings) belonging to that record.
#
# Lines up to index 21 are skipped. A line holding more than three numeric
# tokens opens a new record; any other line is treated as a continuation and
# its tokens are appended to the record keyed by the immediately preceding
# line index.
l = {}
for line_no, text in enumerate(data.split('\n')[:-1]):
    if line_no <= 21:
        continue

    tokens = re.findall('[0-9.]+', text)
    starts_new_record = len(tokens) > 3

    if starts_new_record:
        l[line_no] = tokens
    else:
        l[line_no - 1].extend(tokens)

In [6]:
# Peek at the first three parsed records stored in dictionary l
# (dict values come back in the same insertion order as the keys).
for record in list(l.values())[:3]:
    print(record)

['0.00632', '18.00', '2.310', '0', '0.5380', '6.5750', '65.20', '4.0900', '1', '296.0', '15.30', '396.90', '4.98', '24.00']
['0.02731', '0.00', '7.070', '0', '0.4690', '6.4210', '78.90', '4.9671', '2', '242.0', '17.80', '396.90', '9.14', '21.60']
['0.02729', '0.00', '7.070', '0', '0.4690', '7.1850', '61.10', '4.9671', '2', '242.0', '17.80', '392.83', '4.03', '34.70']


## Create DataFrame
* now combine the columns with the data

In [7]:
# Assemble the parsed records into a DataFrame in a single constructor call.
# Growing a frame one row at a time with `df.loc[i, :] = row` re-allocates on
# every insert; building from the full list of rows avoids that and yields the
# same 0..n-1 index the original obtained via reset_index(drop=True).
rows = list(l.values())

# Fail loudly if any record does not have exactly one value per column,
# rather than letting a mis-parsed line slip into the table.
unexpected_lengths = sorted({len(row) for row in rows} - {len(columns)})
if unexpected_lengths:
    raise ValueError(
        f'expected {len(columns)} values per row, '
        f'found rows with lengths {unexpected_lengths}'
    )

# The parsed tokens are strings; convert every column to a numeric dtype so
# the frame can be used for computation (integer-looking columns become
# integers, the rest become floats).
df = pd.DataFrame(rows, columns=columns).apply(pd.to_numeric)
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98,24.00
1,0.02731,0.00,7.070,0,0.4690,6.4210,78.90,4.9671,2,242.0,17.80,396.90,9.14,21.60
2,0.02729,0.00,7.070,0,0.4690,7.1850,61.10,4.9671,2,242.0,17.80,392.83,4.03,34.70
3,0.03237,0.00,2.180,0,0.4580,6.9980,45.80,6.0622,3,222.0,18.70,394.63,2.94,33.40
4,0.06905,0.00,2.180,0,0.4580,7.1470,54.20,6.0622,3,222.0,18.70,396.90,5.33,36.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.00,11.930,0,0.5730,6.5930,69.10,2.4786,1,273.0,21.00,391.99,9.67,22.40
502,0.04527,0.00,11.930,0,0.5730,6.1200,76.70,2.2875,1,273.0,21.00,396.90,9.08,20.60
503,0.06076,0.00,11.930,0,0.5730,6.9760,91.00,2.1675,1,273.0,21.00,396.90,5.64,23.90
504,0.10959,0.00,11.930,0,0.5730,6.7940,89.30,2.3889,1,273.0,21.00,393.45,6.48,22.00


In [23]:
# Save the table as CSV without the index column.
# The path is built with os.path.join instead of a backslash literal: the
# original string relied on invalid escape sequences (`\i`, `\B`) and only
# resolved to ../input/ on Windows.
output_path = os.path.join('..', 'input', 'Boston-house-price-data.csv')

# `line_terminator` was renamed to `lineterminator` in pandas 1.5 and the old
# spelling was removed in pandas 2.0, where it raises TypeError.
# NOTE(review): the bare carriage return '\r' is kept from the original call;
# confirm that consumers of this file really expect CR-only line endings.
df.to_csv(output_path, index=False, lineterminator='\r')

# Happy learning!!!