In [0]:
import numpy as np
import pandas as pd


## Read from text files

In [0]:
# Call template only: replace the {placeholders} with real values before uncommenting.
# pd.read_csv({filename},sep={sep},encoding={encoding})

## Read from relational database

In [0]:
import os

import pymysql

# Create a connection object 'conn' to the local MySQL server.
# The password is read from the MYSQL_PASSWORD environment variable when it is
# set; the literal is kept only as a fallback so the demo behaves as before.
conn = pymysql.connect(host="localhost",
                       user="root",
                       passwd=os.environ.get("MYSQL_PASSWORD", "12345"),
                       db="information_schema")

try:
    # The cursor is used as a context manager so it is closed on exit,
    # even if the query raises.
    with conn.cursor() as c:
        # execute the query using c.execute
        c.execute("select * from engines;")

        # fetchall() returns every row of the result set (a sequence of tuples);
        # to get only the first row, use c.fetchone() instead
        all_rows = c.fetchall()
finally:
    # Always release the server connection, whether or not the query succeeded.
    conn.close()

# Build a DataFrame from the fetched rows, naming the six columns explicitly.
df = pd.DataFrame(list(all_rows), columns=["engine", "support", "comment",
                                           "transactions", "XA", "savepoints"])
df.head()

## Read from websites - Web scraping

In [0]:
# 'BeautifulSoup' library is used to parse HTML files
import requests, bs4

# getting HTML data from the Google play web page
url = "https://play.google.com/store/apps/details?id=com.facebook.orca&hl=en"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
req = requests.get(url, timeout=30)
# Surface HTTP errors (4xx/5xx) here instead of silently parsing an error page.
req.raise_for_status()

# create a bs4 object
# To avoid warnings, provide "html5lib" explicitly
soup = bs4.BeautifulSoup(req.text, "html5lib")
#print(soup)

# getting all the elements with class = "review-body"
reviews = soup.select('.review-body')
print(type(reviews))
print(len(reviews))
print("\n")

# printing an element of the reviews list; guard the index because the page
# may yield fewer than 7 matches (or none at all)
if len(reviews) > 6:
    print(reviews[6])
else:
    print("Fewer than 7 '.review-body' elements were found; nothing to print at index 6.")

## Read from APIs

In [0]:
import requests, json

# Make the request with the coordinates of San Francisco.
parameters = {"lat": 37.78, "lon": -122.41}
# A timeout keeps the cell from hanging if the service does not answer.
response = requests.get("http://api.open-notify.org/iss-pass.json",
                        params=parameters, timeout=30)
# Raise a clear HTTPError on a 4xx/5xx reply instead of failing later while
# decoding a non-JSON error body.
response.raise_for_status()

# Get the response data as a python object.  Verify that it's a dictionary.
data = response.json()
print(type(data))
print(data)

print(response.headers)
print(response.headers["content-type"])


# Get the response from the second API endpoint (same timeout / status check).
response = requests.get("http://api.open-notify.org/astros.json", timeout=30)
response.raise_for_status()
data = response.json()

# "number" is the count of people in space reported at request time
# (the value changes between runs, so it is not hard-coded here).
print(data["number"])
print(data)

## Reading data from PDF files

In [0]:
# Module names are case-sensitive: the package is imported as "PyPDF2".
import PyPDF2

# Mounting Google Drive locally

In [0]:
# Colab-only step, left commented out.
# NOTE(review): the CSV loaded in the "Cleaning datasets" section lives under
# /content/gdrive, so this mount presumably has to be enabled when running on
# Colab — confirm.
# from google.colab import drive
# drive.mount('/content/gdrive')

# Cleaning datasets

In [0]:
import numpy as np
import pandas as pd

# Location of the Melbourne housing CSV on the mounted Google Drive.
melbourne_csv = "/content/gdrive/My Drive/Colab Notebooks/melbourne.csv"

# Load the dataset and preview its first rows.
df = pd.read_csv(melbourne_csv)
df.head()

In [0]:
# Dimensions of the frame as (rows, columns).
print(df.shape)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
df.info()

In [0]:
# Boolean mask with the same shape as df: True where a value is missing.
null_mask = df.isnull()

# Only the last expression of a cell is rendered, so the intermediate results
# are printed explicitly instead of being computed and discarded.
print("Missing-value mask (first rows):")
print(null_mask.head())

# summing up the missing values (column-wise)
print("\nMissing values per column:")
print(null_mask.sum())

# columns having at least one missing value
print("\nColumns with at least one missing value:")
print(null_mask.any())

# above is equivalent to axis=0 (by default, any() operates on columns)
null_mask.any(axis=0) #axis=0 => column-wise, axis=1 => row-wise

In [0]:
# Flag rows in which every column is missing.
rows_all_missing = df.isnull().all(axis=1)

# Print the count explicitly: a bare expression in the middle of a cell is
# never displayed. 0 => there are no rows with all column values missing.
print("Rows with all values missing:", rows_all_missing.sum())

# number of missing values in each row
df.isnull().sum(axis=1)

## Treat missing values


1.   Do nothing if the algorithm doesn't complain about missing values
2.   Delete them
3.   Replace with value such as Mean, median, mode, etc. 



In [0]:
# Summing up the missing values (column-wise) %
# Percentage of missing entries per column, rounded to two decimals.
df.isnull().sum().div(len(df.index)).mul(100).round(2)

In [0]:
# Columns found to have more than 30% missing values; drop them in one call.
sparse_columns = ['BuildingArea', 'YearBuilt', 'CouncilArea']
df = df.drop(columns=sparse_columns)


In [0]:
 # Percentage of missing entries per column after dropping the sparse columns.
df.isnull().sum().div(len(df.index)).mul(100).round(2)

In [0]:
# check rows with more than 5 missing values
# The per-row count is computed once and reused instead of being rebuilt for
# every expression.
missing_per_row = df.isnull().sum(axis=1)
sparse_rows = df[missing_per_row > 5]

# Intermediate results are printed because only the last expression of a cell
# is displayed.
print(sparse_rows.head())
print("Rows with more than 5 missing values:", len(sparse_rows.index))

# share of such rows, as a percentage of all rows
100*(len(sparse_rows.index)/len(df.index))

In [0]:
# Keep only the rows that have at most 5 missing values.
missing_per_row = df.isnull().sum(axis=1)
df = df.loc[missing_per_row <= 5]

# Percentage of missing entries per column, rounded to two decimals.
df.isnull().sum().div(len(df.index)).mul(100).round(2)

In [0]:
# Discard the rows whose Price is NaN.
price_missing = np.isnan(df['Price'])
df = df.loc[~price_missing]

# Percentage of missing entries per column, rounded to two decimals.
df.isnull().sum().div(len(df.index)).mul(100).round(2)

In [0]:
# Summary statistics of Landsize, to judge whether imputation is sensible.
landsize = df['Landsize']
landsize.describe()

Notice that min is 0, max is 433014, the mean is 558, median (50%) is 440. There's a significant variation in the 25th and 75th percentile as well. (176 to 651)

Thus imputing this with mean/median seems quite biased, and so we should remove the NaNs.

In [0]:
# Discard the rows whose Landsize is NaN.
landsize_missing = np.isnan(df['Landsize'])
df = df.loc[~landsize_missing]

# Percentage of missing entries per column, rounded to two decimals.
df.isnull().sum().div(len(df.index)).mul(100).round(2)

A few columns still have a small share of missing values: Bathroom, Car, Lattitude and Longtitude (column names as spelled in the dataset). Let's first look at Lattitude and Longtitude.

In [0]:
# Select the rows in which Lattitude is NaN.
lattitude_missing = np.isnan(df['Lattitude'])
df.loc[lattitude_missing]

In [0]:
# Summary statistics of the two coordinate columns.
coordinate_columns = ['Lattitude', 'Longtitude']
df[coordinate_columns].describe()

Since the standard deviation is very small, we can impute these missing values with the mean of the corresponding column.

In [0]:
# Fill the NaNs of each coordinate column with that column's own mean.
for coord in ('Lattitude', 'Longtitude'):
    coord_mean = df[coord].mean()
    df.loc[np.isnan(df[coord]), [coord]] = coord_mean

# Percentage of missing entries per column, rounded to two decimals.
df.isnull().sum().div(len(df.index)).mul(100).round(2)

Bathroom and Car still have missing values (0.01% and 0.46% respectively). Let's look at their statistics.

In [0]:
# Summary statistics of the two remaining columns that contain NaNs.
count_columns = ['Bathroom', 'Car']
df[count_columns].describe()

These two are integer-type variables and thus have values 0, 1, 2, etc. You cannot impute the NaNs by mean or median. Thus, you need to impute them by the mode - the most commonly occurring value.

In [0]:
# Re-type Car as a pandas 'category' column.
car_as_category = df['Car'].astype('category')
df['Car'] = car_as_category

# Frequency of each category, most common first.
df['Car'].value_counts()

In [0]:
# Impute the NaNs in Car with the most common value (the mode). The mode is
# computed from the data rather than hard-coded from a previous run's output
# (it was 2 when this notebook was written).
car_mode = df['Car'].mode().iloc[0]

# imputing NaNs by the mode
df.loc[pd.isnull(df['Car']), ['Car']] = car_mode

# Summing up the missing values (column-wise) %
round(100*(df.isnull().sum()/len(df.index)),2)

In [0]:
# Same treatment for Bathroom: re-type it as a 'category' column.
bathroom_as_category = df['Bathroom'].astype('category')
df['Bathroom'] = bathroom_as_category

# Frequency of each category, most common first.
df['Bathroom'].value_counts()

In [0]:
# Impute the NaNs in Bathroom with the most common value (the mode). The mode
# is computed from the data rather than hard-coded from a previous run's
# output (it was 1 when this notebook was written).
bathroom_mode = df['Bathroom'].mode().iloc[0]
df.loc[pd.isnull(df['Bathroom']),['Bathroom']] = bathroom_mode

# Summing up the missing values (column-wise) %
round(100*(df.isnull().sum()/len(df.index)),2)

There are no missing values now. Finally, let's look at how many rows we have lost in the process, apart from the 3 dropped columns (originally we had 23547 rows):

In [0]:
# (rows, columns) remaining after cleaning.
(len(df.index), len(df.columns))

In [0]:
# Row count stated for the dataset before cleaning.
original_row_count = 23547

# Percentage of the original rows that are still present.
retained_fraction = len(df.index) / original_row_count
100 * retained_fraction

Thus we have lost about 42% of observations in cleaning the missing values. 