# Explore and wrangling open data

## Import library

In [1]:
#import library
import pandas as pd
import requests
import re
import codecs
import urllib.request
import html
import os

## Examining file and load data

In [2]:
#open two files and have a look first
#this is the csv file about all endangered animals in Australia
endangered = pd.read_csv('20200402spcs.csv')
endangered

Unnamed: 0,Scientific Name,Common Name,Current Scientific Name,Threatened status,ACT,NSW,NT,QLD,SA,TAS,...,Profile,Date extracted,NSL Name,Family,Genus,Species,Infraspecific Rank,Infraspecies,Species Author,Infraspecies Author
0,Neophoca cinerea,"Australian Sea-lion, Australian Sea Lion",-,Vulnerable,-,-,-,-,Yes,-,...,http://www.environment.gov.au/cgi-bin/sprat/pu...,2020-Apr-02,-,Otariidae,Neophoca,cinerea,-,-,"(Peron,1816)",-
1,Mirounga leonina,Southern Elephant Seal,-,Vulnerable,-,-,-,-,Yes,Yes,...,http://www.environment.gov.au/cgi-bin/sprat/pu...,2020-Apr-02,-,Phocidae,Mirounga,leonina,-,-,"(Linnaeus,1758)",-
2,Balaenoptera borealis,Sei Whale,-,Vulnerable,-,Yes,-,Yes,Yes,Yes,...,http://www.environment.gov.au/cgi-bin/sprat/pu...,2020-Apr-02,-,Balaenopteridae,Balaenoptera,borealis,-,-,"Lesson, 1828",-
3,Balaenoptera musculus,Blue Whale,-,Endangered,-,Yes,Yes,Yes,Yes,Yes,...,http://www.environment.gov.au/cgi-bin/sprat/pu...,2020-Apr-02,-,Balaenopteridae,Balaenoptera,musculus,-,-,"(Linnaeus, 1758)",-
4,Balaenoptera physalus,Fin Whale,-,Vulnerable,-,Yes,-,Yes,Yes,Yes,...,http://www.environment.gov.au/cgi-bin/sprat/pu...,2020-Apr-02,-,Balaenopteridae,Balaenoptera,physalus,-,-,"(Linnaeus, 1758)",-
5,Megaptera novaeangliae,Humpback Whale,-,Vulnerable,-,Yes,Yes,Yes,Yes,Yes,...,http://www.environment.gov.au/cgi-bin/sprat/pu...,2020-Apr-02,-,Balaenopteridae,Megaptera,novaeangliae,-,-,"(Borowski, 1781)",-
6,Eubalaena australis,Southern Right Whale,-,Endangered,-,Yes,-,Yes,Yes,Yes,...,http://www.environment.gov.au/cgi-bin/sprat/pu...,2020-Apr-02,-,Balaenidae,Eubalaena,australis,-,-,"(Desmoulins, 1822)",-
7,Xeromys myoides,"Water Mouse, False Water Rat, Yirrkoo",-,Vulnerable,-,Yes,Yes,Yes,-,-,...,http://www.environment.gov.au/cgi-bin/sprat/pu...,2020-Apr-02,-,Muridae,Xeromys,myoides,-,-,"Thomas, 1889",-
8,Zyzomys pedunculatus,"Central Rock-rat, Antina",-,Critically Endangered,-,-,Yes,-,-,-,...,http://www.environment.gov.au/cgi-bin/sprat/pu...,2020-Apr-02,-,Muridae,Zyzomys,pedunculatus,-,-,"(Waite,1896)",-
9,Pseudomys shortridgei,"Heath Mouse, Dayang, Heath Rat",-,Endangered,-,-,-,-,Yes,-,...,http://www.environment.gov.au/cgi-bin/sprat/pu...,2020-Apr-02,-,Muridae,Pseudomys,shortridgei,-,-,"(Thomas, 1907)",-


In [3]:
# this is the excel file about endangered animals which are affected by bushfire in Australia
bushfire = pd.read_excel('bushfire open data.xlsx',sheet_name = 'Protected Species',header = 46)
bushfire

Unnamed: 0,SPRAT ID,Scientific Name,Scientific Name with links to Species Profile and Threats (SPRAT) database for map and conservation documents,Common Name,Percentage of the species modelled likely and known distribution within fire affected areas,Type,EPBC Act listed Threatened Status,EPBC Act listed Migratory Status,Range states and territories
0,55562,Acacia awestoniana,Acacia awestoniana,Stirling Range Wattle,≥80%,Plant,Vulnerable,,WA
1,10798,Acacia constablei,Acacia constablei,Narrabarba Wattle,≥80%,Plant,Vulnerable,,NSW
2,16916,Andersonia axilliflora,Andersonia axilliflora,Giant Andersonia,≥80%,Plant,Endangered,,WA
3,64838,Baeckea kandos,Baeckea kandos,a shrub,≥80%,Plant,Endangered,,NSW
4,89125,Bertmainius colonus,Bertmainius colonus,Eastern Stirling Range Pygmy Trapdoor Spider,≥80%,Spider,Vulnerable,,WA
5,55850,Budawangia gnidioides,Budawangia gnidioides,Budawangs Cliff-heath,≥80%,Plant,Vulnerable,,NSW
6,56501,Callistemon forresterae,Callistemon forresterae,Forrester's Bottlebrush,≥80%,Plant,Vulnerable,,"NSW, Vic"
7,64862,Callistemon kenmorrisonii,Callistemon kenmorrisonii,Betka Bottlebrush,≥80%,Plant,Vulnerable,,Vic
8,15694,Darwinia squarrosa,Darwinia squarrosa,"Fringed Mountain Bell, Pink Mountain Bell",≥80%,Plant,Vulnerable,,WA
9,76351,Eidothea hardeniana,Eidothea hardeniana,Nightcap Oak,≥80%,Plant,Critically Endangered,,NSW


## Drop columns
* Drop the colmuns we don't need, such as, Current Scientific Name.

In [4]:
# drop the columns which doesn't need in two files
# we only need Name, Threatened status, state location, Kingdom, Class and Profile
df1 = endangered.drop(columns = ['Current Scientific Name',
                                 'Current SPRAT TaxonID',
                                 'Date extracted',
                                 'NSL Name',
                                 'Infraspecific Rank',
                                 'Infraspecies',
                                 'Species Author',
                                 'Infraspecies Author',
                                ], axis=1)
df1

Unnamed: 0,Scientific Name,Common Name,Threatened status,ACT,NSW,NT,QLD,SA,TAS,VIC,...,HMI,AAT,CMA,Listed SPRAT TaxonID,Kingdom,Class,Profile,Family,Genus,Species
0,Neophoca cinerea,"Australian Sea-lion, Australian Sea Lion",Vulnerable,-,-,-,-,Yes,-,-,...,-,-,Yes,22,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Otariidae,Neophoca,cinerea
1,Mirounga leonina,Southern Elephant Seal,Vulnerable,-,-,-,-,Yes,Yes,-,...,Yes,Yes,Yes,26,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Phocidae,Mirounga,leonina
2,Balaenoptera borealis,Sei Whale,Vulnerable,-,Yes,-,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,34,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,borealis
3,Balaenoptera musculus,Blue Whale,Endangered,-,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,36,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,musculus
4,Balaenoptera physalus,Fin Whale,Vulnerable,-,Yes,-,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,37,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,physalus
5,Megaptera novaeangliae,Humpback Whale,Vulnerable,-,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,38,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Megaptera,novaeangliae
6,Eubalaena australis,Southern Right Whale,Endangered,-,Yes,-,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,40,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenidae,Eubalaena,australis
7,Xeromys myoides,"Water Mouse, False Water Rat, Yirrkoo",Vulnerable,-,Yes,Yes,Yes,-,-,-,...,-,-,-,66,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Xeromys,myoides
8,Zyzomys pedunculatus,"Central Rock-rat, Antina",Critically Endangered,-,-,Yes,-,-,-,-,...,-,-,-,68,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Zyzomys,pedunculatus
9,Pseudomys shortridgei,"Heath Mouse, Dayang, Heath Rat",Endangered,-,-,-,-,Yes,-,Yes,...,-,-,-,77,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,shortridgei


## Filter endangered wildlife affected by bushfire data

* Since two files both have SPRAT TaxonID that is the SPRAT species identifier under which the species is listed on the EPBC threatened species list. We can use this as the index to find endangered animals which infected by bushfire

In [5]:
# filter all endangered animals by SPRAT ID in bushfire endangered animals
# extract SPRAT_ID from bushfire endangered animals data
SPRAT_ID = bushfire['SPRAT ID']

In [6]:
SPRAT_ID.iloc[['330']]

330    4146
Name: SPRAT ID, dtype: int64

In [7]:
# filter endangered animals affected by bushfire
final_data = pd.DataFrame(columns=['Scientific Name','Common Name',
                                  'Threatened status','ACT',
                                  'NSW','NT','QLD','SA',
                                  'TAS','VIC','WA','ACI','CKI','CI','CSI','JBT','NFI',
                                   'HMI','AAT','CMA','Listed SPRAT TaxonID',
                                   'Kingdom','Class','Profile',                                   'Family','Genus','Species'])
#find SPRAT_ID in Listed SPRAT TaxonID column in df1 file and add them into a list
n = 0
for j in SPRAT_ID:
    for i, row in df1.iterrows():
        if j == row['Listed SPRAT TaxonID']:
            n += 1
            dict1 = {}
            values = df1.ix[i].values
            dict1['Scientific Name'] = values[0]
            dict1['Common Name'] = values[1]
            dict1['Threatened status'] = values[2]
            dict1['ACT'] = values[3]
            dict1['NSW'] = values[4]
            dict1['NT'] = values[5]
            dict1['QLD'] = values[6]
            dict1['SA'] = values[7]
            dict1['TAS'] = values[8]
            dict1['VIC'] = values[9]
            dict1['WA'] = values[10]
            dict1['ACI'] = values[11]
            dict1['CKI'] = values[12]
            dict1['CI'] = values[13]
            dict1['CSI'] = values[14]
            dict1['JBT'] = values[15]
            dict1['NFI'] = values[16]
            dict1['HMI'] = values[17]
            dict1['AAT'] = values[18]
            dict1['CMA'] = values[19]
            dict1['Listed SPRAT TaxonID'] = values[20]
            dict1['Kingdom'] = values[21]
            dict1['Class'] = values[22]
            dict1['Profile'] = values[23]
            dict1['Family'] = values[24]
            dict1['Genus'] = values[25]
            dict1['Species'] = values[26]
            final_data = final_data.append(dict1,ignore_index = True) #append all dictionary format data of one wildlife in final_data
print(n) #check how many wildlife we got

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


325


In [8]:
# check the result
final_data

Unnamed: 0,Scientific Name,Common Name,Threatened status,ACT,NSW,NT,QLD,SA,TAS,VIC,...,HMI,AAT,CMA,Listed SPRAT TaxonID,Kingdom,Class,Profile,Family,Genus,Species
0,Acacia awestoniana,Stirling Range Wattle,Vulnerable,-,-,-,-,-,-,-,...,-,-,-,55562,Plantae,Magnoliopsida,http://www.environment.gov.au/cgi-bin/sprat/pu...,Fabaceae,Acacia,awestoniana
1,Acacia constablei,Narrabarba Wattle,Vulnerable,-,Yes,-,-,-,-,-,...,-,-,-,10798,Plantae,Magnoliopsida,http://www.environment.gov.au/cgi-bin/sprat/pu...,Fabaceae,Acacia,constablei
2,Andersonia axilliflora,Giant Andersonia,Endangered,-,-,-,-,-,-,-,...,-,-,-,16916,Plantae,Magnoliopsida,http://www.environment.gov.au/cgi-bin/sprat/pu...,Epacridaceae,Andersonia,axilliflora
3,Baeckea kandos,a shrub,Endangered,-,Yes,-,-,-,-,-,...,-,-,-,64838,Plantae,Magnoliopsida,http://www.environment.gov.au/cgi-bin/sprat/pu...,Myrtaceae,Baeckea,kandos
4,Bertmainius colonus,Eastern Stirling Range Pygmy Trapdoor Spider,Vulnerable,-,-,-,-,-,-,-,...,-,-,-,89125,Animalia,Arachnida,http://www.environment.gov.au/cgi-bin/sprat/pu...,Migidae,Bertmainius,colonus
5,Budawangia gnidioides,Budawangs Cliff-heath,Vulnerable,-,Yes,-,-,-,-,-,...,-,-,-,55850,Plantae,Magnoliopsida,http://www.environment.gov.au/cgi-bin/sprat/pu...,Epacridaceae,Budawangia,gnidioides
6,Callistemon forresterae,Forrester's Bottlebrush,Vulnerable,-,Yes,-,-,-,-,Yes,...,-,-,-,56501,Plantae,Magnoliopsida,http://www.environment.gov.au/cgi-bin/sprat/pu...,Myrtaceae,Callistemon,forresterae
7,Callistemon kenmorrisonii,Betka Bottlebrush,Vulnerable,-,-,-,-,-,-,Yes,...,-,-,-,64862,Plantae,Magnoliopsida,http://www.environment.gov.au/cgi-bin/sprat/pu...,Myrtaceae,Callistemon,kenmorrisonii
8,Darwinia squarrosa,"Fringed Mountain Bell, Pink Mountain Bell",Vulnerable,-,-,-,-,-,-,-,...,-,-,-,15694,Plantae,Magnoliopsida,http://www.environment.gov.au/cgi-bin/sprat/pu...,Myrtaceae,Darwinia,squarrosa
9,Eidothea hardeniana,Nightcap Oak,Critically Endangered,-,Yes,-,-,-,-,-,...,-,-,-,76351,Plantae,Magnoliopsida,http://www.environment.gov.au/cgi-bin/sprat/pu...,Proteaceae,Eidothea,hardeniana


## Filter VIC endangered wildlife data

In [9]:
df1

Unnamed: 0,Scientific Name,Common Name,Threatened status,ACT,NSW,NT,QLD,SA,TAS,VIC,...,HMI,AAT,CMA,Listed SPRAT TaxonID,Kingdom,Class,Profile,Family,Genus,Species
0,Neophoca cinerea,"Australian Sea-lion, Australian Sea Lion",Vulnerable,-,-,-,-,Yes,-,-,...,-,-,Yes,22,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Otariidae,Neophoca,cinerea
1,Mirounga leonina,Southern Elephant Seal,Vulnerable,-,-,-,-,Yes,Yes,-,...,Yes,Yes,Yes,26,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Phocidae,Mirounga,leonina
2,Balaenoptera borealis,Sei Whale,Vulnerable,-,Yes,-,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,34,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,borealis
3,Balaenoptera musculus,Blue Whale,Endangered,-,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,36,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,musculus
4,Balaenoptera physalus,Fin Whale,Vulnerable,-,Yes,-,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,37,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,physalus
5,Megaptera novaeangliae,Humpback Whale,Vulnerable,-,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,38,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Megaptera,novaeangliae
6,Eubalaena australis,Southern Right Whale,Endangered,-,Yes,-,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,40,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenidae,Eubalaena,australis
7,Xeromys myoides,"Water Mouse, False Water Rat, Yirrkoo",Vulnerable,-,Yes,Yes,Yes,-,-,-,...,-,-,-,66,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Xeromys,myoides
8,Zyzomys pedunculatus,"Central Rock-rat, Antina",Critically Endangered,-,-,Yes,-,-,-,-,...,-,-,-,68,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Zyzomys,pedunculatus
9,Pseudomys shortridgei,"Heath Mouse, Dayang, Heath Rat",Endangered,-,-,-,-,Yes,-,Yes,...,-,-,-,77,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,shortridgei


In [10]:
#get all indexs of wildlife which are not in Victoria
indexs = []
for i,row in df1.iterrows():
    if row['VIC'] == '-':
        indexs.append(i)

In [11]:
#drop these rows
df1.drop(index = indexs, inplace = True)

In [12]:
#reset the number of rows
df1.reset_index(drop = True, inplace = True)

* We get 280 endangered wildlife based on location of Victoria

In [14]:
#drop useless location columns
df1 = df1.drop(columns = ['ACT',
                        'NSW',
                        'NT',
                        'QLD',
                        'SA',
                        'TAS',
                        'VIC',
                        'WA',
                        'ACI',
                        'CKI',
                        'CI',
                        'CSI',
                        'JBT',
                        'NFI',
                        'HMI',
                        'AAT',
                        'CMA'
                                ], axis=1)

In [15]:
#check final dataframe we got
df1

Unnamed: 0,Scientific Name,Common Name,Threatened status,Listed SPRAT TaxonID,Kingdom,Class,Profile,Family,Genus,Species
0,Balaenoptera borealis,Sei Whale,Vulnerable,34,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,borealis
1,Balaenoptera musculus,Blue Whale,Endangered,36,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,musculus
2,Balaenoptera physalus,Fin Whale,Vulnerable,37,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,physalus
3,Megaptera novaeangliae,Humpback Whale,Vulnerable,38,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Megaptera,novaeangliae
4,Eubalaena australis,Southern Right Whale,Endangered,40,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenidae,Eubalaena,australis
5,Pseudomys shortridgei,"Heath Mouse, Dayang, Heath Rat",Endangered,77,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,shortridgei
6,Pseudomys fumeus,"Smoky Mouse, Konoom",Endangered,88,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,fumeus
7,Pseudomys gouldii,"Gould's Mouse, Koontin",Extinct,89,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,gouldii
8,Pseudomys novaehollandiae,"New Holland Mouse, Pookila",Vulnerable,96,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,novaehollandiae
9,Conilurus albipes,"White-footed Rabbit-rat, Parroo, White-footed ...",Extinct,131,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Conilurus,albipes


* We get 280 endangered wildlife in Victoria

## Extract profile data from URL

In [89]:
# a function to extract description part from content of html
# first it extract from description tag and Australian Distribution tag
# second it extract each paragraph of description and combine them together
def extract_description(self):
    description_part_pattern = r"(<a name=\"description\">[\w|\W]*?</p>([\w|\W]*)Australian Distribution)"
    try:
        matched = re.search(description_part_pattern, self)
        matched = html.unescape(matched)
        Details = ''
        description_pattern = r"(<p>[\w|\W]*</p>)"
        description = re.search(description_pattern, matched.group(0))
        List1 = re.split("<p>|</p>|\n",description.group(0))
        for i in List1:
            if i != '':
                Details = Details+i
        return Details
    except:
        return ''

In [46]:
# extract description from URL
urls = []
for i in df1['Profile'].__iter__():
    urls.append(i)

In [47]:
# we have 280 profile urls
len(urls)

280

In [166]:
# we extract description part if it has that part, otherwise add empty string and pass
Description = []
for url in urls:
    try:
        r = requests.get(url, allow_redirects=True)
        inputdata = bytes.decode(r.content)
        result = extract_description(inputdata)
        Description.append(result)
    except:
        Description.append('')
        print(len(Description))
        pass
# print(len(Description))

44
47
48
97
98
152
187
196
197
220
222
240
257
263


In [167]:
# we have the correct number of description
print(len(Description))

280


In [169]:
# store unnormal rows data unnormal = [44,47,48,97,98,152,187,196,197,220,222,240,257,263]
# add description column in dataframe
df1["Description"] = Description
df1

Unnamed: 0,Scientific Name,Common Name,Threatened status,Listed SPRAT TaxonID,Kingdom,Class,Profile,Family,Genus,Species,Description
0,Balaenoptera borealis,Sei Whale,Vulnerable,34,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,borealis,Sei whales are dark grey or blue-grey on their...
1,Balaenoptera musculus,Blue Whale,Endangered,36,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,musculus,Both the Antarctic blue whale and pygmy blue w...
2,Balaenoptera physalus,Fin Whale,Vulnerable,37,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,physalus,Fin whales are dark grey to brownish black dor...
3,Megaptera novaeangliae,Humpback Whale,Vulnerable,38,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Megaptera,novaeangliae,The humpback whale is a moderately large balee...
4,Eubalaena australis,Southern Right Whale,Endangered,40,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenidae,Eubalaena,australis,The southern right whale&nbsp;is a&nbsp;large ...
5,Pseudomys shortridgei,"Heath Mouse, Dayang, Heath Rat",Endangered,77,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,shortridgei,
6,Pseudomys fumeus,"Smoky Mouse, Konoom",Endangered,88,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,fumeus,The Konoom is an Australian native rodent with...
7,Pseudomys gouldii,"Gould's Mouse, Koontin",Extinct,89,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,gouldii,
8,Pseudomys novaehollandiae,"New Holland Mouse, Pookila",Vulnerable,96,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,novaehollandiae,"A small, burrowing native rodent, the New Holl..."
9,Conilurus albipes,"White-footed Rabbit-rat, Parroo, White-footed ...",Extinct,131,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Conilurus,albipes,The White-footed Rabbit-rat was an attractive ...


In [164]:
len(Description)

269

In [125]:
f1=open("test.txt",'r',encoding = "cp1252").read()
inputdata

['b\'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\\r\\n    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\\r\\n<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">\\r\\n\\r\\n<!-- \\r\\n$Revision: 5.19 $\\r\\nThis file is under software revision control. \\r\\nDo not edit in-situ.  All changes WILL be lost!\\r\\n-->\\r\\n\\r\\n<head>\\r\\n<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />\\r\\n<title>Phoebetria fusca &mdash; Sooty Albatross</title>\\r\\n<link rel="schema.AGLS" href="http://www.naa.gov.au/recordkeeping/gov_online/agls/1.2" />\\r\\n<meta name="DC.Identifier" scheme="URI" content="http://www.environment.gov.au/cgi-bin/sprat/public/publicspecies.pl" />\\r\\n<meta name="DC.Title" lang="en" xml:lang="en" content="Phoebetria fusca &mdash; Sooty Albatross" />\\r\\n<meta name="DC.Creator" scheme="AglsAgent" content="jurisdiction=Commonwealth of Australia; corporateName=Department of the Environment" />\\r\\n<m

In [20]:
# First description of first animal

Sei whales are dark grey or blue-grey on their back and sides. The undersides and sides may appear mottled with light coloured circular scars caused by various types of parasites, including scars from the bite of the 'cookie-cutter' shark (<i>Isistius brasiliensis</i>) (Aguilar 2002).At sexual maturity, sei whales are approximately 12&ndash;16 m long, although they can reach lengths of 17.7 m in males and 21 m in females (Gambell 1985). Adult females are about 0.5&ndash;0.6 m longer than males, and sei whales of the Southern Hemisphere are larger than those of the Northern Hemisphere (Horwood 1987). The body of the sei whale is slim, streamlined and laterally compressed in the caudal (hind) region.Sounds of the sei whale consist of a series of short pulses with peak energy in the 1.5&ndash;3.5 kHz range (Richardson et al. 1995). During a recent Southern Ocean Global Ocean Ecosystems Dynamics (SO-GLOBEC) cruise, acoustic recordings accompanied by photographs and acoustic tracking of a g

## Generate CSV file

In [176]:
# write dataframe into csv file
df1.to_csv('VIC_Endangered_Wildlife.csv',index=False)

## Generate JSON file

In [187]:
# read csv file we create and check empty description
readfile = pd.read_csv('VIC_Endangered_Wildlife.csv',encoding = "cp1252")
readfile['Description'].isnull().sum()

152

In [279]:
# read updated csv file we create and check empty description we get less empth description
readfile = pd.read_csv('VIC_Endangered_2.csv',encoding = "cp1252")
readfile['Description'].isnull().sum()

108

In [280]:
# delete unrecognised html tag manully in excel
# replace "&ndash;" by -
# replace "&quot;" by "
# replace '&amp;'	 or '&#38;' by &
# replace '&lt;' by <
# replace '&nbsp;' by space
# replace '&mdash;' by —
# delete <li> and </i>
# delete <br/>
# delete <b> and </b>

In [281]:
#readfile = pd.read_csv('VIC_backup.csv',encoding = "cp1252")

In [282]:
#generate JSON file and read it to check
readfile.to_json(r'Victoria endangered wildlife.json',orient='records')
readfile

Unnamed: 0,Scientific Name,Common Name,Threatened status,Listed SPRAT TaxonID,Kingdom,Class,Profile,Family,Genus,Species,Description,Image
0,Balaenoptera borealis,Sei Whale,Vulnerable,34,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,borealis,Sei whales are dark grey or blue-grey on their...,https://en.wikipedia.org/wiki/Sei_whale#/media...
1,Balaenoptera musculus,Blue Whale,Endangered,36,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,musculus,Both the Antarctic blue whale and pygmy blue w...,https://en.wikipedia.org/wiki/File:Anim1754_-_...
2,Balaenoptera physalus,Fin Whale,Vulnerable,37,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Balaenoptera,physalus,Fin whales are dark grey to brownish black dor...,https://en.wikipedia.org/wiki/File:Finhval_(1)...
3,Megaptera novaeangliae,Humpback Whale,Vulnerable,38,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenopteridae,Megaptera,novaeangliae,The humpback whale is a moderately large balee...,https://en.wikipedia.org/wiki/File:Humpback_Wh...
4,Eubalaena australis,Southern Right Whale,Endangered,40,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Balaenidae,Eubalaena,australis,The southern right whaleÊis aÊlarge baleen wha...,https://en.wikipedia.org/wiki/Southern_right_w...
5,Pseudomys shortridgei,"Heath Mouse, Dayang, Heath Rat",Endangered,77,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,shortridgei,The heath mouse occurs in species-rich and str...,
6,Pseudomys fumeus,"Smoky Mouse, Konoom",Endangered,88,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,fumeus,The Konoom is an Australian native rodent with...,
7,Pseudomys gouldii,"Gould's Mouse, Koontin",Extinct,89,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,gouldii,,https://en.wikipedia.org/wiki/Gould%27s_mouse#...
8,Pseudomys novaehollandiae,"New Holland Mouse, Pookila",Vulnerable,96,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Pseudomys,novaehollandiae,"A small, burrowing native rodent, the New Holl...",https://en.wikipedia.org/wiki/New_Holland_mous...
9,Conilurus albipes,"White-footed Rabbit-rat, Parroo, White-footed ...",Extinct,131,Animalia,Mammalia,http://www.environment.gov.au/cgi-bin/sprat/pu...,Muridae,Conilurus,albipes,The White-footed Rabbit-rat was an attractive ...,https://en.wikipedia.org/wiki/White-footed_rab...


In [178]:
# read file
with open('Victoria endangered wildlife.json', 'r') as myfile:
    data=myfile.read()

# parse file
obj = json.loads(data)

In [181]:
print(type(obj))

<class 'list'>


In [None]:
from  bs4 import  BeautifulSoup