# Cleaning Data

In [80]:
# importing packages
import pandas as pd
import matplotlib.pyplot as plt

In [81]:
# reading the raw data csv, dropped one data point 
dino_rawr_data = pd.read_csv('DinoData/data.csv').drop(['taxonomy', 'named_by'], axis = 1)
dino_rawr_data

Unnamed: 0,name,diet,period,lived_in,type,length,species,link
0,aardonyx,herbivorous,Early Jurassic 199-189 million years ago,South Africa,sauropod,8.0m,celestae,https://www.nhm.ac.uk/discover/dino-directory/...
1,abelisaurus,carnivorous,Late Cretaceous 74-70 million years ago,Argentina,large theropod,9.0m,comahuensis,https://www.nhm.ac.uk/discover/dino-directory/...
2,achelousaurus,herbivorous,Late Cretaceous 83-70 million years ago,USA,ceratopsian,6.0m,horneri,https://www.nhm.ac.uk/discover/dino-directory/...
3,achillobator,carnivorous,Late Cretaceous 99-84 million years ago,Mongolia,large theropod,5.0m,giganteus,https://www.nhm.ac.uk/discover/dino-directory/...
4,acrocanthosaurus,carnivorous,Early Cretaceous 115-105 million years ago,USA,large theropod,12.0m,atokensis,https://www.nhm.ac.uk/discover/dino-directory/...
...,...,...,...,...,...,...,...,...
304,yuanmousaurus,herbivorous,Mid Jurassic 180-159 million years ago,China,sauropod,17.0m,jiangyiensis,https://www.nhm.ac.uk/discover/dino-directory/...
305,yunnanosaurus,omnivorous,Early Jurassic 205-190 million years ago,China,sauropod,7.0m,huangi,https://www.nhm.ac.uk/discover/dino-directory/...
306,zalmoxes,herbivorous,Late Cretaceous 69 million years ago,Romania,euornithopod,3.0m,robustus,https://www.nhm.ac.uk/discover/dino-directory/...
307,zephyrosaurus,herbivorous,Early Cretaceous 120-110 million years ago,USA,euornithopod,1.8m,schaffi,https://www.nhm.ac.uk/discover/dino-directory/...


In [82]:
# removing all rows with NaN values
nan_index = [125]
for i in range(dino_rawr_data.shape[0]):
    if str(dino_rawr_data.get('length')[i]) == 'nan':
        nan_index.append(i)
dino_data = dino_rawr_data.drop(nan_index).reset_index().drop(['index'], axis = 1)
dino_data

Unnamed: 0,name,diet,period,lived_in,type,length,species,link
0,aardonyx,herbivorous,Early Jurassic 199-189 million years ago,South Africa,sauropod,8.0m,celestae,https://www.nhm.ac.uk/discover/dino-directory/...
1,abelisaurus,carnivorous,Late Cretaceous 74-70 million years ago,Argentina,large theropod,9.0m,comahuensis,https://www.nhm.ac.uk/discover/dino-directory/...
2,achelousaurus,herbivorous,Late Cretaceous 83-70 million years ago,USA,ceratopsian,6.0m,horneri,https://www.nhm.ac.uk/discover/dino-directory/...
3,achillobator,carnivorous,Late Cretaceous 99-84 million years ago,Mongolia,large theropod,5.0m,giganteus,https://www.nhm.ac.uk/discover/dino-directory/...
4,acrocanthosaurus,carnivorous,Early Cretaceous 115-105 million years ago,USA,large theropod,12.0m,atokensis,https://www.nhm.ac.uk/discover/dino-directory/...
...,...,...,...,...,...,...,...,...
285,yinlong,herbivorous,Mid Jurassic 159-154 million years ago,China,ceratopsian,1.2m,downsi,https://www.nhm.ac.uk/discover/dino-directory/...
286,yuanmousaurus,herbivorous,Mid Jurassic 180-159 million years ago,China,sauropod,17.0m,jiangyiensis,https://www.nhm.ac.uk/discover/dino-directory/...
287,yunnanosaurus,omnivorous,Early Jurassic 205-190 million years ago,China,sauropod,7.0m,huangi,https://www.nhm.ac.uk/discover/dino-directory/...
288,zalmoxes,herbivorous,Late Cretaceous 69 million years ago,Romania,euornithopod,3.0m,robustus,https://www.nhm.ac.uk/discover/dino-directory/...


In [101]:
# splitting the col. periods by ' ', to separate the period name (str), and the estimated existance time (int).
dino_data_years_split = dino_data.get('period').str.split(' ')
dino_data_years_split[97]

['Early', 'Cretaceous']

In [103]:
# initializing empty lists to be filled during the for-loop
dino_data_periods = [] # stores period name as str (eg. 'Early Cretaceous') in dino_data_periods
dino_data_years_str = [] # stores years as a str in the form ('a-b') in dino_data_years_str


for i in range(0, 290, 1):
    dino_data_periods.append(dino_data_years_split[i][0] + ' ' + dino_data_years_split[i][1])
    if len(dino_data_years_split[i]) >= 3:
        dino_data_years_str.append(dino_data_years_split[i][2])
    else:
        # for rows with only a period, and no specific time, p
        dino_data_years_str.append(0)
dino_data_years_str[97]

0

In [106]:
# initializing empty lists to be filled during the for-loop
dino_data_start_year = [] # stores start year as int 
dino_data_end_year = [] # stores end year as int
dino_data_span = [] #stores start_year-end_year as int

for i in range(0,290,1):
    if dino_data_years_str[i] == 0:
        dino_data_start_year.append(0)
        dino_data_end_year.append(0)        
    else:
        years_split = dino_data_years_str[i].split('-')
        start_year = int(years_split[0])
        dino_data_start_year.append(start_year)
        if len(years_split) > 1:
            end_year = int(years_split[1])
            dino_data_end_year.append(end_year)        
        else:
            end_year = 0
            dino_data_end_year.append(0)
    dino_data_span.append(start_year-end_year)
        
type(dino_data_start_year)

list

In [108]:
# adding the columns start year, end year, existance span
# dropping columns 'taxonomy' and 'named_by'
dino_data_clean = dino_data.assign(start_year = dino_data_start_year, \
                                           end_year = dino_data_end_year, \
                                           period = dino_data_periods, \
                                           existance_span = dino_data_span)
# re-naming columns
dino_data_clean.rename(columns = {'start_year':'start year (mil. years ago)'}, inplace = True)
dino_data_clean.rename(columns = {'end_year':'end year (mil. years ago)'}, inplace = True)
dino_data_clean.rename(columns = {'lived_in':'lived in'}, inplace = True)
dino_data_clean.rename(columns = {'existance_span':'existance span (mil. years)'}, inplace = True)
dino_data_clean

Unnamed: 0,name,diet,period,lived in,type,length,species,link,start year (mil. years ago),end year (mil. years ago),existance span (mil. years)
0,aardonyx,herbivorous,Early Jurassic,South Africa,sauropod,8.0m,celestae,https://www.nhm.ac.uk/discover/dino-directory/...,199,189,10
1,abelisaurus,carnivorous,Late Cretaceous,Argentina,large theropod,9.0m,comahuensis,https://www.nhm.ac.uk/discover/dino-directory/...,74,70,4
2,achelousaurus,herbivorous,Late Cretaceous,USA,ceratopsian,6.0m,horneri,https://www.nhm.ac.uk/discover/dino-directory/...,83,70,13
3,achillobator,carnivorous,Late Cretaceous,Mongolia,large theropod,5.0m,giganteus,https://www.nhm.ac.uk/discover/dino-directory/...,99,84,15
4,acrocanthosaurus,carnivorous,Early Cretaceous,USA,large theropod,12.0m,atokensis,https://www.nhm.ac.uk/discover/dino-directory/...,115,105,10
...,...,...,...,...,...,...,...,...,...,...,...
285,yinlong,herbivorous,Mid Jurassic,China,ceratopsian,1.2m,downsi,https://www.nhm.ac.uk/discover/dino-directory/...,159,154,5
286,yuanmousaurus,herbivorous,Mid Jurassic,China,sauropod,17.0m,jiangyiensis,https://www.nhm.ac.uk/discover/dino-directory/...,180,159,21
287,yunnanosaurus,omnivorous,Early Jurassic,China,sauropod,7.0m,huangi,https://www.nhm.ac.uk/discover/dino-directory/...,205,190,15
288,zalmoxes,herbivorous,Late Cretaceous,Romania,euornithopod,3.0m,robustus,https://www.nhm.ac.uk/discover/dino-directory/...,69,0,69


In [109]:
# save cleaned dataframe as csv
dino_data.to_csv('dino_data_cleaned.csv')

# visualizing data

## groupby different categories
grouping dino_data by different columns to see how many data points are in each group
we can use this to decide where we want to perform data analysis, as it would be pointless to analyze a group with only one single individual in it for trends

In [None]:
# prints the categories, and the number of unique counts in each group, so we can see here, 
# the different number of count when grouping different categories
print('species:', dino_data.groupby('species').count().get('index').unique())
print('types:', dino_data.groupby('type').count().get('index').unique())
print('period:', dino_data.groupby('period').count().get('index').unique())
print('diet:', dino_data.groupby('diet').count().get('index').unique())
print('lived in:', dino_data.groupby('lived in').count().get('index').unique())
print('existance span:', dino_data.groupby('existance span (mil. years)').count().get('index').unique())