In [1]:
# Clean dataset, but keep Dead/Dormant Individuals in the population
import numpy as np
import pandas as pd
import seaborn as sns
# Load the raw observations from the Excel workbook into the working frame
Trill = pd.read_excel(io='Trillium.xlsx')

In [2]:
# Remove unnecessary columns:
# the columns listed here will not be used for the project
unusedColumns = ['Site', 'Plot', 'subID', 'Tag', 'ID', 'Dormancy',
                 'Species', 'Species2']
Trill = Trill.drop(unusedColumns, axis='columns')

In [3]:
# Turn the textual placeholder 'na' into a real missing value (NaN)
Trill = Trill.replace('na', np.nan)

# Discard rows whose Length05 entry is the text 'chewed' instead of a number
notChewed = Trill['Length05'].ne('chewed')
Trill = Trill.loc[notChewed]

# Change datatypes as needed
dataTypes = {
    'Habitat': 'category',
    'Trans0304': 'category',
    'Trans0405': 'category',
    'Length05': 'float64',
}
Trill = Trill.astype(dataTypes)

In [4]:
# Next, perform additional cleaning on the data set by filling in missing
# values and correcting certain labels.

numericData = ['Length','Width','LeafArea']
years = ['03','04','05']
stages = ['flowering','nonflowering','oneleaf']

# Correct any missing stage data that can be determined necessarily.
# Each consecutive pair of years is processed in turn. The three assignments
# are order-dependent: the second one reads the 'seedling' labels written by
# the first.
for prevYear, year in zip(years, years[1:]):
    stageName = 'Stage' + year
    prevStage = 'Stage' + prevYear
    transStage = 'Trans' + prevYear + year

    isOneLeaf = Trill[stageName] == 'oneleaf'
    # one-leaf plant with no stage recorded the year before -> seedling then
    Trill.loc[isOneLeaf & Trill[prevStage].isna(), prevStage] = 'seedling'
    # seedling followed by oneleaf is labelled as a progression
    Trill.loc[isOneLeaf & (Trill[prevStage] == 'seedling'),
              transStage] = 'progression'
    # a seedling this year is labelled 'not emerged' the year before
    Trill.loc[Trill[stageName] == 'seedling', prevStage] = 'not emerged'

# Fill in seedling data with unit measurements, absent plants with zero,
# and other missing data with mean measurements.
for year in years:
    stageName = 'Stage' + year
    for measure in numericData:
        colName = measure + year
        Trill.loc[Trill[stageName] == 'seedling', colName] = 1.0
        Trill.loc[Trill[stageName].isin(['not emerged', 'eaten']),
                  colName] = 0.0
        for stageType in stages:
            inStage = Trill[stageName] == stageType
            # Impute with the mean of the SAME stage. The whole-column mean
            # used previously mixed all stages together and was dragged down
            # by the 1.0 / 0.0 placeholder sizes written just above.
            # If the stage has no observed value the mean is NaN and the
            # gaps are simply left missing.
            stageMean = Trill.loc[inStage, colName].mean()
            Trill.loc[inStage & Trill[colName].isna(), colName] = stageMean

# TODO(review): a loop headed "give dead/dormant individuals 0 size" used to
# sit here, but its body only built two column names and never assigned
# anything, so it has been removed. If dead/dormant plants need an explicit
# zero size beyond the 'not emerged' / 'eaten' handling above, add that rule.

# Create a new variable for eaten plants, then change the stage of eaten
# plants to 'not emerged'.
for year in years:
    stageName = 'Stage' + year
    wasEaten = Trill[stageName] == 'eaten'
    # stored as 1.0 / 0.0 (float64); rows with a missing stage get 0.0
    Trill['isEaten' + year] = wasEaten.astype('float64')
    Trill.loc[wasEaten, stageName] = 'not emerged'

# Correct final datatypes
moreDataTypes = {'Stage03':'category','Stage04':'category',
                 'Stage05':'category'}
Trill = Trill.astype(moreDataTypes)
    
    

In [5]:
# Sanity check: print the frame's dimensions, then show each column's dtype
frameShape = Trill.shape
print("Shape of data frame: ", frameShape)
Trill.dtypes

Shape of data frame:  (1401, 18)


Habitat       category
Stage03       category
Stage04       category
Stage05       category
Trans0304     category
Trans0405     category
Length03       float64
Width03        float64
LeafArea03     float64
Length04       float64
Width04        float64
LeafArea04     float64
Length05       float64
Width05        float64
LeafArea05     float64
isEaten03      float64
isEaten04      float64
isEaten05      float64
dtype: object

In [6]:
# Next build training, validation, and test sets.
# The split between training and test data is made in time:
#   03 data will be used to predict 04 stage (training)
#   04 data will be used to predict 05 stage
from sklearn.model_selection import train_test_split

# Fixed seed for the random Test/Validation split, so that a fresh
# Restart & Run All writes the same rows to the CSV files every time.
SPLIT_SEED = 42

Train = Trill[['Habitat','Stage03','Stage04','Trans0304',
               'Length03','Width03','LeafArea03']].copy()

Test = Trill[['Habitat','Stage04','Stage05','Trans0405',
              'Length04','Width04','LeafArea04']].copy()

# Remove missing data that cannot be filled in
Train = Train.dropna(subset = ['Length03','Width03','LeafArea03','Stage03',
                               'Stage04'])

Test = Test.dropna(subset = ['Length04','Width04','LeafArea04',
                             'Stage04','Stage05'])

# Now, split test data into 2 separate dataframes: validation and test data.
# We will have the validation data be 30% of the test set.
Test, Validation = train_test_split(Test, test_size=0.3,
                                    random_state=SPLIT_SEED)

# Next, let's check the sizes of each dataframe
print("Train dataframe:", Train.shape)
print("Test dataframe: ", Test.shape)
print("Validation dataframe: ",Validation.shape)

Train dataframe: (1152, 7)
Test dataframe:  (939, 7)
Validation dataframe:  (403, 7)


In [7]:
# Finally, write each split to its own CSV (without the row index)
# so the data can be used later
for csvName, frame in (('Train.csv', Train),
                       ('Test.csv', Test),
                       ('Validation.csv', Validation)):
    frame.to_csv(csvName, index=False)