# Classifier for Noxious Plant Species in North America

* Student names: Natasha Kacoroski, Jacob Crabb
* Student pace: full time
* Scheduled project review date/time: 
* Instructor name: Miles Erickson, Greg Damico


## Load Libraries and Data

In [1]:
# Import necessary libraries
from sklearn_pandas import DataFrameMapper, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np

In [2]:
# Import data. Manually removed backslash symbols while troubleshooting data import (write function?)

plant_data = pd.read_csv("plants.csv", skiprows=9, low_memory=False)
plant_data.head()

Unnamed: 0,"outl0strokewidth0 strokec2 ""Accepted Symbol""",Synonym Symbol,Scientific Name,Common Name,Category,Duration,Growth Habit,Native Status,State Noxious Status,Active Growth Period,...,Propogated by Cuttings,Propogated by Seed,Propogated by Sod,Propogated by Sprigs,Propogated by Tubers,Seeds per Pound,Seed Spread Rate,Seedling Vigor,Small Grain,Vegetative Spread Rate
0,ABELI,,Abelia,abelia,Dicot,,,L48(I),,,...,,,,,,,,,,
1,ABGR4,,Abelia 'd7grandiflora,glossy abelia,Dicot,Perennial,Shrub,L48(I),,"Spring, Summer, Fall",...,Yes,No,No,No,No,,,,No,
2,ABELM,,Abelmoschus,okra,Dicot,,,L48(I),,,...,,,,,,,,,,
3,ABES,,Abelmoschus esculentus,okra,Dicot,"Annual, Perennial","Subshrub, Forb/herb",L48(I)PR(I)VI(I),,,...,,,,,,,,,,
4,ABIES,,Abies,fir,Gymnosperm,,,"L48(I,N)CAN(N)SPM(N)",,,...,,,,,,,,,,


In [3]:
plant_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38186 entries, 0 to 38185
Data columns (total 78 columns):
outl0strokewidth0 strokec2 "Accepted Symbol"    38186 non-null object
Synonym Symbol                                  66 non-null object
Scientific Name                                 38185 non-null object
Common Name                                     36115 non-null object
Category                                        38046 non-null object
Duration                                        27808 non-null object
Growth Habit                                    34269 non-null object
Native Status                                   38093 non-null object
State Noxious Status                            461 non-null object
Active Growth Period                            2027 non-null object
After Harvest Regrowth Rate                     1039 non-null object
Bloat                                           2063 non-null object
C:N Ratio                                       1840 non-nu

Column descriptions are available at https://plants.usda.gov/charinfo.html.

## Clean Data

Many of the characteristic columns are populated for the same 2,063 plants, so subset the data to those 2,063 rows for the classification model.

In [4]:
# Keep only rows with a 'Bloat' value (per the author, the first column with
# 2,063 entries), then renumber the remaining rows from 0.
plant_data = plant_data.dropna(subset=['Bloat']).reset_index(drop=True)

The `State Noxious Status` column contains the target data. Add an `invasive` column where 1 means the plant is listed as noxious and 0 means it is not.

In [5]:
# Binary target: 1 where 'State Noxious Status' has a value, 0 where it is missing.
has_noxious_status = plant_data['State Noxious Status'].notna()
plant_data['invasive'] = has_noxious_status.astype('int')

Define helper functions for selecting feature columns by dtype and for reducing a comma-separated entry to its first listed value.

In [6]:
def select_features(dataframe, dtype, exclude_list):
    """Return the names of the ``dtype`` columns of ``dataframe`` to use as features.

    Columns are taken in their original order; any name found in
    ``exclude_list`` is left out.
    """
    selected = []
    for column in dataframe.select_dtypes(include=dtype).columns.values:
        if column in exclude_list:
            continue
        selected.append(column)
    return selected

def common_value(x):
    """Return the first comma-separated entry of ``x``.

    Multi-valued cells such as ``"Tree, Shrub"`` are reduced to the value that
    is listed first (``"Tree"``). Note that this is the first listed value,
    not a frequency-based "most common" one.

    Parameters
    ----------
    x : str or missing value
        A single cell value.

    Returns
    -------
    The text before the first comma when ``x`` is a string (all of ``x`` when
    it contains no comma). Non-string input, e.g. NaN or None for a missing
    cell, is returned unchanged instead of raising ``AttributeError``.
    """
    if not isinstance(x, str):
        # Missing cells (NaN/None) have nothing to split; hand them back untouched.
        return x
    return x.split(',')[0]
        

In [7]:
gh = plant_data['Growth Habit']
gh.value_counts()

Forb/herb                           554
Graminoid                           490
Tree                                227
Tree, Shrub                         212
Shrub                               173
Subshrub, Forb/herb                  76
Subshrub, Shrub                      58
Shrub, Tree                          50
Subshrub                             33
Vine, Forb/herb                      20
Vine                                 20
Forb/herb, Subshrub                  18
Shrub, Subshrub                      16
Forb/herb, Vine                      10
Subshrub, Shrub, Forb/herb            9
Shrub, Vine                           5
Vine, Subshrub                        4
Subshrub, Shrub, Graminoid            3
Tree, Shrub, Vine                     3
Subshrub, Forb/herb, Shrub            2
Shrub, Forb/herb, Subshrub            2
Tree, Subshrub, Shrub                 2
Graminoid, Shrub, Vine                1
Subshrub, Shrub, Forb/herb, Tree      1
Shrub, Subshrub, Forb/herb            1


In [8]:
def partition_stings(x):
    """Return the part of string ``x`` before its first comma.

    When ``x`` contains no comma, all of ``x`` is returned.
    """
    return x.split(',', 1)[0]

In [11]:
cat_df = plant_data[categorical_features]

In [12]:
# Exploratory categorical pipeline, applied in this order:
#   1. fill missing values with each column's most frequent value,
#   2. pass the values through common_value via FunctionTransformer,
#   3. one-hot encode the result.
categorical_steps = [
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('comval', FunctionTransformer(common_value)),
    ('onehot', OneHotEncoder(categories='auto')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [20]:
pd.DataFrame(categorical_transformer.fit_transform(cat_df).toarray())

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
215    0
216    0
217    0
218    0
219    0
220    0
221    0
222    0
223    0
224    0
225    0
226    0
227    0
228    0
229    0
230    0
231    0
232    0
233    0
234    0
235    0
236    0
237    0
238    0
239    0
240    0
241    0
242    0
243    0
244    0
Length: 245, dtype: int64

In [16]:
# Numeric features: every float64 column except the three excluded below.
numeric_exclude = ['Height at Base Age, Maximum (feet)','Planting Density per Acre, Minimum','Planting Density per Acre, Maximum']
numeric_features = select_features(plant_data, 'float64', numeric_exclude)

# Fill missing numeric values with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('scaler',  StandardScaler())])

# Categorical features: every object column except the ones listed here.
# 'State Noxious Status' must stay excluded because the 'invasive' target is
# derived from it.
categorical_exclude = ['outl0strokewidth0 strokec2 "Accepted Symbol"', 'Synonym Symbol', 'Scientific Name', 'Common Name',
                   'Native Status', 'State Noxious Status', 'After Harvest Regrowth Rate', 'Commercial Availability']
categorical_features = select_features(plant_data, 'object', categorical_exclude)
# NOTE(review): unlike the earlier exploratory pipeline, this one has no step
# reducing multi-valued entries to their first value - confirm that is intended.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    # handle_unknown='ignore': a category that appears only in the test split
    # must not make preprocessor.transform(X_test) raise.
    ('onehot', OneHotEncoder(categories='auto', handle_unknown='ignore'))
])

# Route each group of columns through its own pipeline.
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                               ('cat', categorical_transformer, categorical_features)])


In [17]:
# Separate the features from the binary 'invasive' target.
X = plant_data.drop('invasive', axis=1)
y = plant_data['invasive']

# Hold out 20% for testing. Stratify so both splits keep the same share of the
# rare positive class, and fix the seed (2, as used by the later SMOTE and
# random forest cells) so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2)

In [19]:
xtrain = preprocessor.fit_transform(X_train)

In [None]:
xtest = preprocessor.transform(X_test)

In [None]:
# from https://github.com/scikit-learn/scikit-learn/issues/12525
def _resolve_feature_names(transformer, raw_col_name):
    """Return the output names reported by ``transformer``, else ``raw_col_name``."""
    try:
        return transformer.get_feature_names()
    except AttributeError:  # no 'get_feature_names' function
        pass
    modern_getter = getattr(transformer, 'get_feature_names_out', None)
    if modern_getter is None:
        # The transformer reports no names at all: use the raw column name(s).
        return raw_col_name
    # Hand over the input column names only when they really are names, so the
    # reported output names stay tied to the original columns.
    is_name_list = (isinstance(raw_col_name, (list, tuple))
                    and all(isinstance(col, str) for col in raw_col_name))
    if is_name_list:
        return modern_getter(list(raw_col_name))
    return modern_getter()


def get_column_names_from_ColumnTransformer(column_transformer):
    """Return the output column names of a fitted ColumnTransformer.

    For every entry of ``column_transformer.transformers_`` the names come from
    the transformer itself (the last step when the transformer exposes
    ``steps``, as a Pipeline does): ``get_feature_names()`` when available,
    otherwise ``get_feature_names_out()``, otherwise the raw column name(s)
    the entry was given.

    Entries named ``'remainder'`` and entries whose transformer is the string
    ``'drop'`` contribute no names. The remainder entry is recognised by name
    rather than by position, so the last real transformer is not lost when
    ``transformers_`` holds no remainder entry.

    Parameters
    ----------
    column_transformer : fitted ColumnTransformer
        Any object exposing ``transformers_`` as a sequence of
        ``(name, transformer, columns)`` entries.

    Returns
    -------
    list
        The collected names, in the order of ``transformers_``.
    """
    col_name = []
    for entry in column_transformer.transformers_:
        name, transformer, raw_col_name = entry[0], entry[1], entry[2]
        if name == 'remainder':
            continue
        if isinstance(transformer, str) and transformer == 'drop':
            # Dropped columns produce no output, hence no names.
            continue
        steps = getattr(transformer, 'steps', None)
        if steps:
            # Pipeline-like: the final step decides the output names.
            transformer = steps[-1][1]
        names = _resolve_feature_names(transformer, raw_col_name)
        if isinstance(names, str):
            col_name.append(names)
        elif isinstance(names, np.ndarray):
            col_name += names.tolist()
        elif hasattr(names, '__iter__'):
            # list, tuple, pandas Index, ...
            col_name += list(names)
    return col_name

In [None]:
column_names = get_column_names_from_ColumnTransformer(preprocessor)
len(column_names)

In [None]:
xtest = preprocessor.transform(X_test)

In [None]:
# Build a labelled frame from the preprocessed training matrix. The original
# cell referenced an undefined name `x`; the fitted output is `xtrain`.
# Densify only when the output actually has a .toarray() method (sparse case).
xtrain_dense = xtrain.toarray() if hasattr(xtrain, 'toarray') else xtrain
Xtrain = pd.DataFrame(xtrain_dense, columns=column_names)

In [None]:
import matplotlib.pyplot as plt
# Fit PCA with all components on the preprocessed training data
pca = PCA().fit(Xtrain)
# Plot the running total of the explained variance ratio, to see how many
# components are needed to retain a given share of the variance
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')  # running total of the ratios, not a per-component percentage
plt.title('Plant Dataset Explained Variance')
plt.show()

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Reduce the preprocessed training features to 100 principal components.
pca = PCA(n_components=100)
X_train_pca = pca.fit_transform(Xtrain)
pca_df = pd.DataFrame(data=X_train_pca)

# Resample the reduced training data with SMOTE (seeded for reproducibility).
# NOTE(review): presumably this oversamples the rare 'invasive' class - confirm
# with y_res.value_counts().
sm = SMOTE(random_state=2)
X_res, y_res = sm.fit_resample(pca_df, y_train)

In [None]:
X_res.shape, y_res.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest (100 trees, depth capped at 10, fixed seed) on the
# resampled, PCA-reduced training data.
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=2)
rfc.fit(X_res, y_res)

In [None]:
train_pred = rfc.predict(X_res)

In [None]:
y_res.shape

In [None]:
train_pred.shape

In [None]:
rfc.score(X_res, y_res)

In [None]:
# pca.transform() needs the data to project; the original call passed nothing.
# NOTE(review): assumes the intent was to project the held-out test features
# into the PCA space fitted on the training data - confirm.
xtest_dense = xtest.toarray() if hasattr(xtest, 'toarray') else xtest
X_test_pca = pca.transform(pd.DataFrame(xtest_dense, columns=column_names))