In [20]:
import pandas as pd
import numpy as np
import json
import warnings
warnings.simplefilter('ignore')

# Data Preparation 

## Reading datasets

In [2]:
def read_csv(class_, data_dir='datasets', k_values=range(1, 8)):
    """Load the per-k CSV tables of one dataset.

    Reads ``<data_dir>/<class_>.Cleaned.k<k>.csv`` with ``pandas.read_csv``
    for every ``k`` in ``k_values``.

    Parameters
    ----------
    class_ : str
        Dataset name used as the file-name prefix (e.g. ``'Aves'``).
    data_dir : str, optional
        Directory holding the CSV files. Defaults to ``'datasets'``.
    k_values : iterable of int, optional
        The ``k`` suffixes to load, in order. Defaults to 1 through 7.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``k_values``, in the same order.

    Raises
    ------
    FileNotFoundError
        If one of the expected files does not exist.
    """
    # A forward slash is accepted as a path separator on every platform
    # Python supports, so no os.path import is needed here.
    return [
        pd.read_csv('{}/{}.Cleaned.k{}.csv'.format(data_dir, class_, k))
        for k in k_values
    ]

### Chiroptera Class: k = 1, 2, ..., 7

In [3]:
# Chiroptera tables for k = 1..7, in order (list index 0 holds k=1).
chirop_dfs = read_csv('Chiroptera')

### Rodentia Class: k = 1, 2, ..., 7

In [4]:
# Rodentia tables for k = 1..7, in order (list index 0 holds k=1).
rodent_dfs = read_csv('Rodentia')

### Aves Class: k = 1, 2, ..., 7

In [5]:
# Aves tables for k = 1..7, in order (list index 0 holds k=1).
aves_dfs = read_csv('Aves')

In [6]:
# Polypodiopsida tables for k = 1..7, in order (list index 0 holds k=1).
polypod_dfs = read_csv('Polypodiopsida')

In [7]:
# Pucciniomycetes tables for k = 1..7, in order (list index 0 holds k=1).
pucci_dfs = read_csv('Pucciniomycetes')

# Learning Models

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score

In [9]:
def get_acc_scores(X, y, n_splits=10, n_estimators=10, random_state=0):
    """Cross-validated train/test scores of a random forest.

    Splits the data with a shuffled ``StratifiedKFold``, fits a fresh
    ``RandomForestClassifier`` on each training fold and records
    ``rf.score`` on both the held-out fold and the training fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : pandas.Series, 1-D array-like or single-column pandas.DataFrame
        Labels, row-aligned with ``X`` by position.
    n_splits : int, optional
        Number of folds. Defaults to 10.
    n_estimators : int, optional
        Number of trees per forest. Defaults to 10.
    random_state : int or None, optional
        Seed used for both the fold shuffling and every forest, so repeated
        runs return the same scores. Defaults to 0. Pass ``None`` to leave
        both unseeded.

    Returns
    -------
    dict
        ``{'rf test': [...], 'rf train': [...]}`` with one plain ``float``
        per fold in each list (JSON-serialisable).
    """
    # Work on a positional array so that a Series, a 1-D array and a
    # single-column DataFrame are all accepted. A column vector is flattened
    # because scikit-learn expects a 1-D target and otherwise only converts
    # it after emitting a DataConversionWarning.
    labels = np.asarray(y)
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()

    skf = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    acc_scores = {'rf test': [], 'rf train': []}
    for train_index, test_index in skf.split(X, labels):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = labels[train_index], labels[test_index]
        # Seed the forest as well as the splitter; without it the scores
        # change from run to run even though the folds are fixed.
        rf = RandomForestClassifier(
            n_estimators=n_estimators, n_jobs=-1, random_state=random_state
        )
        rf.fit(X_train, y_train)
        acc_scores['rf test'].append(float(rf.score(X_test, y_test)))
        acc_scores['rf train'].append(float(rf.score(X_train, y_train)))
    return acc_scores

In [12]:
def main(class_, name):
    """Score every table of a dataset with ``get_acc_scores``.

    For each table, the columns from position 3 onward are taken as
    features and standardised with ``StandardScaler``; column position 2 is
    taken as the label. Both are handed to ``get_acc_scores`` as DataFrames.

    Parameters
    ----------
    class_ : sequence of pandas.DataFrame
        Tables to evaluate; the n-th table (counting from 1) is reported
        under the suffix ``.k<n>``.
    name : str
        Prefix for the result keys.

    Returns
    -------
    dict
        Maps ``'<name>.k<n>'`` to whatever ``get_acc_scores`` returned for
        that table.
    """
    results = {}
    for k, table in enumerate(class_, start=1):
        # NOTE(review): the original comment said this column offset is
        # "different for animal kingdom" - confirm the layout per dataset.
        raw_features = table.iloc[:, 3:]
        # NOTE(review): the scaler is fit on the whole table before
        # get_acc_scores splits it into folds.
        scaled_values = StandardScaler().fit_transform(raw_features)
        features_df = pd.DataFrame(scaled_values, columns=raw_features.columns)
        labels_df = pd.DataFrame(table.iloc[:, 2])
        results[name + '.k' + str(k)] = get_acc_scores(features_df, labels_df)
    return results

## Merged dataset

In [17]:
def combine_dataset(class_names=('Chiroptera', 'Rodentia', 'Aves'),
                    data_dir='datasets', k_values=range(1, 8)):
    """Stack several datasets into one table per k.

    For every ``k`` in ``k_values`` the files
    ``<data_dir>/<name>.Cleaned.k<k>.csv`` of all ``class_names`` are read
    and concatenated row-wise, in the order given, with a fresh 0..n-1 index.

    NOTE(review): the previous version also read the Pucciniomycetes and
    Polypodiopsida files and prepended an unnamed positional column to each,
    but never added them to the merged frame. That dead I/O is removed, so
    the function no longer fails when those files are missing. If they were
    meant to be merged, confirm their column layout first; naming them in
    ``class_names`` does not prepend that extra column.

    Parameters
    ----------
    class_names : sequence of str, optional
        Dataset names to merge. Defaults to Chiroptera, Rodentia and Aves.
    data_dir : str, optional
        Directory holding the CSV files. Defaults to ``'datasets'``.
    k_values : iterable of int, optional
        The ``k`` suffixes to build, in order. Defaults to 1 through 7.

    Returns
    -------
    list of pandas.DataFrame
        One merged frame per entry of ``k_values``.

    Raises
    ------
    FileNotFoundError
        If one of the expected files does not exist.
    """
    merged_frames = []
    for k in k_values:
        frames = [
            pd.read_csv('{}/{}.Cleaned.k{}.csv'.format(data_dir, name, k))
            for name in class_names
        ]
        # ignore_index=True renumbers the rows 0..n-1, replacing the manual
        # index reassignment that followed the concat before.
        merged_frames.append(pd.concat(frames, ignore_index=True))
    return merged_frames

In [18]:
# Build the merged tables once at notebook level; dump_results() reads this global.
dfs = combine_dataset()

In [21]:
def dump_results():
    """Score the merged tables and save the result as JSON.

    Runs ``main`` on the notebook-level ``dfs`` list under the name
    ``'combined_datasets'`` and writes the returned dict to
    ``non-hierarchical acc score(combined).json`` in the working directory,
    overwriting any existing file. Returns nothing.
    """
    combined_scores = main(dfs, 'combined_datasets')
    output_name = 'non-hierarchical acc score(combined).json'
    with open(output_name, 'w') as out_file:
        json.dump(combined_scores, out_file)


# Run the evaluation and write the JSON file when this cell executes.
dump_results()