In [1]:
from scipy.io import arff
import pandas as pd
import os
from sklearn import tree
from sklearn.metrics import accuracy_score, mean_absolute_error,mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import pydot
import numpy as np

In [2]:
# Load every file found under the data directory into a single DataFrame.
# Frames are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies all previously loaded rows for each file.
results_columns = pd.DataFrame()
frames = []

for dirpath, dirnames, filenames in os.walk("../data_sets/data/", topdown=False):
    for name in filenames:
        fullpath = os.path.join(dirpath, name)
        # loadarff returns a (records, metadata) pair; only the records are used.
        data = pd.DataFrame(arff.loadarff(fullpath)[0])
        # The 'class' values are decoded from bytes to str.
        data['class'] = data['class'].str.decode('utf-8')
        # The first character of the file name is used as the year
        # (assumes names start with a single digit, e.g. "1year.arff" -- TODO confirm).
        data['year'] = float(name[0])
        frames.append(data)

# pd.concat raises on an empty list, so keep the previous behaviour of an
# empty DataFrame when no files were found.
results = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()
        
                

In [3]:
# Order the rows by the 'year' column.
results = results.sort_values('year')

In [4]:
# Preview the first five rows of the sorted data.
results.head(n=5)

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class,year
43404,0.014946,0.94648,0.03211,1.0363,-20.581,0.0,0.01526,0.056357,2.9694,0.053341,...,0.28021,0.97443,1.1792,15.036,4.1741,108.64,3.3599,35.118,1,1.0
38710,-0.083327,0.94288,-0.17625,0.6931,-92.912,-0.095666,-0.078056,-0.005651,0.87141,-0.005328,...,15.64,1.1476,-69.182,18.401,2.1509,278.57,1.3103,1.2501,0,1.0
38711,0.16124,0.65754,0.15056,1.4513,-9.5086,0.49851,0.20649,0.49893,1.1084,0.32807,...,0.49147,0.90223,0.98741,10.732,10.046,56.648,6.4433,4.167,0,1.0
38712,0.031386,0.038397,0.19497,6.1245,358.73,0.010658,0.031643,25.044,1.1886,0.9616,...,0.03264,0.8413,0.000365,5.3563,2.8605,66.169,5.5162,0.27362,0,1.0
38713,-0.092135,1.1327,-0.19627,0.82672,-99.452,-0.14032,-0.092135,-0.11712,0.95725,-0.13266,...,0.69452,1.0447,0.0,5.966,2.9435,252.19,1.4473,25.774,0,1.0


In [None]:
# Print the full per-column summary (verbose=True lists every column).
results.info(verbose=True)

In [None]:
# Names of the columns that contain at least one missing value.
has_missing = results.isna().any(axis=0)
missing_cols = results.columns[has_missing]

In [None]:
# Summary statistics for the columns of `results`.
results.describe()

In [None]:

# Share of missing vs. present values for every column that has missing data.
# sns.displot is a figure-level function: it always creates its own figure, so
# a preceding plt.figure(...) call only leaves an extra blank figure behind.
# The intended (roughly 10x10 inch) size is requested via height/aspect instead.
sns.displot(
    data=results[missing_cols].isna().melt(value_name="missing"),
    y="variable",
    hue="missing",
    multiple="fill",
    height=10,
    aspect=1,
);

***
# Impute missing data

In [None]:
# One imputer per fill strategy (mean, median, most frequent value).
# np.nan is SimpleImputer's documented default missing-value marker and is
# accepted by every scikit-learn release, whereas pd.NA is only understood by
# newer ones. Presumably the missing entries here are float NaN (numeric ARFF
# columns) -- confirm; in that case both markers select the same cells.
mean_imp = SimpleImputer(missing_values=np.nan, strategy='mean')
median_imp = SimpleImputer(missing_values=np.nan, strategy='median')
freq_imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [None]:
# NOTE(review): df_impute is the same object as `results` (no copy is made), so
# it still carries the 'class' and 'year' columns, and every imputer is fitted
# on all rows before any train/test split takes place.
df_impute = results
imputed_columns = df_impute.columns

# One imputed version of the data per strategy; fit_transform returns an
# array, so the column names are reattached here.
df_mean = pd.DataFrame(mean_imp.fit_transform(df_impute), columns=imputed_columns)
df_median = pd.DataFrame(median_imp.fit_transform(df_impute), columns=imputed_columns)
df_freq = pd.DataFrame(freq_imp.fit_transform(df_impute), columns=imputed_columns)

# Begin Modeling

In [None]:

# Features are every column except the target; 'class' is the label.
X = df_mean.drop(columns='class')
y = df_mean['class']

# Hold out the final 20% of the rows for testing. With shuffle=False the rows
# are split in their existing order, so random_state has no effect here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False, random_state=0
)

In [None]:
# A fixed random_state makes the forest reproducible: without it the
# predictions, the metrics derived from them and the individual trees in
# model.estimators_ differ on every run. 0 matches the seed passed to
# train_test_split in this notebook.
model = RandomForestClassifier(random_state=0)

# Train the model on the training data, then predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Side-by-side table of the held-out labels and the predictions; the rows keep
# y_test's index.
model_results = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
model_results

In [None]:
# Error metrics on the held-out set; the MSE is computed once and reused for
# the RMSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

In [None]:

# accuracy_score returns a fraction in [0, 1]; scale it by 100 when printing so
# the number matches the '%' label. `accuracy` itself stays a fraction.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', round(accuracy * 100, 2), '%.')

In [None]:

# Compare the distribution of the held-out labels with the model's predictions.
# sns.distplot is deprecated (since seaborn 0.11); sns.kdeplot draws the density
# curve that distplot(hist=False) used to produce.
fig, ax = plt.subplots(figsize=(5, 7))

# y_pred was predicted from X_test, so it is compared with y_test rather than
# with the labels of the whole data set.
sns.kdeplot(x=y_test, color="r", label="Actual Value", ax=ax)
sns.kdeplot(x=y_pred, color="b", label="Fitted Values", ax=ax)

ax.set_title('Actual vs Fitted Values')
ax.legend()  # the curve labels are only shown once a legend is drawn

plt.show()
plt.close(fig)

In [None]:

# Take a single decision tree (index 5) out of the fitted forest.
Tree = model.estimators_[5]

# Draw that tree with matplotlib on a 25x15 inch figure.
plt.figure(figsize=(25, 15))
tree.plot_tree(
    Tree,
    filled=True,
    rounded=True,
    fontsize=14,
);