In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### Split Training / Test into X and y separately for full decade & half decades samples 

In [2]:
train10 = pd.read_csv('train10.csv')
test10 = pd.read_csv('test10.csv')   
X_train10 = train10.iloc[:,3:]
X_test10 = test10.iloc[:,3:]
y_train10 = train10['Decade released']
y_test10 = test10['Decade released']


train5 = pd.read_csv('train5.csv')
test5 = pd.read_csv('test5.csv') 
X_train5 = train5.iloc[:,3:]
X_test5 = test5.iloc[:,3:]
y_train5 = train5['Half decade released']
y_test5 = test5['Half decade released']


#define the class labels separately for the full decade and half decade model - these are used to construct confusion matrices
classes10 = [1970, 1980, 1990, 2000]
classes5 = [1970, 1975, 1980, 1985, 1990, 1995, 2000, 2005]

# Model Evaluation Functions

In [3]:
def cm2df(cm, labels): 
    """converts a numpy confusion matrix to a pandas dataframe, with class labels"""
    df = pd.DataFrame()
    # rows
    for i, row_label in enumerate(labels):
        rowdata={}
        # columns
        for j, col_label in enumerate(labels): 
            rowdata[col_label]=cm[i,j]
        df = df.append(pd.DataFrame.from_dict({row_label:rowdata}, orient='index'))
    return df[labels]

In [4]:
def model_eval(y_test, y_pred, classes): #classes is a list of target variable labels
    """prints out avg model accuracy, as well as a confusion matrix and a classification
    report specific to each individual class"""
    conf_mat = confusion_matrix(y_test, y_pred)
    cm_as_df=cm2df(conf_mat,classes)
    print('CONFUSION MATRIX (predicted along top, actual along side): ')
    display(cm_as_df)
    print(classification_report(y_test,y_pred))

# Support Vector Classifiers

### Using Full Decade Samples

In [5]:
#RBF KERNEL
clf = svm.SVC(kernel='rbf', C=0.001) #max_iter=500
clf.fit(X_train10, y_train10)
y_pred10 = clf.predict(X_test10)

model_eval(y_test10, y_pred10, classes10)

CONFUSION MATRIX (predicted along top, actual along side): 


Unnamed: 0,1970,1980,1990,2000
1970,1194,365,135,702
1980,1165,1251,395,1390
1990,3123,2292,1436,5729
2000,5699,3838,1683,18665


             precision    recall  f1-score   support

       1970       0.11      0.50      0.18      2396
       1980       0.16      0.30      0.21      4201
       1990       0.39      0.11      0.18     12580
       2000       0.70      0.62      0.66     29885

avg / total       0.55      0.46      0.48     49062



In [6]:
#SIGMOID KERNEL
clf = svm.SVC(kernel='sigmoid', C=0.001, coef0=0.5) #max_iter=500
clf.fit(X_train10, y_train10)
y_pred10 = clf.predict(X_test10)

model_eval(y_test10, y_pred10, classes10)

CONFUSION MATRIX (predicted along top, actual along side): 


Unnamed: 0,1970,1980,1990,2000
1970,849,1329,103,115
1980,795,2965,216,225
1990,2755,7032,1013,1780
2000,6272,13553,1791,8269


             precision    recall  f1-score   support

       1970       0.08      0.35      0.13      2396
       1980       0.12      0.71      0.20      4201
       1990       0.32      0.08      0.13     12580
       2000       0.80      0.28      0.41     29885

avg / total       0.58      0.27      0.31     49062



### Using Half Decade Samples 

In [7]:
# RBF KERNEL
clf = svm.SVC(kernel='rbf', C=0.001) #max_iter=100
clf.fit(X_train5, y_train5)
y_pred5 = clf.predict(X_test5)

model_eval(y_test5, y_pred5, classes5)

CONFUSION MATRIX (predicted along top, actual along side): 


Unnamed: 0,1970,1975,1980,1985,1990,1995,2000,2005
1970,579,142,55,85,18,22,245,26
1975,413,187,109,146,19,31,284,35
1980,354,153,234,257,27,52,475,60
1985,636,180,204,642,87,92,668,80
1990,1263,262,221,877,151,244,1465,193
1995,1751,485,372,998,210,422,3091,575
2000,2278,565,537,1269,198,444,5037,1673
2005,3405,830,748,1658,214,451,7173,3405


             precision    recall  f1-score   support

       1970       0.05      0.49      0.10      1172
       1975       0.07      0.15      0.09      1224
       1980       0.09      0.15      0.11      1612
       1985       0.11      0.25      0.15      2589
       1990       0.16      0.03      0.05      4676
       1995       0.24      0.05      0.09      7904
       2000       0.27      0.42      0.33     12001
       2005       0.56      0.19      0.28     17884

avg / total       0.34      0.22      0.22     49062



In [8]:
#SIGMOID KERNEL
clf = svm.SVC(kernel='sigmoid', C=0.001, coef0=0.5) #max_iter=100
clf.fit(X_train5, y_train5)
y_pred5 = clf.predict(X_test5)

model_eval(y_test5, y_pred5, classes5)

CONFUSION MATRIX (predicted along top, actual along side): 


Unnamed: 0,1970,1975,1980,1985,1990,1995,2000,2005
1970,260,102,487,261,16,3,3,40
1975,202,89,679,208,13,2,2,29
1980,180,57,973,319,16,9,4,54
1985,366,68,1266,734,42,13,5,95
1990,821,125,1898,1355,123,34,13,307
1995,1290,268,3310,1661,178,131,51,1015
2000,1931,435,4797,2082,179,138,68,2371
2005,2731,610,6996,2783,208,150,91,4315


             precision    recall  f1-score   support

       1970       0.03      0.22      0.06      1172
       1975       0.05      0.07      0.06      1224
       1980       0.05      0.60      0.09      1612
       1985       0.08      0.28      0.12      2589
       1990       0.16      0.03      0.05      4676
       1995       0.27      0.02      0.03      7904
       2000       0.29      0.01      0.01     12001
       2005       0.52      0.24      0.33     17884

avg / total       0.33      0.14      0.14     49062



# Decision Tree Classifier (CART)

#### Decision rule generation function 

In [9]:
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    """outputs a function detailing decision rules for model fit in if else form"""
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print("{}return {}".format(indent, tree_.value[node]))

    recurse(0, 1)

### Using Full Decade Samples 

In [10]:
CART = DecisionTreeClassifier()
CART.fit(X_train10, y_train10)
y_pred10 = CART.predict(X_test10)

model_eval(y_test10, y_pred10, classes10)

CONFUSION MATRIX (predicted along top, actual along side): 


Unnamed: 0,1970,1980,1990,2000
1970,929,602,474,391
1980,1113,1380,1034,674
1990,2952,3365,3542,2721
2000,6295,6657,7765,9168


             precision    recall  f1-score   support

       1970       0.08      0.39      0.14      2396
       1980       0.11      0.33      0.17      4201
       1990       0.28      0.28      0.28     12580
       2000       0.71      0.31      0.43     29885

avg / total       0.52      0.31      0.35     49062



In [11]:
# print(tree_to_code(CART.fit(X_train10, y_train10),X_train10.columns.values))

In [12]:
#print out the most important candidate splits, in order
print(list(zip(X_train10.columns[CART.tree_.feature], CART.tree_.threshold, CART.tree_.children_left, CART.tree_.children_right))[0:50])

[('PrinComp13', -0.1976751983165741, 1, 21228), ('PrinComp8', -0.13339287042617798, 2, 9063), ('PrinComp2', -0.46834075450897217, 3, 3962), ('PrinComp5', -0.33033832907676697, 4, 1297), ('PrinComp6', 0.4166070222854614, 5, 698), ('PrinComp15', 0.0008934421348385513, 6, 347), ('PrinComp11', 0.023176882416009903, 7, 170), ('PrinComp21', -0.08291637897491455, 8, 85), ('PrinComp8', -0.4276554584503174, 9, 46), ('PrinComp24', 1.5771021842956543, 10, 41), ('PrinComp22', 0.5921899676322937, 11, 38), ('PrinComp22', 0.5036613941192627, 12, 37), ('PrinComp17', -0.0027984227053821087, 13, 32), ('PrinComp17', -0.6010042428970337, 14, 23), ('PrinComp12', 0.5175155401229858, 15, 20), ('PrinComp20', -0.4730950891971588, 16, 19), ('PrinComp29', -0.6041961908340454, 17, 18), ('PrinComp29', -2.0, -1, -1), ('PrinComp29', -2.0, -1, -1), ('PrinComp29', -2.0, -1, -1), ('PrinComp27', 0.05499275028705597, 21, 22), ('PrinComp29', -2.0, -1, -1), ('PrinComp29', -2.0, -1, -1), ('PrinComp6', -0.3181633949279785, 2

### Using Half Decade Samples 

In [13]:
CART = DecisionTreeClassifier()
CART.fit(X_train5, y_train5)
y_pred5 = CART.predict(X_test5)

model_eval(y_test5, y_pred5, classes5)

CONFUSION MATRIX (predicted along top, actual along side): 


Unnamed: 0,1970,1975,1980,1985,1990,1995,2000,2005
1970,205,211,160,123,154,112,109,98
1975,225,225,199,140,142,105,97,91
1980,214,226,284,231,193,188,151,125
1985,325,320,373,504,354,300,223,190
1990,581,569,629,731,721,564,460,421
1995,938,963,961,1053,1055,1081,969,884
2000,1306,1388,1372,1398,1484,1549,1743,1761
2005,1897,1886,1966,2025,2208,2430,2634,2838


             precision    recall  f1-score   support

       1970       0.04      0.17      0.06      1172
       1975       0.04      0.18      0.06      1224
       1980       0.05      0.18      0.08      1612
       1985       0.08      0.19      0.11      2589
       1990       0.11      0.15      0.13      4676
       1995       0.17      0.14      0.15      7904
       2000       0.27      0.15      0.19     12001
       2005       0.44      0.16      0.23     17884

avg / total       0.27      0.15      0.18     49062



# Random Forest Classifier

### Using Full Decade Samples

In [14]:
RF = RandomForestClassifier()
RF.fit(X_train10, y_train10)
y_pred10 = RF.predict(X_test10)

model_eval(y_test10, y_pred10, classes10)

CONFUSION MATRIX (predicted along top, actual along side): 


Unnamed: 0,1970,1980,1990,2000
1970,1308,575,339,174
1980,1380,1664,763,394
1990,3840,3668,3073,1999
2000,7911,6466,6661,8847


             precision    recall  f1-score   support

       1970       0.09      0.55      0.16      2396
       1980       0.13      0.40      0.20      4201
       1990       0.28      0.24      0.26     12580
       2000       0.78      0.30      0.43     29885

avg / total       0.56      0.30      0.35     49062



### Using Half Decade Samples

In [15]:
RF = RandomForestClassifier()
RF.fit(X_train5, y_train5)
y_pred5 = RF.predict(X_test5)

model_eval(y_test5, y_pred5, classes5)

CONFUSION MATRIX (predicted along top, actual along side): 


Unnamed: 0,1970,1975,1980,1985,1990,1995,2000,2005
1970,435,268,128,102,89,60,45,45
1975,369,292,199,124,90,66,50,34
1980,304,340,351,246,131,112,83,45
1985,516,401,458,561,273,177,125,78
1990,926,689,681,808,642,403,275,252
1995,1483,1238,1040,1016,954,883,691,599
2000,2052,1685,1496,1282,1238,1400,1423,1425
2005,2954,2422,2116,1809,1771,1982,2236,2594


             precision    recall  f1-score   support

       1970       0.05      0.37      0.09      1172
       1975       0.04      0.24      0.07      1224
       1980       0.05      0.22      0.09      1612
       1985       0.09      0.22      0.13      2589
       1990       0.12      0.14      0.13      4676
       1995       0.17      0.11      0.14      7904
       2000       0.29      0.12      0.17     12001
       2005       0.51      0.15      0.23     17884

avg / total       0.31      0.15      0.17     49062



# KNN Classifier

### Using Full Decade Samples 

In [16]:
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train10, y_train10)
y_pred10 = knn.predict(X_test10)

model_eval(y_test10, y_pred10, classes10)

CONFUSION MATRIX (predicted along top, actual along side): 


Unnamed: 0,1970,1980,1990,2000
1970,1441,497,301,157
1980,1533,1596,748,324
1990,4236,3473,3148,1723
2000,9393,6027,6733,7732


             precision    recall  f1-score   support

       1970       0.09      0.60      0.15      2396
       1980       0.14      0.38      0.20      4201
       1990       0.29      0.25      0.27     12580
       2000       0.78      0.26      0.39     29885

avg / total       0.56      0.28      0.33     49062



### Using Half Decade Samples 

In [17]:
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train5, y_train5)
y_pred5 = knn.predict(X_test5)

model_eval(y_test5, y_pred5, classes5)

CONFUSION MATRIX (predicted along top, actual along side): 


Unnamed: 0,1970,1975,1980,1985,1990,1995,2000,2005
1970,505,304,109,69,69,43,38,35
1975,431,322,178,101,89,34,43,26
1980,402,372,319,231,112,66,67,43
1985,608,481,434,512,274,141,79,60
1990,1180,827,619,741,603,319,215,172
1995,1864,1486,976,924,863,719,588,484
2000,2674,2058,1461,1270,1066,1036,1268,1168
2005,3926,2981,2089,1678,1450,1479,2034,2247


             precision    recall  f1-score   support

       1970       0.04      0.43      0.08      1172
       1975       0.04      0.26      0.06      1224
       1980       0.05      0.20      0.08      1612
       1985       0.09      0.20      0.13      2589
       1990       0.13      0.13      0.13      4676
       1995       0.19      0.09      0.12      7904
       2000       0.29      0.11      0.16     12001
       2005       0.53      0.13      0.20     17884

avg / total       0.32      0.13      0.16     49062

