In [1]:
import time
import datetime
import random
import string
from sys import maxsize
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats
from sklearn.linear_model import Lasso
from sklearn import preprocessing, svm
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.metrics import precision_recall_fscore_support
from IPython.display import display, HTML

# PCA

In [2]:
class PCA:
    def __init__(self):
        self.eigenvalues = None
        self.eigenvectors = None
    
    def fit(self, x):
        x = np.array(x)
        mean = np.mean(x, axis=0)
        normalized = x - np.mean(x, axis=0)
        covariance = np.cov(x.T)
        eigenvalues, eigenvectors = np.linalg.eig(covariance)
        
        eigenpairs = [(np.abs(eigenvalues[i]), eigenvectors[:, i]) for i in range(len(eigenvalues))]
        eigenpairs.sort(key=lambda x: x[0], reverse=True)
        
        self.eigenvalues = np.array([eigenvalue for eigenvalue, eigenvector in eigenpairs])
        self.eigenvectors = np.array([eigenvector for eigenvalue, eigenvector in eigenpairs])
        self._eigenpairs = eigenpairs
        
    def transform(self, x, components):
        totalComponents = len(self.eigenvalues)
        if components > totalComponents or components < 1:
            raise ValueError('Invalid number of components')
        
        W = np.hstack([self.eigenvectors[i].reshape(totalComponents, 1) for i in range(components)])
        return np.array(W.T.dot(x.T).T)
    
    def fit_transform(self, x, components):
        self.fit(x)
        return self.transform(x, components)

# Alphabet Letter Classification

In [5]:
letterDf = pd.read_csv('data/letter-recognition.csv')
letterDf.Letter = letterDf.Letter.apply(lambda c: string.ascii_uppercase.find(c))
display(letterDf.head())
print('XBoxPos mean:', letterDf.XBoxPos.mean())

yCol = 'Letter'
xCols = [col for col in letterDf.columns if col != yCol]
trainProp = 0.8
numTraining = int(len(letterDf) * trainProp)
letterDf = shuffle(letterDf)

Unnamed: 0,Letter,XBoxPos,YBoxPos,Width,Height,PixelCount,XBar,YBar,X2Bar,Y2Bar,XYBar,X2YBar,XY2Bar,XEdge,XEdgeVY,YEdge,YEdgeVX
0,19,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,8,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,3,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,13,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,6,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


XBoxPos mean: 4.02355


In [4]:
pca = PCA()
newX = pca.fit_transform(letterDf[xCols].values, 10)
x = letterDf[xCols]
y = letterDf[yCol]
xTrain, xTest = x[:numTraining], x[numTraining:]
pcaXTrain, pcaXTest = newX[:numTraining], newX[numTraining:]
yTrain, yTest = y[:numTraining], y[numTraining:]

## SVM to Classify Letters on Untransformed Data

In [11]:
classifier = svm.SVC()
classifier.fit(xTrain, yTrain)
print('Accuracy:', classifier.score(xTest, yTest))
predicted = classifier.predict(xTest)
precision, recall, f1, support = precision_recall_fscore_support(yTest, predicted, average='macro')
print('Precision: {}, recall: {}, F1: {}'.format(precision, recall, f1))

Accuracy: 0.9765
Precision: 0.9766720830717864, recall: 0.9762443966573654, F1: 0.9763356050865389


## SVM Classifying Letters on PCA Transformed Data (10 components)

In [10]:
classifier = svm.SVC()
classifier.fit(pcaXTrain, yTrain)
print('Accuracy:', classifier.score(pcaXTest, yTest))
predicted = classifier.predict(pcaXTest)
precision, recall, f1, support = precision_recall_fscore_support(yTest, predicted, average='macro')
print('Precision: {}, recall: {}, F1: {}'.format(precision, recall, f1))

Accuracy: 0.952
Precision: 0.9526156008131282, recall: 0.9516717707845855, F1: 0.9518743774697247


# Predicting Doctor Visit No Shows

In [7]:
def convertTime(timeStr):
    return time.mktime(datetime.datetime.strptime(timeStr, '%Y-%m-%dT%H:%M:%SZ').timetuple())

noShows = pd.read_csv('data/No-show-Issue-Comma-300k.csv',
                     converters={'AppointmentRegistration': convertTime,
                                'ApointmentData': convertTime}).sample(n=300)
print('Number of no show samples:', len(noShows))
noShows.rename(columns={'ApointmentData': 'AppointmentDate', 'Alcoolism': 'Alcoholism'}, inplace=True)
noShows.loc[noShows['Status'] == 'Show-Up', 'Status'] = 1
noShows.loc[noShows['Status'] == 'No-Show', 'Status'] = -1
noShows['Status'] = noShows['Status'].astype(int)
noShows['AptRegistrationDifference'] = noShows.AppointmentDate - noShows.AppointmentRegistration
display(noShows.head())
print(noShows.Status.unique())
print(noShows.columns)

xCols = ['Age', 'AptRegistrationDifference', 'Diabetes', 'Alcoholism', 'HiperTension', 'Handcap',
         'Smokes', 'Scholarship', 'Tuberculosis', 'Sms_Reminder', 'AwaitingTime']
yCol = 'Status'

Number of no show samples: 300


Unnamed: 0,Age,Gender,AppointmentRegistration,AppointmentDate,DayOfTheWeek,Status,Diabetes,Alcoholism,HiperTension,Handcap,Smokes,Scholarship,Tuberculosis,Sms_Reminder,AwaitingTime,AptRegistrationDifference
268498,2,M,1432197000.0,1433203000.0,Tuesday,-1,0,0,0,0,0,0,0,0,-12,1006198.0
162849,5,M,1428483000.0,1429142000.0,Thursday,1,0,0,0,0,0,0,0,1,-8,659401.0
129172,62,F,1416901000.0,1418861000.0,Thursday,1,0,0,1,0,0,0,0,1,-23,1959617.0
214990,3,M,1416309000.0,1420502000.0,Tuesday,1,0,0,0,0,0,0,0,1,-49,4192954.0
114323,6,M,1414513000.0,1415750000.0,Wednesday,1,0,0,0,0,0,0,0,0,-15,1237622.0


[-1  1]
Index(['Age', 'Gender', 'AppointmentRegistration', 'AppointmentDate',
       'DayOfTheWeek', 'Status', 'Diabetes', 'Alcoholism', 'HiperTension',
       'Handcap', 'Smokes', 'Scholarship', 'Tuberculosis', 'Sms_Reminder',
       'AwaitingTime', 'AptRegistrationDifference'],
      dtype='object')


In [8]:
numTraining = int(len(noShows) * trainProp)
noShows = shuffle(noShows)
pca = PCA()
newX = pca.fit_transform(noShows[xCols].values, 3)
x = noShows[xCols]
y = noShows[yCol]
xTrain, xTest = x[:numTraining], x[numTraining:]
pcaXTrain, pcaXTest = newX[:numTraining], newX[numTraining:]
yTrain, yTest = y[:numTraining], y[numTraining:]

## SVM on Untransformed Data

In [9]:
classifier = svm.SVC()
classifier.fit(xTrain, yTrain)
print('Accuracy:', classifier.score(xTest, yTest))
predicted = classifier.predict(xTest)
precision, recall, f1, support = precision_recall_fscore_support(yTest, predicted, average='macro')
print('Precision: {}, recall: {}, F1: {}'.format(precision, recall, f1))

Accuracy: 0.633333333333
Precision: 0.31666666666666665, recall: 0.5, F1: 0.3877551020408163


  'precision', 'predicted', average, warn_for)


## SVM on PCA Transformed Data (3 components)

In [10]:
classifier = svm.SVC()
classifier.fit(pcaXTrain, yTrain)
print('Accuracy:', classifier.score(pcaXTest, yTest))
predicted = classifier.predict(pcaXTest)
precision, recall, f1, support = precision_recall_fscore_support(yTest, predicted, average='macro')
print('Precision: {}, recall: {}, F1: {}'.format(precision, recall, f1))

Accuracy: 0.633333333333
Precision: 0.31666666666666665, recall: 0.5, F1: 0.3877551020408163


  'precision', 'predicted', average, warn_for)


In [11]:
# HR Employee Dataset - Predicting whether or not employees will leave
### Source: Kaggle https://www.kaggle.com/ludobenistant/hr-analytics

In [19]:
hrDf = pd.read_csv('data/HR_comma_sep.csv', sep=',').sample(frac=0.1)
# hrDf.loc[hrDf['last_evaluation'], 'last_evaluation'] = 
hrDf['last_evaluation'] *= 100
hrDf['satisfaction_level'] *= 100
# display(hrDf.head())
# encoder = OneHotEncoder()
# print(hrDf['sales'])
# encoder.fit(hrDf['sales'])
# hrDf['encodedSales'] = encoder.transform(hrDf['sales'])
# display(hrDf.head())
yCol = 'left'
hrDf = pd.get_dummies(hrDf, 'enc', columns=categoricalColumns)
xCols = [col for col in hrDf.columns if col != yCol]
print('Columns:', hrDf.columns)
display(hrDf.head())

Columns: Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'enc_IT', 'enc_RandD', 'enc_accounting',
       'enc_hr', 'enc_management', 'enc_marketing', 'enc_product_mng',
       'enc_sales', 'enc_support', 'enc_technical', 'enc_high', 'enc_low',
       'enc_medium'],
      dtype='object')


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,enc_IT,enc_RandD,...,enc_hr,enc_management,enc_marketing,enc_product_mng,enc_sales,enc_support,enc_technical,enc_high,enc_low,enc_medium
14596,75.0,90.0,5,256,5,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5837,79.0,86.0,4,173,4,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1848,78.0,86.0,5,274,5,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1884,87.0,91.0,5,228,5,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3107,74.0,96.0,4,154,4,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [32]:
numTraining = int(len(hrDf) * trainProp)
noShows = shuffle(hrDf)
pca = PCA()
newX = pca.fit_transform(hrDf[xCols].values, 8)
x = hrDf[xCols]
y = hrDf[yCol]
xTrain, xTest = x[:numTraining], x[numTraining:]
pcaXTrain, pcaXTest = newX[:numTraining], newX[numTraining:]
yTrain, yTest = y[:numTraining], y[numTraining:]

## SVM Classifying Untransformed Data

In [33]:
classifier = svm.SVC()
classifier.fit(xTrain, yTrain)
print('Accuracy:', classifier.score(xTest, yTest))
predicted = classifier.predict(xTest)
precision, recall, f1, support = precision_recall_fscore_support(yTest, predicted, average='macro')
print('Precision: {}, recall: {}, F1: {}'.format(precision, recall, f1))

Accuracy: 0.906666666667
Precision: 0.9356813259252283, recall: 0.7963615399397861, F1: 0.8419150858175247


## SVM Classifying PCA Transformed Data (8 components)

In [34]:
classifier = svm.SVC()
classifier.fit(pcaXTrain, yTrain)
print('Accuracy:', classifier.score(pcaXTest, yTest))
predicted = classifier.predict(pcaXTest)
precision, recall, f1, support = precision_recall_fscore_support(yTest, predicted, average='macro')
print('Precision: {}, recall: {}, F1: {}'.format(precision, recall, f1))

Accuracy: 0.883333333333
Precision: 0.9347014925373134, recall: 0.7388059701492538, F1: 0.7883021835117643
