In [1]:
import os, glob

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
from sklearn import preprocessing
from sklearn import metrics

## 8.1

For this exercise, we will use data from the NIST’s AnthroKids dataset. The dataset comes from a 1977 anthropometric study of body measurements for children. Subjects in this sample are between the ages of 8 and 18 years old, selected at random from the much larger dataset of the original study.

Use the SVM to see if we can use height, weight, and other features to predict the sex (F, M) of the child.

In [2]:
df = pd.read_csv("./data/anthrokids.csv")
df.head()

Unnamed: 0,id,mass,height,waist,foot,sittingHeight,upperLegLength,kneeHeight,forearmLength,age,gender,handedness,birthOrder
0,1,15.5,103.3,47.5,16.3,582.0,306.0,,259.0,4.219,F,right,1.0
1,2,17.6,103.9,49.8,16.3,606.0,311.0,,274.0,4.326,M,right,1.0
2,3,23.0,111.2,52.0,17.1,594.0,387.0,,304.0,4.476,F,right,1.0
3,4,16.5,99.7,49.1,16.3,542.0,312.0,,281.0,3.841,F,both,1.0
4,5,15.0,99.7,46.5,16.7,524.0,321.0,,269.0,3.46,F,both,1.0


In [6]:
df.isna().sum()

id                  0
mass                4
height              6
waist              80
foot               62
sittingHeight      42
upperLegLength     28
kneeHeight        212
forearmLength      69
age                 2
gender              0
handedness        115
birthOrder        165
dtype: int64

In [8]:
df.shape

(3900, 13)

In [10]:
df = df.dropna(subset=["height", "mass", "waist", "foot"])

In [11]:
#Creating X and y
X = df[["height", "mass", "waist", "foot"]]
y = df[['gender']]

#Creating training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [12]:
#Building the SVC model and fitting the training data
model = SVC(kernel='linear')
model.fit(X_train,y_train.values.ravel())

#Predicting on the test data
predictions = model.predict(X_test)

#Printing the accuracy
print("Accuracy:", accuracy_score(y_test, predictions))
#Printing the confusion matrix
print(confusion_matrix(y_test,predictions))
#Printing the classification report
print(classification_report(y_test,predictions))

Accuracy: 0.660245183887916
[[385 167]
 [221 369]]
              precision    recall  f1-score   support

           F       0.64      0.70      0.66       552
           M       0.69      0.63      0.66       590

    accuracy                           0.66      1142
   macro avg       0.66      0.66      0.66      1142
weighted avg       0.66      0.66      0.66      1142



## 8.2
Download hsbdemo2 dataset. Create a classifier using SVM from the reading, writing, mathematics, and science scores of the high-school students. Evaluate the classifier's accuracy in predicting which academic program the student will be joining. Report other evaluative criteria for assessing the goodness of the model built.

In [13]:
df = pd.read_csv("./data/hsbdemo2.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,female,ses,schtyp,prog,read,write,math,science,socst,honors,awards,cid
0,11,1,female,low,public,vocation,34,44,40,39,41,not enrolled,0,1
1,9,2,female,middle,public,vocation,39,41,33,42,41,not enrolled,0,1
2,159,3,male,low,public,academic,63,65,48,63,56,enrolled,5,16
3,30,4,female,low,public,academic,44,50,41,39,51,not enrolled,1,3
4,33,5,male,low,public,academic,47,40,43,45,31,not enrolled,0,4


In [14]:
df.isna().sum()

Unnamed: 0    0
id            0
female        0
ses           0
schtyp        0
prog          0
read          0
write         0
math          0
science       0
socst         0
honors        0
awards        0
cid           0
dtype: int64

In [15]:
df.shape

(155, 14)

In [None]:
#Creating X and y
X = df[["read","write","math","science"]]
y = df[['gender']]

#Creating training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)