## First model with scikit-learn

In [56]:
import pandas as pd

In [57]:
# Load the dataset used for fitting; the path is relative to the working directory.
adult_census = pd.read_csv("data/adult-census-numeric.csv")

In [58]:
# Preview the first rows to check which columns were loaded.
adult_census.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,class
0,41,0,0,92,<=50K
1,48,0,0,40,<=50K
2,60,0,0,25,<=50K
3,37,0,0,45,<=50K
4,73,3273,0,40,<=50K


In [59]:
# (number of rows, number of columns) of the full frame, target column included.
adult_census.shape

(39073, 5)

In [60]:
# Row count only, i.e. the number of samples in the frame.
len(adult_census)

39073

## Separate the data and the target

In [61]:
# Name of the column holding the labels to predict.
target_name = "class"
target = adult_census[target_name]
# Bare name as the last expression so the notebook displays the Series.
target

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
39068     <=50K
39069     <=50K
39070      >50K
39071     <=50K
39072      >50K
Name: class, Length: 39073, dtype: object

In [62]:
# Everything except the label column is kept as the feature matrix.
data = adult_census.drop(columns=target_name)
data.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
0,41,0,0,92
1,48,0,0,40
2,60,0,0,25
3,37,0,0,45
4,73,3273,0,40


In [63]:
# Check that only feature columns remain after dropping the target.
data.columns

Index(['age', 'capital-gain', 'capital-loss', 'hours-per-week'], dtype='object')

In [64]:
# Report the dataset dimensions: rows are samples, columns are features.
# The message wording matches the one printed for the testing dataset.
n_samples, n_features = data.shape
print(f"The dataset contains {n_samples} samples and {n_features} features")

The dataset cotains 39073 samples and 4 features


## Fit a model and make predictions

In [65]:
from sklearn import set_config

In [66]:
# Ask scikit-learn to render estimators as HTML diagrams in notebook output.
set_config(display='diagram')

In [67]:
from sklearn.neighbors import KNeighborsClassifier

In [68]:
# Instantiate with default hyperparameters, then train on features and labels.
# `fit` is the last expression, so the notebook displays its return value.
model = KNeighborsClassifier()
model.fit(data, target)

In [69]:
# Predict on the very data the model was fitted on, so these are
# training-set predictions, not an estimate of performance on new data.
target_predicted = model.predict(data)

In [70]:
# Peek at the first five predicted labels.
target_predicted[:5]

array([' >50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)

In [71]:
# The true labels for the same first five rows, for comparison.
target[:5]

0     <=50K
1     <=50K
2     <=50K
3     <=50K
4     <=50K
Name: class, dtype: object

In [72]:
# Element-wise comparison: True where the prediction matches the true label.
target[:5] == target_predicted[:5]

0    False
1     True
2     True
3     True
4     True
Name: class, dtype: bool

In [73]:
# Count how many of the first few predictions match the true labels.
# The denominator is derived from the comparison itself, so the printed
# ratio cannot drift from the slice length (or overstate it on short data).
n_shown = 5
first_matches = target[:n_shown] == target_predicted[:n_shown]
print(f"Number of correct predictions: "
      f"{first_matches.sum()} / {len(first_matches)}")

Number of correct prediction: 4 / 5


In [74]:
# Mean of the boolean matches = fraction of correct predictions over all of
# the data the model was fitted on.
(target == target_predicted).mean()  # average success rate

0.8224349294909529

## Train-test data split

In [75]:
# Separate CSV used for testing; it is not the file the model was fitted on.
adult_census_test = pd.read_csv("data/adult-census-numeric-test.csv")

In [76]:
# Split the test frame the same way as the training frame:
# feature columns on one side, the label column on the other.
data_test = adult_census_test.drop(columns=target_name)
target_test = adult_census_test[target_name]

In [77]:
# Report the testing set dimensions: rows are samples, columns are features.
n_test_samples, n_test_features = data_test.shape
print(f"The testing dataset contains {n_test_samples} samples and "
      f"{n_test_features} features")

The testing dataset contains 9769 samples and 4 features


In [78]:
# Evaluate on the testing set, which was not used in `model.fit`.
# For a classifier, `score` returns the mean accuracy on the given data.
model_name = type(model).__name__
accuracy = model.score(data_test, target_test)

print(f"The test accuracy using a {model_name} is {accuracy:.3f}")

The test accuracy using a KNeighborsClassifier is 0.807


In this notebook we:

<ul>
    <li>fitted a k-nearest neighbors model on a training dataset;</li> 
    <li>evaluated its statistical performance on the testing data;</li>
    <li>introduced the scikit-learn API: <code>.fit(X, y)</code> (to train a model), <code>.predict(X)</code> (to make predictions) and <code>.score(X, y)</code> (to evaluate a model).</li>
</ul>