In [1]:
import pandas as pd
import statsmodels.formula.api as smf
df = pd.read_csv('http://cssbook.net/d/mediause.csv')
model = smf.ols(formula = 'newspaper ~ age + gender', data = df).fit()
# model.summary() would give a lot more info, but we only care about the coefficients:
model.params

Intercept   -0.089560
age          0.067620
gender       0.176665
dtype: float64

In [2]:
newdata = pd.DataFrame([{'gender':1, 'age':20}, {'gender': 0, 'age':40} ])
model.predict(newdata)

0    1.439508
1    2.615248
dtype: float64

In [3]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('http://cssbook.net/d/mediause.csv')

df['uses-internet'] = (df['internet']>0).replace({True:'user', False:'non-user'})
df.dropna(inplace=True)
print("How many people used online news at all?")
print(df['uses-internet'].value_counts())

X_train, X_test, y_train, y_test = train_test_split(df[['age', 'education', 'gender']], df['uses-internet'], test_size=0.2, random_state=42)

print(f'We have {len(X_train)} training and {len(X_test)} test cases.')

How many people used online news at all?
user        1262
non-user     803
Name: uses-internet, dtype: int64
We have 1652 training and 413 test cases.


In [4]:
from sklearn.naive_bayes import GaussianNB


myclassifier = GaussianNB()
myclassifier.fit(X_train, y_train)

y_pred = myclassifier.predict(X_test)

In [5]:
from sklearn.metrics import confusion_matrix, classification_report

print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Confusion matrix:
[[ 55 106]
 [ 40 212]]
              precision    recall  f1-score   support

    non-user       0.58      0.34      0.43       161
        user       0.67      0.84      0.74       252

    accuracy                           0.65       413
   macro avg       0.62      0.59      0.59       413
weighted avg       0.63      0.65      0.62       413



In [6]:
from sklearn.linear_model import LogisticRegression
myclassifier = LogisticRegression(solver='lbfgs')
myclassifier.fit(X_train, y_train)

y_pred = myclassifier.predict(X_test)

In [7]:
from sklearn.svm import SVC
from sklearn import preprocessing

# !!! We normalize our features to have M = 0 and SD = 1
# This is necessary as our features are not measured on the same scale, which SVM requires
# It may also be OK to rescale to a range of [0:1] or [-1:1]

scaler = preprocessing.StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

myclassifier = SVC(gamma='scale')
myclassifier.fit(X_train_scaled, y_train)

y_pred = myclassifier.predict(X_test_scaled)

In [8]:
from sklearn.ensemble import RandomForestClassifier
myclassifier = RandomForestClassifier(n_estimators=100)
myclassifier.fit(X_train, y_train)

y_pred = myclassifier.predict(X_test)


In [11]:
from sklearn.metrics import roc_curve
import numpy as np
myclassifier = LogisticRegression(solver='lbfgs')
myclassifier.fit(X_train, y_train)

print('With default cutoff point (.5):')
y_pred = myclassifier.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# get all predicted probabilities
predprobs = myclassifier.predict_log_proba(X_test)

fpr,tpr, thresholds = roc_curve(y_test, predprobs[:,1], pos_label='user')

# You can uncomment the following lines to print a table,
# determine a False Positive/True Positive combination
# you like, and take that cutoff from the third column
# and the threshold (which is its logarithm) from the
# forth column

#print("False Positive Rate\tTrue Positive Rate\tCutoff\tThreshold")
#for f, t, th in zip(fpr,tpr, thresholds):
#    print('{}\t{}\t{}'.format(f,t,np.exp(th)),th)

# or, choose the cutoff point where the difference between 
# False Positive Rate and True Positive Rate is maximal

optimal_threshold = thresholds[np.argmax(tpr-fpr)]

print(f"\nWith the optimal probability threshold is {optimal_threshold}, which is equivalent to a cutoff of {np.exp(optimal_threshold)}, we get:")
y_pred_alternative = np.where(predprobs[:,1] > optimal_threshold, 'user', 'non-user')
print(classification_report(y_test, y_pred_alternative))
print(confusion_matrix(y_test, y_pred_alternative))

With default cutoff point (.5):
              precision    recall  f1-score   support

    non-user       0.58      0.37      0.45       161
        user       0.67      0.83      0.74       252

    accuracy                           0.65       413
   macro avg       0.63      0.60      0.60       413
weighted avg       0.64      0.65      0.63       413

[[ 59 102]
 [ 42 210]]

With the optimal probability threshold is -0.3880564601305959, which is equivalent to a cutoff of 0.6783740410958884, we get:
              precision    recall  f1-score   support

    non-user       0.50      0.80      0.61       161
        user       0.79      0.49      0.61       252

    accuracy                           0.61       413
   macro avg       0.64      0.64      0.61       413
weighted avg       0.68      0.61      0.61       413

[[128  33]
 [128 124]]


In [12]:
from sklearn.model_selection import cross_val_score
myclassifier = LogisticRegression(solver='lbfgs')
accuracy = cross_val_score(estimator=myclassifier, X=X_train, y=y_train, scoring='accuracy', cv=5)
print(accuracy)
print(f"M = {accuracy.mean():.2f}, SD = {accuracy.std():.3f}")

[0.64652568 0.64048338 0.62727273 0.64242424 0.63636364]
M = 0.64, SD = 0.007


In [23]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import cohen_kappa_score, make_scorer, f1_score
f1scores = cross_val_score(estimator=myclassifier, X=X_train, y=y_train, scoring=make_scorer(cohen_kappa_score), cv=5)
print(f1scores)
print(f"M = {f1scores.mean():.2f}, SD = {f1scores.std():.3f}")


[0.09304265 0.13409553 0.06261019 0.05477517 0.07530192]
M = 0.08, SD = 0.028


In [25]:
from sklearn.model_selection import GridSearchCV

myclassifier = RandomForestClassifier()
f1_scorer = make_scorer(f1_score, pos_label="user")

grid = {
    'n_estimators' : [10, 50, 100, 200], 
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    }
search = GridSearchCV(estimator=myclassifier,
                     param_grid=grid,
                     scoring=f1_scorer,
                     cv=5)
search.fit(X_train, y_train)
print(f'Using these hyperparameters {search.best_params_}, we get the best performance:')
print(classification_report(y_test, search.predict(X_test)))

Using these hyperparameters {'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 50}, we get the best performance:
              precision    recall  f1-score   support

    non-user       0.41      0.37      0.39       161
        user       0.62      0.66      0.64       252

    accuracy                           0.54       413
   macro avg       0.51      0.51      0.51       413
weighted avg       0.54      0.54      0.54       413



In [26]:
myclassifier = SVC(gamma='scale')

grid = {
    'C' : [100, 1e4], 
    'kernel': ['linear','rbf', 'poly'],
    'degree': [3,4]
}

search = GridSearchCV(estimator=myclassifier,
                      param_grid=grid,
                      scoring=f1_scorer,
                      cv=5,
                      n_jobs=-1,  # use all cpus
                      verbose=10)
search.fit(X_train_scaled, y_train)
print(f'Using these hyperparameters {search.best_params_}, we get the best performance:')
print(classification_report(y_test, search.predict(X_test_scaled)))

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   26.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  8.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  8.9min finished


Using these hyperparameters {'C': 100, 'degree': 3, 'kernel': 'poly'}, we get the best performance:
              precision    recall  f1-score   support

    non-user       0.58      0.04      0.08       161
        user       0.62      0.98      0.76       252

    accuracy                           0.62       413
   macro avg       0.60      0.51      0.42       413
weighted avg       0.60      0.62      0.49       413

