In [1]:
import pandas as pd
import statsmodels.formula.api as smf
# Load mediause.csv and regress 'newspaper' on 'age' and 'gender' with OLS.
df = pd.read_csv('http://cssbook.net/d/mediause.csv')
ols_spec = smf.ols('newspaper ~ age + gender', data=df)
model = ols_spec.fit()
# Only the fitted coefficients are displayed; model.summary() holds the full table.
model.params

Intercept   -0.089560
age          0.067620
gender       0.176665
dtype: float64

In [2]:
# Two hypothetical respondents, written column-wise:
# row 0 is gender=1, age=20 and row 1 is gender=0, age=40.
newdata = pd.DataFrame({'gender': [1, 0], 'age': [20, 40]})
model.predict(newdata)

0    1.439508
1    2.615248
dtype: float64

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Reload the data and derive a boolean target: True where 'internet' is above zero.
df = pd.read_csv('http://cssbook.net/d/mediause.csv')
df['uses-internet'] = df['internet'].gt(0)
# Discard every row that has a missing value in any column.
df = df.dropna()

print("How many people used online news at all?")
print(df['uses-internet'].value_counts())

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df[['age', 'education', 'gender']],
    df['uses-internet'],
    test_size=0.2,
    random_state=42,
)

print(f'We have {len(X_train)} training and {len(X_test)} test cases.')

How many people used online news at all?
True     1262
False     803
Name: uses-internet, dtype: int64
We have 1652 training and 413 test cases.


In [4]:
from sklearn.naive_bayes import GaussianNB


# Gaussian naive Bayes: train on the training split, then label the held-out rows.
# fit() returns the estimator itself, so construction and training can be chained.
myclassifier = GaussianNB().fit(X_train, y_train)
y_pred = myclassifier.predict(X_test)

In [11]:
from sklearn.metrics import confusion_matrix, classification_report

# Show the confusion matrix followed by the per-class precision/recall/F1 report
# for the most recent predictions in y_pred.
print(
    'Confusion matrix:',
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)


Confusion matrix:
[[ 63  98]
 [ 76 176]]
              precision    recall  f1-score   support

       False       0.45      0.39      0.42       161
        True       0.64      0.70      0.67       252

   micro avg       0.58      0.58      0.58       413
   macro avg       0.55      0.54      0.54       413
weighted avg       0.57      0.58      0.57       413



In [6]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the lbfgs solver, trained on the training split
# and used to label the held-out rows.
myclassifier = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
y_pred = myclassifier.predict(X_test)

In [12]:
from sklearn.svm import SVC
from sklearn import preprocessing

# Standardize the features to mean 0 and SD 1 before fitting the SVM: the columns
# are not measured on the same scale, and an SVM needs comparable scales.
# Rescaling to a range of [0, 1] or [-1, 1] may also be acceptable.
# The scaler is fitted on the training split only and then applied unchanged
# to the test split.
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

myclassifier = SVC(gamma='scale').fit(X_train_scaled, y_train)
y_pred = myclassifier.predict(X_test_scaled)

  return self.partial_fit(X, y)
  
  import sys


In [8]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees. Building the forest involves randomness, so the
# seed is pinned (same value as the train/test split) to make y_pred and any
# later scores computed with this estimator reproducible across runs.
myclassifier = RandomForestClassifier(n_estimators=100, random_state=42)
myclassifier.fit(X_train, y_train)

y_pred = myclassifier.predict(X_test)


In [9]:
from sklearn.model_selection import cross_val_score
# Five-fold cross-validation of the current classifier on the training split,
# yielding one F1 score per fold.
f1scores = cross_val_score(myclassifier, X_train, y_train, cv=5, scoring='f1')
print(f1scores)
# Mean and standard deviation of the fold scores.
print(f"M = {f1scores.mean():.2f}, SD = {f1scores.std():.3f}")

[0.69158879 0.66502463 0.65024631 0.65859564 0.65060241]
M = 0.66, SD = 0.015


In [13]:
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. The seed is pinned so that the selected
# hyperparameters and the final report do not change from run to run.
myclassifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Candidate settings; every combination is scored with 5-fold cross-validation
# on the training split. The n_estimators values here replace the default above.
grid = {
    'n_estimators': [10, 50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}
search = GridSearchCV(estimator=myclassifier,
                      param_grid=grid,
                      scoring='f1',
                      cv=5)
search.fit(X_train, y_train)
print(f'Using these hyperparameters {search.best_params_}, we get the best performance:')
# Evaluate the refitted best model on the held-out test split.
print(classification_report(y_test, search.predict(X_test)))

Using these hyperparameters {'bootstrap': True, 'criterion': 'gini', 'n_estimators': 10}, we get the best performance:
              precision    recall  f1-score   support

       False       0.43      0.40      0.41       161
        True       0.63      0.66      0.64       252

   micro avg       0.56      0.56      0.56       413
   macro avg       0.53      0.53      0.53       413
weighted avg       0.55      0.56      0.55       413



In [None]:
from sklearn.model_selection import GridSearchCV

myclassifier = SVC(gamma='scale')

# 'degree' is only used by the polynomial kernel. Crossing it with 'linear' and
# 'rbf' would refit identical models three times each, so the grid is split into
# two sub-grids: 6 candidates without 'degree' plus 9 polynomial ones (15 instead
# of 27). For a linear or rbf winner, best_params_ therefore has no 'degree' key.
grid = [
    {
        'C': [100, 1e4, 1e6],
        'kernel': ['linear', 'rbf'],
    },
    {
        'C': [100, 1e4, 1e6],
        'kernel': ['poly'],
        'degree': [2, 3, 4],
    },
]
search = GridSearchCV(estimator=myclassifier,
                      param_grid=grid,
                      scoring='f1',
                      cv=5)
# The search runs on the standardized features, as required for the SVM.
search.fit(X_train_scaled, y_train)
print(f'Using these hyperparameters {search.best_params_}, we get the best performance:')
# Evaluate the refitted best model on the (equally scaled) held-out test split.
print(classification_report(y_test, search.predict(X_test_scaled)))