In [1]:
import pandas as pd
import statsmodels.formula.api as smf
# Read the media-use dataset from disk.
df = pd.read_csv('../datasets/mediause.csv')

# Ordinary least squares: newspaper explained by age and gender.
# The model is specified first and fitted in a separate step.
ols_spec = smf.ols(data=df, formula='newspaper ~ age + gender')
model = ols_spec.fit()

# Print the full regression table as plain text.
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              newspaper   R-squared:                       0.190
Model:                            OLS   Adj. R-squared:                  0.190
Method:                 Least Squares   F-statistic:                     244.4
Date:                Sun, 05 Jan 2020   Prob (F-statistic):           4.76e-96
Time:                        15:33:34   Log-Likelihood:                -4914.2
No. Observations:                2081   AIC:                             9834.
Df Residuals:                    2078   BIC:                             9851.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0896      0.159     -0.564      0.5

In [2]:
# Two cases to score with the fitted regression:
# row 0 -> gender=1, age=20; row 1 -> gender=0, age=40.
newdata = pd.DataFrame({'gender': [1, 0], 'age': [20, 40]})

# Last expression of the cell, so the predicted values are displayed.
model.predict(newdata)

0    1.439508
1    2.615248
dtype: float64

In [3]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Binary target: True wherever the 'internet' value is above zero.
df['uses-internet'] = df['internet'].gt(0)

# Drop every row that has a missing value in any column. This modifies
# `df` itself, so all later cells work on the reduced frame.
df.dropna(inplace=True)

print("How many people used online news at all?")
print(df['uses-internet'].value_counts())

print("\nLet's see how well we can predict this...")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[['age', 'education', 'gender']],
    df['uses-internet'],
    random_state=42,
    test_size=0.2,
)

# Gaussian naive Bayes: fit on the training rows, then label the held-out rows.
myclassifier = GaussianNB().fit(X_train, y_train)
y_pred = myclassifier.predict(X_test)

print('\nHow often do we get it right?\nConfusion matrix:')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))


How many people used online news at all?
True     1262
False     803
Name: uses-internet, dtype: int64

Let's see how well we can predict this...

How often do we get it right?
Confusion matrix:
[[ 55 106]
 [ 40 212]]
              precision    recall  f1-score   support

       False       0.58      0.34      0.43       161
        True       0.67      0.84      0.74       252

   micro avg       0.65      0.65      0.65       413
   macro avg       0.62      0.59      0.59       413
weighted avg       0.63      0.65      0.62       413



In [4]:
from sklearn.linear_model import LogisticRegression
# Logistic regression (lbfgs solver) on the existing train/test split.
# fit() returns the estimator, so construction and training are chained.
myclassifier = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = myclassifier.predict(X_test)

print('\nHow often do we get it right?\nConfusion matrix:')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))



How often do we get it right?
Confusion matrix:
[[ 59 102]
 [ 42 210]]
              precision    recall  f1-score   support

       False       0.58      0.37      0.45       161
        True       0.67      0.83      0.74       252

   micro avg       0.65      0.65      0.65       413
   macro avg       0.63      0.60      0.60       413
weighted avg       0.64      0.65      0.63       413



In [5]:
from sklearn.svm import SVC
# Support-vector classifier with gamma='scale' on the existing train/test split.
# fit() returns the estimator, so construction and training are chained.
myclassifier = SVC(gamma='scale').fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = myclassifier.predict(X_test)

print('\nHow often do we get it right?\nConfusion matrix:')
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))



How often do we get it right?
Confusion matrix:
[[ 22 139]
 [ 22 230]]
              precision    recall  f1-score   support

       False       0.50      0.14      0.21       161
        True       0.62      0.91      0.74       252

   micro avg       0.61      0.61      0.61       413
   macro avg       0.56      0.52      0.48       413
weighted avg       0.58      0.61      0.54       413



In [6]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees on the existing train/test split.
# random_state is pinned (same seed as the train/test split) because the
# forest is built with randomness; without a seed the confusion matrix and
# report below change on every re-run of the notebook.
myclassifier = RandomForestClassifier(n_estimators=100, random_state=42)
myclassifier.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = myclassifier.predict(X_test)

print('\nHow often do we get it right?\nConfusion matrix:')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



How often do we get it right?
Confusion matrix:
[[ 64  97]
 [ 77 175]]
              precision    recall  f1-score   support

       False       0.45      0.40      0.42       161
        True       0.64      0.69      0.67       252

   micro avg       0.58      0.58      0.58       413
   macro avg       0.55      0.55      0.55       413
weighted avg       0.57      0.58      0.57       413

