In [17]:
import pandas as pd

df = pd.read_csv('data/网站点击预测.csv')

from sklearn import preprocessing

le = preprocessing.LabelEncoder()

df['Gender'] = le.fit_transform(df['Gender'])

df['City'] = le.fit_transform(df['City'])

df['Country'] = le.fit_transform(df['Country'])


In [19]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X = df.drop(['Ad Topic Line', 'Timestamp', 'Clicked on Ad'], axis=1)

y = df['Clicked on Ad']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

lr = LogisticRegression(fit_intercept=True, dual=False, C=1.0).fit(X_train, y_train)

lr_pred = lr.predict(X_test)

df = DecisionTreeClassifier(random_state=0, max_depth=6, min_samples_split=2).fit(X_train, y_train)

df_pred = df.predict(X_test)



In [20]:
from sklearn.metrics import accuracy_score

lr_mse = accuracy_score(y_test, lr_pred)

df_mse = accuracy_score(y_test, df_pred)

print(lr_mse, df_mse)

0.6975757575757576 0.7584848484848485


In [21]:
from sklearn.model_selection import KFold, GridSearchCV

lr_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2'], 'solver': ['newton-cg', 'sag']}

df_param_grid = {'max_depth': [3, 4, 5], 'min_samples_split': [2, 3, 4], 'splitter': ['best', 'random']}

kf = KFold(n_splits=2, random_state=None, shuffle=False)

lr_cv = GridSearchCV(estimator=lr, param_grid=lr_param_grid, cv=kf, scoring='neg_root_mean_squared_error')

df_cv = GridSearchCV(estimator=df, param_grid=df_param_grid, cv=kf, scoring='neg_root_mean_squared_error')

lr_cv.fit(X_train, y_train)

df_cv.fit(X_train, y_train)

best_lr_model = lr_cv.best_estimator_

best_df_model = df_cv.best_estimator_

best_lr_parm = lr_cv.best_params_

best_df_parm = df_cv.best_params_

28 fits failed out of a total of 56.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
  File "D:\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalt

In [22]:
print(best_lr_parm)

print(best_df_parm)

print(best_lr_model.coef_)

print(best_df_model.feature_importances_)

{'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
{'max_depth': 5, 'min_samples_split': 2, 'splitter': 'best'}
[[ 9.25455331e-03  1.26979360e-01 -8.19658113e-06 -4.80622471e-03
   1.81268376e-03 -2.93821406e-01 -1.38865578e-03]]
[0.00244994 0.73393094 0.06503601 0.03293747 0.09829095 0.
 0.0673547 ]


In [15]:
from joblib import dump

dump(best_lr_model, 'lr_model.joblib')

dump(best_df_model, 'df_model.joblib')

['df_model.joblib']