<a href="https://colab.research.google.com/github/qinyunkone/AQIstudy/blob/master/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# 数据分析模块
import pandas as pd
import numpy as np

# 预处理模块
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 模型选择模块
from sklearn.model_selection import train_test_split, GridSearchCV

# 机器学习模块
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# 模型评级模块
from sklearn.metrics import roc_curve, roc_auc_score, auc

# 其他模块
import os
import warnings
warnings.filterwarnings('ignore')

In [0]:
# Merge the per-file CSV tables into a single DataFrame
path = r'/content/drive/My Drive/Colab Notebooks/AQI_maincity'
# os.listdir returns entries in arbitrary order; sort them so the row order of the
# merged table (and therefore the later train/test split) is reproducible.
# Skip entries that are not regular files (e.g. checkpoint directories), which
# would otherwise make the read fail.
csv_paths = [
    os.path.join(path, filename)
    for filename in sorted(os.listdir(path))
    if os.path.isfile(os.path.join(path, filename))
]
frames = [pd.read_csv(csv_path, encoding='utf-8') for csv_path in csv_paths]
# Concatenate once instead of growing the frame inside the loop (each in-loop
# concat copies everything accumulated so far). pd.concat raises on an empty
# list, so fall back to an empty frame, which is what the loop version produced.
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [0]:
# Handle missing values: rows whose quality-grade column equals '无' are treated
# as missing records; report how many there are, then remove them.
missing_mask = data['质量等级'] == '无'
print('存在缺失值的行数：', data.loc[missing_mask].shape[0])
data.drop(index=data.index[missing_mask], inplace=True)
data.info()

存在缺失值的行数： 286
<class 'pandas.core.frame.DataFrame'>
Int64Index: 54465 entries, 0 to 54750
Data columns (total 9 columns):
date     54465 non-null object
AQI      54465 non-null int64
质量等级     54465 non-null object
PM2.5    54465 non-null int64
PM10     54465 non-null int64
SO2      54465 non-null int64
CO       54465 non-null float64
NO2      54465 non-null int64
O3_8h    54465 non-null int64
dtypes: float64(1), int64(6), object(2)
memory usage: 4.2+ MB


In [0]:
# Save the cleaned table to disk (without the index column)
data.to_csv(path_or_buf='aqi.csv', index=False, encoding='utf-8')

In [0]:
# Reload the table saved as 'aqi.csv'
data = pd.read_csv(filepath_or_buffer='aqi.csv', encoding='utf-8')
def get_grade(aqi, threshold=100):
    """Map an AQI value to a binary class label.

    Parameters
    ----------
    aqi : number
        The AQI value to classify.
    threshold : number, optional
        Inclusive upper bound for the positive class. Defaults to 100,
        which is the cutoff this notebook originally hard-coded.

    Returns
    -------
    int
        1 if ``aqi <= threshold``, otherwise 0. A NaN input compares false
        and therefore yields 0.
    """
    return 1 if aqi <= threshold else 0
# Attach the binary label derived from AQI, print the class counts, and preview the table
data['GRADE'] = data['AQI'].map(get_grade)
print(data.groupby('GRADE').size())
data.head()

GRADE
0    15408
1    39057
dtype: int64


Unnamed: 0.1,Unnamed: 0,date,AQI,质量等级,PM2.5,PM10,SO2,CO,NO2,O3_8h,GRADE
0,0,2014-01-01,195,中度污染,147,181,63,1.7,99,61,0
1,1,2014-01-02,147,轻度污染,113,131,37,1.6,95,60,0
2,2,2014-01-03,189,中度污染,142,163,56,1.4,96,45,0
3,3,2014-01-04,151,中度污染,115,125,36,1.2,64,38,0
4,4,2014-01-05,65,良,47,60,25,1.0,63,31,1


In [0]:
# Split into training and test sets

# Select the pollutant features by name rather than by position: a positional
# slice such as iloc[:, 4:-1] silently shifts when the table gains or loses a
# leading column, which can drop PM2.5 from the features without any error.
feature_columns = ['PM2.5', 'PM10', 'SO2', 'CO', 'NO2', 'O3_8h']
X, y = data[feature_columns], data['GRADE']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

# kNN

In [0]:
# Standardize the features, then tune the number of neighbours (odd values 1..9)
# with a cross-validated grid search.
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())

param = {'kneighborsclassifier__n_neighbors': range(1, 10, 2)}

grid = GridSearchCV(estimator=pipeline, param_grid=param, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)

{'kneighborsclassifier__n_neighbors': 9}


In [0]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# fitted on the full training set; score it directly instead of fitting again.
knn = grid.best_estimator_
knn.score(X_test, y_test)

0.9746640229125358

# 逻辑回归

In [0]:
# Standardize the features, then tune the inverse regularization strength C
# of logistic regression with a cross-validated grid search.
pipeline = make_pipeline(StandardScaler(), LogisticRegression())

param = {'logisticregression__C': [0.01, 0.1, 1, 10]}

grid = GridSearchCV(estimator=pipeline, param_grid=param, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)

{'logisticregression__C': 10}


In [0]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# fitted on the full training set; score it directly instead of fitting again.
log_clf = grid.best_estimator_
log_clf.score(X_test, y_test)

0.9091576705588602

# SVM

In [0]:
# RBF-kernel SVM on standardized features; tune C and gamma with a
# cross-validated grid search.
pipeline = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
params = dict(
    svc__C=[0.1, 1, 10],
    svc__gamma=[0.01, 0.1, 1],
)
grid = GridSearchCV(estimator=pipeline, param_grid=params, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)

{'svc__C': 10, 'svc__gamma': 1}


In [0]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# fitted on the full training set; score it directly instead of fitting again.
svc = grid.best_estimator_
svc.score(X_test, y_test)

0.9884702944848351

# 决策树

In [0]:
# Decision tree: tune the maximum depth (3..9) with a cross-validated grid search.
# A fixed random_state makes the fitted tree, and therefore the selected depth
# and the reported score, reproducible across runs.
pipeline = make_pipeline(DecisionTreeClassifier(random_state=100))
params = {
    'decisiontreeclassifier__max_depth': range(3, 10),
}
grid = GridSearchCV(pipeline, params, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)

{'decisiontreeclassifier__max_depth': 6}


In [0]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# fitted on the full training set; score it directly instead of fitting again.
dt = grid.best_estimator_
dt.score(X_test, y_test)

0.9958874935742087

# 随机森林

In [0]:
# Random forest (100 trees): tune the tree depth and the number of features
# considered per split with a cross-validated grid search.
# A fixed random_state makes the bootstrap samples and feature draws, and
# therefore the selected parameters and the reported score, reproducible.
pipeline = make_pipeline(RandomForestClassifier(n_estimators=100, random_state=100))
params = {
    'randomforestclassifier__max_depth': range(3, 10),
    'randomforestclassifier__max_features': [3,4,5],
}
grid = GridSearchCV(pipeline, params, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)

{'randomforestclassifier__max_depth': 6, 'randomforestclassifier__max_features': 3}


In [0]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# fitted on the full training set; score it directly instead of fitting again.
rf = grid.best_estimator_
rf.score(X_test, y_test)

0.9960343688037012

# xgboost

In [0]:
# XGBoost: tune depth, row/column subsampling, learning rate and gamma with a
# cross-validated grid search.
# Row and column subsampling are random, so pin random_state to make the
# selected parameters and the reported score reproducible.
pipeline = make_pipeline(XGBClassifier(random_state=100))
params = {
    'xgbclassifier__max_depth': range(3, 10),
    'xgbclassifier__subsample': [0.5,0.8,1],
    'xgbclassifier__colsample_bytree': [0.5,0.8,1],
    'xgbclassifier__learning_rate': [0.01, 0.1, 0.3],
    'xgbclassifier__gamma': [0, 0.01, 0.1]
}
grid = GridSearchCV(pipeline, params, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)

{'xgbclassifier__colsample_bytree': 1, 'xgbclassifier__gamma': 0.01, 'xgbclassifier__learning_rate': 0.01, 'xgbclassifier__max_depth': 9, 'xgbclassifier__subsample': 0.5}


In [0]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# fitted on the full training set; score it directly instead of fitting again.
xgb = grid.best_estimator_
xgb.score(X_test, y_test)

0.9957406183447162