In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [106]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_validate, StratifiedKFold

In [5]:
df = pd.read_csv("DatasetAfterImputation.csv")
df["Time"] = pd.to_datetime(df["Time"])

In [6]:
df.head()

Unnamed: 0,Time,O3_flag,SO2_flag,NO2_flag,NO_flag,CO_flag,PM10_flag,PM2.5_flag,WDir_Avg,Rain_Tot,...,CO,PM10,PM2.5,Temp_Avg,RH_Avg,WSpeed_Avg,WSpeed_Max,Press_Avg,Rad_Avg,City
0,2015-01-01,OK,BDL,OK,OK,OK,OK,OK,254.0,0.48,...,0.417174,13.639045,15.658178,11.886855,49.923187,2.578788,3.39289,992.131944,161.775,Hermosillo
1,2015-01-01,OK,BDL,OK,BDL,OK,OK,OK,6.0,0.0,...,0.364306,25.296419,14.544301,15.523583,63.94309,2.326524,2.288214,809.352778,182.410231,Juriquilla
2,2015-01-02,OK,BDL,OK,BDL,OK,OK,OK,208.0,0.0,...,0.32434,21.233012,8.838785,17.286563,48.132736,2.749521,3.573296,807.790278,198.172784,Juriquilla
3,2015-01-02,OK,BDL,OK,OK,OK,OK,OK,245.0,0.0,...,0.647993,23.259388,13.146257,8.058157,47.275896,1.915147,2.608696,993.369444,178.987361,Hermosillo
4,2015-01-03,OK,BDL,OK,OK,OK,OK,OK,92.0,0.0,...,0.927549,37.48904,14.884469,10.231367,41.393625,1.528176,2.277276,996.759028,181.04875,Hermosillo


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17659 entries, 0 to 17658
Data columns (total 25 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Time        17659 non-null  datetime64[ns]
 1   O3_flag     17659 non-null  object        
 2   SO2_flag    17659 non-null  object        
 3   NO2_flag    17659 non-null  object        
 4   NO_flag     17659 non-null  object        
 5   CO_flag     17659 non-null  object        
 6   PM10_flag   17659 non-null  object        
 7   PM2.5_flag  17659 non-null  object        
 8   WDir_Avg    17659 non-null  float64       
 9   Rain_Tot    17659 non-null  float64       
 10  WDir_SD     17659 non-null  float64       
 11  O3          17659 non-null  float64       
 12  SO2         17659 non-null  float64       
 13  NO2         17659 non-null  float64       
 14  NO          17659 non-null  float64       
 15  CO          17659 non-null  float64       
 16  PM10        17659 non-

In [24]:
flag = [c for c in df.columns if "flag" in c]
X_flag = df[flag]

In [36]:
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X_flag)
encoded_cols = encoder.get_feature_names_out(flag)

In [37]:
df_encoded = pd.DataFrame(X_encoded, columns=encoded_cols, index=df.index)
df = df.drop(columns=flag)
df = pd.concat([df, df_encoded], axis=1)

In [47]:
X, Y = df.drop(columns=["City", "Time"]), df["City"]

In [48]:
ss = StandardScaler()
X_ss = ss.fit_transform(X.values)

In [50]:
ls = LabelEncoder()
Y_encoded = ls.fit_transform(Y)

In [96]:
def f1_multiclase(Ytest, Ypred):
    return f1_score(Ytest, Ypred, average='weighted')

In [101]:
f1_w = make_scorer(f1_multiclase)
scoring = {'accuracy':'accuracy', "f1_w":f1_w}
skf = StratifiedKFold(n_splits=10)

In [102]:
knn = KNeighborsClassifier(n_neighbors=6, metric="manhattan")
model1 = cross_validate(knn, X_ss, Y_encoded, scoring=scoring, cv=10, n_jobs=-1)

In [103]:
model1["test_accuracy"].mean()

np.float64(0.7916565981924869)

In [105]:
model1["test_f1_w"].mean()

np.float64(0.7906501140961544)

In [134]:
tree = DecisionTreeClassifier(criterion="gini", max_depth=9, min_samples_split=10)
model2 = cross_validate(tree, X, Y, scoring=scoring, cv=10, n_jobs=-1)

In [135]:
model2["test_accuracy"].mean()

np.float64(0.9208314752373283)

In [136]:
model2["test_f1_w"].mean()

np.float64(0.9214983044114511)

In [143]:
forest = RandomForestClassifier(n_estimators=150, criterion="gini", max_depth=10, min_samples_split=10)
model3 = cross_validate(forest, X, Y, scoring=scoring, cv=10, n_jobs=-1)

In [144]:
model3["test_accuracy"].mean()

np.float64(0.9301760352134592)

In [145]:
model3["test_f1_w"].mean()

np.float64(0.9306672667398292)

array([ 9.87723613, 10.01993632,  9.81094623, 10.0027864 ,  9.99418426,
        9.9878068 ,  9.86263704, 10.04968333,  9.95847034,  9.94174933])