In [55]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve
from sklearn.linear_model import SGDClassifier

In [56]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
df = pd.read_csv("/content/drive/MyDrive/waterQuality/dataset.csv")

In [58]:
print((df.count(), len(df.columns)))

(Index                     5956842
pH                        5840788
Iron                      5917089
Nitrate                   5851117
Chloride                  5781311
Lead                      5929933
Zinc                      5800716
Color                     5951103
Turbidity                 5907027
Fluoride                  5767686
Copper                    5757440
Odor                      5777951
Sulfate                   5759424
Conductivity              5792981
Chlorine                  5899017
Manganese                 5847259
Total Dissolved Solids    5955172
Source                    5868580
Water Temperature         5788609
Air Temperature           5927114
Month                     5861174
Day                       5857239
Time of Day               5842323
Target                    5956842
dtype: int64, 24)


In [59]:
# pd.set_option('display.max_columns', None)

In [60]:
# df.head(1)

In [61]:
# df.describe()

In [62]:
# # Фильтруйте строки, где Target равно 1
# filtered_df = df[df['Target'] == 1]

# # Выведите первые 5 строк
# filtered_df.head(10)


In [63]:
# df.head(15)

In [64]:
# for t in df.dtypes:
#     print(t)

In [65]:
df.isna().sum()

Index                          0
pH                        116054
Iron                       39753
Nitrate                   105725
Chloride                  175531
Lead                       26909
Zinc                      156126
Color                       5739
Turbidity                  49815
Fluoride                  189156
Copper                    199402
Odor                      178891
Sulfate                   197418
Conductivity              163861
Chlorine                   57825
Manganese                 109583
Total Dissolved Solids      1670
Source                     88262
Water Temperature         168233
Air Temperature            29728
Month                      95668
Day                        99603
Time of Day               114519
Target                         0
dtype: int64

In [66]:
print('Уникальные значения для Source')
print(df['Source'].unique())
print('Уникальные значения для Month')
print(df['Month'].unique())
print('Уникальные значения для Color')
print(df['Color'].unique())

Уникальные значения для Source
[nan 'Lake' 'River' 'Ground' 'Spring' 'Stream' 'Aquifer' 'Reservoir'
 'Well']
Уникальные значения для Month
['January' 'November' 'April' 'June' 'March' 'September' 'May' 'July'
 'August' 'October' 'December' 'February' nan]
Уникальные значения для Color
['Colorless' 'Faint Yellow' 'Light Yellow' 'Near Colorless' 'Yellow' nan]


In [67]:
columns_to_fill = ['Color', 'Month', 'Source']
df[columns_to_fill] = df[columns_to_fill].fillna('NA')

In [68]:
df.isna().sum()

Index                          0
pH                        116054
Iron                       39753
Nitrate                   105725
Chloride                  175531
Lead                       26909
Zinc                      156126
Color                          0
Turbidity                  49815
Fluoride                  189156
Copper                    199402
Odor                      178891
Sulfate                   197418
Conductivity              163861
Chlorine                   57825
Manganese                 109583
Total Dissolved Solids      1670
Source                         0
Water Temperature         168233
Air Temperature            29728
Month                          0
Day                        99603
Time of Day               114519
Target                         0
dtype: int64

In [69]:
print('Уникальные значения для Source')
print(df['Source'].unique())
print('Уникальные значения для Month')
print(df['Month'].unique())
print('Уникальные значения для Color')
print(df['Color'].unique())

Уникальные значения для Source
['NA' 'Lake' 'River' 'Ground' 'Spring' 'Stream' 'Aquifer' 'Reservoir'
 'Well']
Уникальные значения для Month
['January' 'November' 'April' 'June' 'March' 'September' 'May' 'July'
 'August' 'October' 'December' 'February' 'NA']
Уникальные значения для Color
['Colorless' 'Faint Yellow' 'Light Yellow' 'Near Colorless' 'Yellow' 'NA']


In [70]:
# df['ColorInd']=pd.factorize(df['Color'])[0]

In [71]:
# df['MonthInd']=pd.factorize(df['Month'])[0]

In [72]:
# df['SourceInd']=pd.factorize(df['Source'])[0]

In [73]:
color_labels, color_uniques = pd.factorize(df['Color'])
month_labels, month_uniques = pd.factorize(df['Month'])
source_labels, source_uniques = pd.factorize(df['Source'])

df['ColorInd'] = color_labels
df['MonthInd'] = month_labels
df['SourceInd'] = source_labels

print("Color mappings:")
for i, color in enumerate(color_uniques):
    print(f"{color}: {i}")

print("\nMonth mappings:")
for i, month in enumerate(month_uniques):
    print(f"{month}: {i}")

print("\nSource mappings:")
for i, source in enumerate(source_uniques):
    print(f"{source}: {i}")

Color mappings:
Colorless: 0
Faint Yellow: 1
Light Yellow: 2
Near Colorless: 3
Yellow: 4
NA: 5

Month mappings:
January: 0
November: 1
April: 2
June: 3
March: 4
September: 5
May: 6
July: 7
August: 8
October: 9
December: 10
February: 11
NA: 12

Source mappings:
NA: 0
Lake: 1
River: 2
Ground: 3
Spring: 4
Stream: 5
Aquifer: 6
Reservoir: 7
Well: 8


In [74]:
df.isna().sum()

Index                          0
pH                        116054
Iron                       39753
Nitrate                   105725
Chloride                  175531
Lead                       26909
Zinc                      156126
Color                          0
Turbidity                  49815
Fluoride                  189156
Copper                    199402
Odor                      178891
Sulfate                   197418
Conductivity              163861
Chlorine                   57825
Manganese                 109583
Total Dissolved Solids      1670
Source                         0
Water Temperature         168233
Air Temperature            29728
Month                          0
Day                        99603
Time of Day               114519
Target                         0
ColorInd                       0
MonthInd                       0
SourceInd                      0
dtype: int64

In [75]:
print('Уникальные значения для Source')
print(df['SourceInd'].unique())
print('Уникальные значения для Month')
print(df['MonthInd'].unique())
print('Уникальные значения для Color')
print(df['ColorInd'].unique())

Уникальные значения для Source
[0 1 2 3 4 5 6 7 8]
Уникальные значения для Month
[ 0  1  2  3  4  5  6  7  8  9 10 11 12]
Уникальные значения для Color
[0 1 2 3 4 5]


In [76]:
df.head(3)

Unnamed: 0,Index,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,Copper,Odor,Sulfate,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target,ColorInd,MonthInd,SourceInd
0,0,8.332988,8.3e-05,8.605777,122.799772,3.713298e-52,3.434827,Colorless,0.022683,0.607283,0.144599,1.626212,87.266538,471.683357,3.708178,2.269945e-15,332.118789,,,43.493324,January,29.0,4.0,0,0,0,0
1,1,6.917863,8.1e-05,3.734167,227.029851,7.849261999999999e-94,1.245317,Faint Yellow,0.019007,0.622874,0.437835,1.686049,144.010981,432.844908,3.292038,8.024076e-07,284.641984,Lake,15.348981,71.220586,November,26.0,16.0,0,1,1,1
2,2,5.443762,0.020106,3.816994,230.99563,5.2866160000000004e-76,0.52828,Light Yellow,0.319956,0.423423,0.431588,3.414619,275.702107,990.201209,3.560224,0.07007989,570.054094,River,11.643467,44.89133,January,31.0,8.0,0,2,0,2


In [77]:
df.drop(['Color', 'Source', 'Month'], axis=1, inplace=True)

In [78]:
df.isna().sum()

Index                          0
pH                        116054
Iron                       39753
Nitrate                   105725
Chloride                  175531
Lead                       26909
Zinc                      156126
Turbidity                  49815
Fluoride                  189156
Copper                    199402
Odor                      178891
Sulfate                   197418
Conductivity              163861
Chlorine                   57825
Manganese                 109583
Total Dissolved Solids      1670
Water Temperature         168233
Air Temperature            29728
Day                        99603
Time of Day               114519
Target                         0
ColorInd                       0
MonthInd                       0
SourceInd                      0
dtype: int64

In [79]:
quality_param=['pH','Iron','Nitrate','Chloride','Lead','Zinc','Turbidity',
               'Fluoride','Copper','Odor','Sulfate','Conductivity','Chlorine','Manganese',
               'Total Dissolved Solids','Water Temperature','Air Temperature','Day','Time of Day']

In [80]:
means = df[quality_param].mean()

In [81]:
means

pH                          7.445373
Iron                        0.127903
Nitrate                     6.169970
Chloride                  184.296981
Lead                        0.001498
Zinc                        1.550255
Turbidity                   0.521509
Fluoride                    0.964431
Copper                      0.516122
Odor                        1.803459
Sulfate                   146.076443
Conductivity              424.997435
Chlorine                    3.255738
Manganese                   0.109280
Total Dissolved Solids    267.145372
Water Temperature          19.129818
Air Temperature            60.003239
Day                        15.732267
Time of Day                11.504039
dtype: float64

In [82]:
means_param = np.asarray(means, dtype=float)
means_param[17]=round(means[17])
means_param[18]=round(means[18])

In [83]:
means

pH                          7.445373
Iron                        0.127903
Nitrate                     6.169970
Chloride                  184.296981
Lead                        0.001498
Zinc                        1.550255
Turbidity                   0.521509
Fluoride                    0.964431
Copper                      0.516122
Odor                        1.803459
Sulfate                   146.076443
Conductivity              424.997435
Chlorine                    3.255738
Manganese                   0.109280
Total Dissolved Solids    267.145372
Water Temperature          19.129818
Air Temperature            60.003239
Day                        16.000000
Time of Day                12.000000
dtype: float64

In [84]:
dictionary = dict(zip(quality_param, means_param))
print(dictionary)

{'pH': 7.445372942729366, 'Iron': 0.12790273707725777, 'Nitrate': 6.169969758594454, 'Chloride': 184.29698054263625, 'Lead': 0.0014983363203908506, 'Zinc': 1.5502546429661368, 'Turbidity': 0.5215093464819991, 'Fluoride': 0.9644314619476823, 'Copper': 0.5161215611529842, 'Odor': 1.8034592878352502, 'Sulfate': 146.0764427297663, 'Conductivity': 424.99743453877204, 'Chlorine': 3.255737601918627, 'Manganese': 0.10928017299168037, 'Total Dissolved Solids': 267.14537204279446, 'Water Temperature': 19.129817711866433, 'Air Temperature': 60.00323948308941, 'Day': 16.0, 'Time of Day': 12.0}


In [85]:
df[quality_param]=df[quality_param].fillna(dictionary)

In [86]:
df.isna().sum()

Index                     0
pH                        0
Iron                      0
Nitrate                   0
Chloride                  0
Lead                      0
Zinc                      0
Turbidity                 0
Fluoride                  0
Copper                    0
Odor                      0
Sulfate                   0
Conductivity              0
Chlorine                  0
Manganese                 0
Total Dissolved Solids    0
Water Temperature         0
Air Temperature           0
Day                       0
Time of Day               0
Target                    0
ColorInd                  0
MonthInd                  0
SourceInd                 0
dtype: int64

In [87]:
df.dtypes

Index                       int64
pH                        float64
Iron                      float64
Nitrate                   float64
Chloride                  float64
Lead                      float64
Zinc                      float64
Turbidity                 float64
Fluoride                  float64
Copper                    float64
Odor                      float64
Sulfate                   float64
Conductivity              float64
Chlorine                  float64
Manganese                 float64
Total Dissolved Solids    float64
Water Temperature         float64
Air Temperature           float64
Day                       float64
Time of Day               float64
Target                      int64
ColorInd                    int64
MonthInd                    int64
SourceInd                   int64
dtype: object

In [88]:
df.drop(['Index'], axis=1, inplace=True)

In [89]:
df.drop(['MonthInd'], axis=1, inplace=True)
df.drop(['Time of Day'], axis=1, inplace=True)
df.drop(['Day'], axis=1, inplace=True)

In [90]:
df.dtypes

pH                        float64
Iron                      float64
Nitrate                   float64
Chloride                  float64
Lead                      float64
Zinc                      float64
Turbidity                 float64
Fluoride                  float64
Copper                    float64
Odor                      float64
Sulfate                   float64
Conductivity              float64
Chlorine                  float64
Manganese                 float64
Total Dissolved Solids    float64
Water Temperature         float64
Air Temperature           float64
Target                      int64
ColorInd                    int64
SourceInd                   int64
dtype: object

In [91]:
df.head(1)

Unnamed: 0,pH,Iron,Nitrate,Chloride,Lead,Zinc,Turbidity,Fluoride,Copper,Odor,Sulfate,Conductivity,Chlorine,Manganese,Total Dissolved Solids,Water Temperature,Air Temperature,Target,ColorInd,SourceInd
0,8.332988,8.3e-05,8.605777,122.799772,3.713298e-52,3.434827,0.022683,0.607283,0.144599,1.626212,87.266538,471.683357,3.708178,2.269945e-15,332.118789,19.129818,43.493324,0,0,0


In [92]:
#df.to_csv("Cleardataset.csv", index=False)

In [93]:
X = df.drop(columns=['Target'])  # Признаки
y = df['Target']

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Рандомный лес

In [95]:
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.model_selection import GridSearchCV

# model = RandomForestClassifier(n_estimators=30, max_depth=10)


In [96]:
# import time
# start = time.time()


# model.fit(X_train, y_train)


# end = time.time()

# print("The time of execution of above program is :",
#       (end-start), "s")

In [97]:
# y_pred = model.predict(X_test)

In [98]:
# print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
# print(f'Precision: {precision_score(y_test, y_pred)}')
# print(f'Recall: {recall_score(y_test, y_pred)}')
# print(classification_report(y_test, y_pred))

In [99]:
# import joblib

# file_path = "/content/drive/MyDrive/waterQuality/model.joblib"

# joblib.dump(model, file_path)



In [100]:
# model2 = RandomForestClassifier(n_estimators=40, max_depth=14)


In [101]:
# start = time.time()


# model2.fit(X_train, y_train)


# end = time.time()

# print("The time of execution of above program is :",
#       (end-start), "s")

In [102]:
# y_pred2 = model2.predict(X_test)

In [103]:
# print(f'Accuracy: {accuracy_score(y_test, y_pred2)}')
# print(f'Precision: {precision_score(y_test, y_pred2)}')
# print(f'Recall: {recall_score(y_test, y_pred2)}')
# print(classification_report(y_test, y_pred2))

In [104]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import time

model3 = RandomForestClassifier(n_estimators=50, max_depth=18)


In [105]:
start = time.time()


model3.fit(X_train, y_train)


end = time.time()

print("The time of execution of above program is :",
      (end-start), "s")

The time of execution of above program is : 2784.3662436008453 s


In [106]:
y_pred3 = model3.predict(X_test)

In [107]:
print(f'Accuracy: {accuracy_score(y_test, y_pred3)}')
print(f'Precision: {precision_score(y_test, y_pred3)}')
print(f'Recall: {recall_score(y_test, y_pred3)}')
print(classification_report(y_test, y_pred3))

Accuracy: 0.8821114986516908
Precision: 0.7280364487091312
Recall: 0.9745835251400883
              precision    recall  f1-score   support

           0       0.99      0.84      0.91   1246144
           1       0.73      0.97      0.83    540909

    accuracy                           0.88   1787053
   macro avg       0.86      0.91      0.87   1787053
weighted avg       0.91      0.88      0.89   1787053



In [108]:
estimator = model3.estimators_[1]



In [109]:
from sklearn.tree import export_graphviz
import graphviz


In [122]:
X_train.columns

Index(['pH', 'Iron', 'Nitrate', 'Chloride', 'Lead', 'Zinc', 'Turbidity',
       'Fluoride', 'Copper', 'Odor', 'Sulfate', 'Conductivity', 'Chlorine',
       'Manganese', 'Total Dissolved Solids', 'Water Temperature',
       'Air Temperature', 'ColorInd', 'SourceInd'],
      dtype='object')

In [125]:
dot_data = export_graphviz(estimator, feature_names=X_train.columns, filled=True, class_names=np.array(y_train.unique()).astype('str').tolist())

# Создайте графическое изображение дерева
# graph = graphviz.Source(dot_data)

# file_path = "/content/drive/MyDrive/waterQuality/tree_example.pdf"
# graph.render(file_path)  # Сохраните изображение в файл tree_example.pdf

In [126]:
graph2 = graphviz.Source(dot_data, format="png")
graph2

Output hidden; open in https://colab.research.google.com to view.

In [132]:
dot_data = export_graphviz(estimator, feature_names=X_train.columns, filled=True, class_names=np.array(y_train.unique()).astype('str').tolist(), max_depth=18)
graph2 = graphviz.Source(dot_data, format="png")
file_path = "/content/drive/MyDrive/waterQuality/tree_18.png"
graph2.render(file_path)

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.0415355 to fit


'/content/drive/MyDrive/waterQuality/tree_18.png.png'

In [None]:
# model4 = RandomForestClassifier(n_estimators=45, max_depth=16)


In [None]:
# start = time.time()


# model4.fit(X_train, y_train)


# end = time.time()

# print("The time of execution of above program is :",
#       (end-start), "s")

In [None]:
# y_pred4 = model4.predict(X_test)

In [None]:
# print(f'Accuracy: {accuracy_score(y_test, y_pred4)}')
# print(f'Precision: {precision_score(y_test, y_pred4)}')
# print(f'Recall: {recall_score(y_test, y_pred4)}')
# print(classification_report(y_test, y_pred4))

In [None]:
# model5 = RandomForestClassifier(n_estimators=60, max_depth=20)

In [None]:
# start = time.time()


# model5.fit(X_train, y_train)


# end = time.time()

# print("The time of execution of above program is :",
#       (end-start), "s")

In [None]:
# y_pred5 = model5.predict(X_test)

In [None]:
# print(f'Accuracy: {accuracy_score(y_test, y_pred5)}')
# print(f'Precision: {precision_score(y_test, y_pred5)}')
# print(f'Recall: {recall_score(y_test, y_pred5)}')
# print(classification_report(y_test, y_pred5))

In [None]:
# import joblib

# file_path = "/content/drive/MyDrive/waterQuality/model.joblib"

# joblib.dump(model, file_path)

In [None]:
# file_path2 = "/content/drive/MyDrive/waterQuality/model2.joblib"
# file_path3 = "/content/drive/MyDrive/waterQuality/model3.joblib"
# file_path4 = "/content/drive/MyDrive/waterQuality/model4.joblib"
# file_path5 = "/content/drive/MyDrive/waterQuality/model5.joblib"
# joblib.dump(model2, file_path2)
# joblib.dump(model3, file_path3)
# joblib.dump(model4, file_path4)
# joblib.dump(model5, file_path5)

In [None]:
# model6 = RandomForestClassifier(n_estimators=60, max_depth=10)

In [None]:
# start = time.time()


# model6.fit(X_train, y_train)


# end = time.time()

# print("The time of execution of above program is :",
#       (end-start), "s")

In [None]:
# y_pred6 = model6.predict(X_test)

In [None]:
# print(f'Accuracy: {accuracy_score(y_test, y_pred6)}')
# print(f'Precision: {precision_score(y_test, y_pred6)}')
# print(f'Recall: {recall_score(y_test, y_pred6)}')
# print(classification_report(y_test, y_pred6))