In [26]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

In [2]:
from google.colab import files
file_ = files.upload()

Saving seattle-weather.csv to seattle-weather.csv


In [3]:
df = pd.read_csv("seattle-weather.csv")

In [4]:
df.head(25)


Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain
5,2012-01-06,2.5,4.4,2.2,2.2,rain
6,2012-01-07,0.0,7.2,2.8,2.3,rain
7,2012-01-08,0.0,10.0,2.8,2.0,sun
8,2012-01-09,4.3,9.4,5.0,3.4,rain
9,2012-01-10,1.0,6.1,0.6,3.4,rain


In [5]:
df.shape

(1461, 6)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [11]:
# weather of different types
df['weather'].value_counts()

rain       641
sun        640
fog        101
drizzle     53
snow        26
Name: weather, dtype: int64

In [10]:
# Нет пропусков
df.isnull().sum()

date             0
precipitation    0
temp_max         0
temp_min         0
wind             0
weather          0
dtype: int64

Разделим данные на входные и ответы, удалим лишние столбцы.

In [15]:
# Сначала разделим на признаки (X), и ответы (y)
# предсказывать будем weather, сам прогноз погоды

y = df['weather']
X = df.drop(['date', 'weather'], axis=1)

In [16]:
X.head()

Unnamed: 0,precipitation,temp_max,temp_min,wind
0,0.0,12.8,5.0,4.7
1,10.9,10.6,2.8,4.5
2,0.8,11.7,7.2,2.3
3,20.3,12.2,5.6,4.7
4,1.3,8.9,2.8,6.1


In [17]:
y.head()

0    drizzle
1       rain
2       rain
3       rain
4       rain
Name: weather, dtype: object

Далее разделим данные на тренировочные и тестовые, чтобы обучать модели на первом, и проверять качество на втором.

In [51]:
# Делим на тестовую и валидационную выборку. Для теста будем использовать последние 400 строк.
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(X.iloc[:-400], test_size=400, shuffle=False)
y_train, y_test = train_test_split(y.iloc[:-400], test_size=400, shuffle=False)

In [52]:
print("data:   ", df.shape[0])
print("X_train:", X_train.shape[0])
print("X_test: ", X_test.shape[0])

data:    1461
X_train: 661
X_test:  400


#Stochastic gradient descent (SGD)

In [145]:
from sklearn.linear_model import SGDClassifier

In [146]:
sgd = SGDClassifier(n_jobs=-1)
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)

In [147]:
pd.Series(y_pred).value_counts()

sun        221
rain       158
drizzle     11
snow        10
dtype: int64

In [148]:
accuracy_score(y_test, y_pred)

0.7775

#RandomForestClassifier

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
rf_clf = RandomForestClassifier(n_jobs=-1)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)

In [55]:
pd.concat([pd.Series(y_pred), y_test.reset_index()['weather']], axis=1)

Unnamed: 0,0,weather
0,sun,sun
1,sun,sun
2,sun,sun
3,sun,sun
4,rain,rain
...,...,...
395,rain,rain
396,rain,rain
397,rain,rain
398,rain,rain


In [57]:
pd.Series(y_pred).value_counts()

sun        198
rain       181
drizzle     15
snow         4
fog          2
dtype: int64

In [56]:
accuracy_score(y_test, y_pred)

0.8425

#  DecisionTreeClassifier

In [33]:
from sklearn.tree import DecisionTreeClassifier

In [58]:
des_tree = DecisionTreeClassifier()
des_tree.fit(X_train, y_train)
y_pred = des_tree.predict(X_test)

In [60]:
pd.Series(y_pred).value_counts()

rain       184
sun        174
drizzle     24
fog         12
snow         6
dtype: int64

In [59]:
accuracy_score(y_test, y_pred)

0.7825

#Gradient Boosting

In [37]:
from sklearn.ensemble import GradientBoostingClassifier

In [76]:
gb_clf = GradientBoostingClassifier(max_depth=3, learning_rate=0.1, n_estimators=40)
gb_clf.fit(X_train, y_train)
y_pred = gb_clf.predict(X_test)

In [78]:
pd.Series(y_pred).value_counts()

sun        224
rain       171
snow         3
drizzle      2
dtype: int64

In [77]:
accuracy_score(y_test, y_pred)

0.9025

#Catboost Classifier

In [42]:
!pip install catboost
from google.colab import output
output.enable_custom_widget_manager()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [43]:
from catboost import CatBoostClassifier

In [64]:
cbst = CatBoostClassifier() 
cbst.fit(X_train, y_train)
y_pred = cbst.predict(X_test)

Learning rate set to 0.077474
0:	learn: 1.4505504	total: 48.7ms	remaining: 48.6s
1:	learn: 1.3270221	total: 50.3ms	remaining: 25.1s
2:	learn: 1.2277560	total: 51.8ms	remaining: 17.2s
3:	learn: 1.1544245	total: 53.4ms	remaining: 13.3s
4:	learn: 1.0881286	total: 55ms	remaining: 10.9s
5:	learn: 1.0339442	total: 56.5ms	remaining: 9.35s
6:	learn: 0.9829320	total: 58.3ms	remaining: 8.27s
7:	learn: 0.9458310	total: 60.1ms	remaining: 7.45s
8:	learn: 0.9050157	total: 61.9ms	remaining: 6.82s
9:	learn: 0.8761387	total: 63.7ms	remaining: 6.3s
10:	learn: 0.8503293	total: 65.5ms	remaining: 5.89s
11:	learn: 0.8230953	total: 67.4ms	remaining: 5.55s
12:	learn: 0.8028372	total: 69.2ms	remaining: 5.25s
13:	learn: 0.7780523	total: 71.2ms	remaining: 5.02s
14:	learn: 0.7597504	total: 73.1ms	remaining: 4.8s
15:	learn: 0.7412163	total: 74.9ms	remaining: 4.61s
16:	learn: 0.7242546	total: 76.9ms	remaining: 4.45s
17:	learn: 0.7090504	total: 78.6ms	remaining: 4.29s
18:	learn: 0.6938008	total: 80.3ms	remaining: 4.

In [66]:
accuracy_score(y_test, y_pred)

0.83

#SVM

In [67]:
from sklearn import svm

In [156]:
model = svm.SVC(C=2, coef0=0.0, gamma='scale', kernel='linear',max_iter=-1)
#model = svm.SVC(C=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [157]:
accuracy_score(y_test, y_pred)

0.9325

In [153]:
pd.Series(y_pred).value_counts()

sun     236
rain    162
snow      2
dtype: int64

In [154]:
pd.concat([pd.Series(y_pred), y_test.reset_index()['weather']], axis=1)

Unnamed: 0,0,weather
0,sun,sun
1,sun,sun
2,sun,sun
3,sun,sun
4,rain,rain
...,...,...
395,rain,rain
396,rain,rain
397,rain,rain
398,rain,rain
