In [89]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np

In [63]:
# Read weatherAUS.csv (path is relative to the working directory) into a DataFrame.
df = pd.read_csv('weatherAUS.csv')

In [64]:
# Preview the first five rows to sanity-check how the columns were parsed.
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [65]:
# Show dtypes and non-null counts to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [66]:
# Names of the columns that get removed from the DataFrame before modelling.
columns_to_drop = [
    'Date',
    'Location',
    'Evaporation',
    'Sunshine',
    'WindGustDir',
    'WindDir9am',
    'WindDir3pm',
    'Cloud9am',
    'Cloud3pm',
]

In [67]:
# Remove the listed columns by name; `columns=` is the keyword form of axis=1.
df = df.drop(columns=columns_to_drop)

In [68]:
# Forward-fill missing values: each NaN takes the most recent non-null value
# above it in the same column. `fillna(method='ffill')` is deprecated in recent
# pandas releases; `DataFrame.ffill()` is the supported equivalent and gives
# the same result.
# NOTE(review): a NaN in the very first row has no predecessor and would stay
# NaN, and this also fills any gaps in the RainToday/RainTomorrow label
# columns — confirm both are acceptable for this dataset.
df = df.ffill()

In [69]:
# Re-check column dtypes and non-null counts after the drop and fill steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   MinTemp        145460 non-null  float64
 1   MaxTemp        145460 non-null  float64
 2   Rainfall       145460 non-null  float64
 3   WindGustSpeed  145460 non-null  float64
 4   WindSpeed9am   145460 non-null  float64
 5   WindSpeed3pm   145460 non-null  float64
 6   Humidity9am    145460 non-null  float64
 7   Humidity3pm    145460 non-null  float64
 8   Pressure9am    145460 non-null  float64
 9   Pressure3pm    145460 non-null  float64
 10  Temp9am        145460 non-null  float64
 11  Temp3pm        145460 non-null  float64
 12  RainToday      145460 non-null  object 
 13  RainTomorrow   145460 non-null  object 
dtypes: float64(12), object(2)
memory usage: 15.5+ MB


In [70]:
# Single encoder instance; it is re-fitted on each column it is applied to,
# so it only remembers the classes of the most recently fitted column.
le = LabelEncoder()

In [76]:
# Fit the encoder on RainToday and convert its labels to integer codes in one
# call, then write the codes back over the original column.
rtod = le.fit_transform(df['RainToday'])
df['RainToday'] = rtod

In [77]:
# Target: RainTomorrow labels converted to integer codes (the encoder is re-fitted here).
y = le.fit_transform(df['RainTomorrow'])
# Features: every remaining column except the target.
x = df.drop(columns=['RainTomorrow'])

In [87]:
# Hold out 25% of the rows for evaluation. shuffle=False keeps the original row
# order, so the test set is the final quarter of the file rather than a random sample.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, shuffle=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36365 entries, 109095 to 145459
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MinTemp        36365 non-null  float64
 1   MaxTemp        36365 non-null  float64
 2   Rainfall       36365 non-null  float64
 3   WindGustSpeed  36365 non-null  float64
 4   WindSpeed9am   36365 non-null  float64
 5   WindSpeed3pm   36365 non-null  float64
 6   Humidity9am    36365 non-null  float64
 7   Humidity3pm    36365 non-null  float64
 8   Pressure9am    36365 non-null  float64
 9   Pressure3pm    36365 non-null  float64
 10  Temp9am        36365 non-null  float64
 11  Temp3pm        36365 non-null  float64
 12  RainToday      36365 non-null  int64  
dtypes: float64(12), int64(1)
memory usage: 3.9 MB


# Naive Bayes

In [90]:
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli naive Bayes model on the training split and print the share
# of held-out rows it classifies correctly.
# NOTE(review): BernoulliNB thresholds every feature at its default
# binarize=0.0, so continuous columns that are almost always positive
# (pressure, humidity, wind speed) presumably carry little signal here —
# confirm whether GaussianNB was the intended model.
classifier = BernoulliNB().fit(X_train, y_train)
prediction = classifier.predict(X_test)
print((prediction == y_test).mean())

0.7813007012237041


# KNeighborsClassifier

In [99]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbour counts for the cross-validated search.
param_grid = dict(n_neighbors=[4, 5, 6])

# Run the search with GridSearchCV's default settings and keep the winning
# parameter set for later cells.
# NOTE(review): the features are not rescaled before this distance-based
# model, so large-magnitude columns may dominate the metric — confirm whether
# a scaler should be added.
grid = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid).fit(X_train, y_train)
best_params = grid.best_params_
print(f"Best params: {best_params}")

Best params: {'n_neighbors': 6}


In [100]:
# Refit k-NN on the training split with the parameters chosen by the grid
# search. Using `best_params` instead of a hard-coded copy of the winning value
# keeps this cell correct if the data or the search grid changes.
neigh = KNeighborsClassifier(**best_params)
neigh.fit(X_train, y_train)

# Share of held-out rows classified correctly.
prediction = neigh.predict(X_test)
print(np.mean(prediction == y_test))

0.8307438471057336


# Logistic Regression

In [101]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression, then the share of held-out rows predicted
# correctly.
# NOTE(review): max_iter=5000 is presumably set so the solver converges on the
# unscaled features — confirm.
classifier = LogisticRegression(penalty='l2', max_iter=5000).fit(X_train, y_train)
prediction = classifier.predict(X_test)
print((prediction == y_test).mean())

0.8416059397772584
