### Content
This dataset comprises about 10 years of daily weather observations from numerous locations across Australia.

#### RainTomorrow is the target variable to predict. It answers the crucial question: will it rain the next day? (Yes or No).

This column is marked 'Yes' if the rainfall recorded for the following day was 1 mm or more, and 'No' otherwise.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

In [None]:
# Load the weather observations; the relative path is resolved against the
# kernel's working directory. Preview the first five rows.
df = pd.read_csv(filepath_or_buffer='weatherAUS.csv')
df.head(n=5)

In [None]:
# (rows, columns) of the raw frame, before any rows are dropped.
df.shape

In [None]:
# Column dtypes and non-null counts; shows which columns are numeric vs. object.
df.info()

In [None]:
# Missing-value count per column, listed from fewest to most.
nulls = df.isna().sum()
nulls.sort_values(ascending=True)

In [None]:
# Rows without a target label cannot be used for supervised training, so drop
# them. Rebinding `df` instead of using inplace=True follows the pandas
# recommendation and leaves no partially mutated frame behind.
df = df.dropna(subset=["RainTomorrow"])

In [None]:
# Recount missing values now that rows with no RainTomorrow label are gone.
nulls = df.isna().sum()
nulls.sort_values(ascending=True)

In [None]:
# Count of RainTomorrow outcomes, with bars split by the RainToday value.
px.histogram(
    df,
    x="RainTomorrow",
    color="RainToday",
    title="RainToday vs RainTomorrow",
)

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated(keep='first').sum()

In [None]:
# Distinct values of the Location column.
df['Location'].unique()

In [None]:
# Distinct-value count per column, lowest cardinality first.
df.nunique(axis=0).sort_values(ascending=True)

In [None]:
# Split column names by dtype. `categorical_cols` is every object-dtype column
# still present in `df`, so it includes the target if the target is stored as text.
numerical_cols = list(df.select_dtypes(include=np.number).columns)
categorical_cols = list(df.select_dtypes(include='object').columns)

In [None]:
# Missing values in the numeric columns only, worst column first.
(
    df.loc[:, numerical_cols]
    .isna()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Import Libraries

from sklearn.impute import SimpleImputer
import numpy as np
#----------------------------------------------------

# Cleaning data
# Replace every missing value in the numeric columns with that column's mean.
# NOTE(review): the imputer is fitted on the full frame before the train/test
# split, so held-out rows contribute to the means; consider fitting on the
# training rows only.

Imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(df[numerical_cols])

df[numerical_cols] = Imputer.transform(df[numerical_cols])

# Preview a few rows instead of rendering the whole frame.
df.head()

In [None]:
# After mean imputation the numeric columns should report zero missing values.
nulls = df.isna().sum()
nulls.sort_values(ascending=True)

### Scaling

In [None]:
# Per-column minimum and maximum before scaling.
df.describe().loc[['min', 'max'], :]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each numeric column's min and max so it can be rescaled to [0, 1].
# NOTE(review): fitted on all rows before the train/test split, so the
# held-out rows influence the learned ranges.
scaler = MinMaxScaler()
scaler = scaler.fit(df[numerical_cols])

In [None]:
# Overwrite the numeric columns with their min-max scaled values.
# NOTE(review): this writes back into `df`, so running the cell a second time
# rescales values that are already scaled.
scaled_values = scaler.transform(df[numerical_cols])
df[numerical_cols] = scaled_values

In [None]:
# Check the scaling: each numeric column should now span 0 to 1.
df.describe().loc[['min', 'max'], :]

### Encode categorical data

In [None]:
# Number of distinct values in each object-dtype column.
df.loc[:, categorical_cols].nunique()

### Split data

In [None]:
# Features are every column except the target; the target is RainTomorrow.
X = df.drop(columns='RainTomorrow')
y = df['RainTomorrow']

In [None]:
from sklearn.preprocessing import LabelEncoder,StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
ct= ColumnTransformer([('encoder',OneHotEncoder(),['Date','Location','WindGustDir','WindDir9am','WindDir3pm','RainToday'])],remainder='passthrough')
X= np.array(ct.fit_transform(X))
np.array

In [None]:
X = pd.DataFrame()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split, and
# every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Logistic Regression

In [None]:
# Logistic-regression baseline with scikit-learn's default settings.
lr_clf = LogisticRegression()
lr_clf = lr_clf.fit(X_train, y_train)
lr_clf

In [None]:
# Mean accuracy of the logistic-regression model on the held-out rows.
lr_clf.score(X=X_test, y=y_test)

In [None]:
# Predicted labels for the held-out rows; the metric cells below use these.
y_predict = lr_clf.predict(X=X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_predict)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for the logistic-regression predictions.
print(classification_report(y_true=y_test, y_pred=y_predict))

## Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings, trained on the same split.
nb_clf = GaussianNB().fit(X_train, y_train)
nb_clf

In [None]:
# Mean accuracy of the naive Bayes model on the held-out rows.
nb_clf.score(X=X_test, y=y_test)

In [None]:
# Replace `y_predict` with the naive Bayes predictions for the metric cells below.
y_predict = nb_clf.predict(X=X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels (naive Bayes).
confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows the naive Bayes model labels correctly.
accuracy_score(y_true=y_test, y_pred=y_predict)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for the naive Bayes predictions.
print(classification_report(y_true=y_test, y_pred=y_predict))

## Decision tree

In [None]:
# Decision tree with default hyper-parameters. The fixed seed makes the fitted
# tree, and therefore the reported metrics, reproducible between runs, because
# scikit-learn randomly permutes features when choosing splits.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

In [None]:
# Mean accuracy of the decision tree on the held-out rows.
dt_clf.score(X=X_test, y=y_test)

In [None]:
# Replace `y_predict` with the decision-tree predictions for the metric cells below.
y_predict = dt_clf.predict(X=X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels (decision tree).
confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows the decision tree labels correctly.
accuracy_score(y_true=y_test, y_pred=y_predict)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for the decision-tree predictions.
print(classification_report(y_true=y_test, y_pred=y_predict))

## Random forest

In [None]:
# Random forest with default hyper-parameters. The fixed seed makes the
# bootstrap samples and feature subsets, and so the reported metrics,
# reproducible. n_jobs=-1 trains the trees on all available cores without
# changing the result.
rf_clf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_clf.fit(X_train, y_train)

In [None]:
# Mean accuracy of the random forest on the held-out rows.
rf_clf.score(X=X_test, y=y_test)

In [None]:
# Replace `y_predict` with the random-forest predictions for the metric cells below.
y_predict = rf_clf.predict(X=X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels (random forest).
confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows the random forest labels correctly.
accuracy_score(y_true=y_test, y_pred=y_predict)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for the random-forest predictions.
print(classification_report(y_true=y_test, y_pred=y_predict))