#### Objective
* Develop a Logistic Regression model to predict whether or not it will rain tomorrow.

In [43]:
import pandas as pd
pd.set_option('display.max_rows', 150)

from sklearn.linear_model import LogisticRegression
from joblib import dump, load
from sklearn.metrics import accuracy_score

In [27]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [28]:
%%javascript 
//Disable autoscrolling to see entire graph
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

##### 1. Develop a Logistic Regression model with an arbitrary random_state.
* You can also set the underlying optimization library by setting the `solver` parameter.
* Load the features and labels that were previously split into training and testing sets with a 20% test size.


In [29]:
# Load the pre-split training/test features and labels from their pickle files.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
X_train, X_test, y_train, y_test = map(
    pd.read_pickle,
    ("X_train.pkl", "X_test.pkl", "y_train.pkl", "y_test.pkl"),
)

##### 2. Train the model with the prepared training features and labels.

In [30]:
# Configure the classifier: liblinear solver, with random_state pinned to 0.
log_regression = LogisticRegression(random_state=0, solver="liblinear")

In [31]:
# Fit the classifier on the training features and labels.
log_regression.fit(X=X_train, y=y_train)

LogisticRegression(random_state=0, solver='liblinear')

##### 3. Predict the next day's rain forecast for the prepared testing data.
* Calculate the probabilities for negative and positive classes.

In [32]:
# Predict a class label for every row of the test features.
y_prediction_test = log_regression.predict(X=X_test)

In [33]:
# Display the predicted labels for the test set
y_prediction_test

array([0, 0, 0, ..., 0, 0, 1], dtype=int8)

In [34]:
# Probability of No rain (0): first column of predict_proba, one value per test row
log_regression.predict_proba(X_test)[:,0]

array([0.92399234, 0.87930385, 0.86377416, ..., 0.98321557, 0.83075673,
       0.39542325])

In [35]:
# Probability of Yes rain (1): second column of predict_proba, one value per test row
log_regression.predict_proba(X_test)[:,1]

array([0.07600766, 0.12069615, 0.13622584, ..., 0.01678443, 0.16924327,
       0.60457675])

##### 4. Calculate the accuracy score of the model for the predicted results.

In [36]:
# Fraction of test rows whose predicted label matches the true label.
test_accuracy_score = accuracy_score(y_true=y_test, y_pred=y_prediction_test)

In [37]:
# Report the accuracy as a percentage with two decimals.
print(f"Accuracy Score is:{test_accuracy_score:.2%}")

Accuracy Score is:84.80%


##### 5. Interpret the model results by checking feature importance:
* Check the learned weights for each feature.
* Check the bias term.

In [38]:
# Bias (intercept) term of the fitted model
model_bias = log_regression.intercept_[0]
# Learned weights, one per feature
model_weights = log_regression.coef_[0]

In [39]:
# Pair each feature name with its learned weight in a two-column table for display.
column_names = X_train.columns
data = dict(Feature=column_names, Weights=model_weights)
df = pd.DataFrame(data)

In [40]:
# Show up to the first 117 feature/weight rows
# NOTE(review): 117 is presumably the total number of feature columns — TODO confirm
df.head(117)

Unnamed: 0,Feature,Weights
0,MinTemp,0.933536
1,MaxTemp,-2.836491
2,Rainfall,1.499092
3,Evaporation,0.185941
4,Sunshine,-1.532401
5,WindGustSpeed,4.117873
6,WindSpeed9am,-0.251537
7,WindSpeed3pm,-0.880591
8,Humidity9am,0.538222
9,Humidity3pm,5.782802


In [41]:
# Print the bias (intercept) term learned by the model
print(f"The bias (intercept) for this model is:{model_bias}")

The bias (intercept) for this model is:-3.8766255101624965


In [44]:
# Save the fitted sklearn model to disk with joblib (approach from Project 3 - Milestone 1).
dump(value=log_regression, filename='log_regression_project_1.joblib')

['log_regression_project_1.joblib']