In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datetime import date,timedelta 

In [2]:
from sklearn.cluster import KMeans
class KMeansInterp(KMeans):
    """KMeans variant that computes per-cluster feature importances on ``fit``.

    After fitting, ``feature_importances_`` is a dict mapping each cluster
    label (``0 .. n_clusters - 1``) to a list of ``(feature_name, weight)``
    tuples sorted from the largest weight to the smallest.

    Parameters
    ----------
    ordered_feature_names : sequence of str
        Feature names in the same order as the columns of the data passed to
        ``fit``. Its length must equal the number of fitted features.
    feature_importance_method : {'wcss_min', 'unsup2sup'}, default 'wcss_min'
        * ``'wcss_min'``  - weight is the absolute value of the cluster
          centroid's coordinate for that feature.
        * ``'unsup2sup'`` - weight is the ``feature_importances_`` of a
          ``RandomForestClassifier`` trained to separate the cluster from all
          other samples.
    **kwargs
        Forwarded unchanged to ``sklearn.cluster.KMeans``.

    NOTE(review): parameters passed through ``**kwargs`` are not named in this
    ``__init__`` signature; presumably ``get_params()`` / ``clone()`` will not
    report them - verify before using this class inside pipelines or searches.
    """

    # Names accepted for ``feature_importance_method``.
    _IMPORTANCE_METHODS = ('wcss_min', 'unsup2sup')

    def __init__(self, ordered_feature_names, feature_importance_method='wcss_min', **kwargs):
        super(KMeansInterp, self).__init__(**kwargs)
        self.feature_importance_method = feature_importance_method
        self.ordered_feature_names = ordered_feature_names

    def fit(self, X, y=None, sample_weight=None):
        """Fit KMeans on ``X`` and populate ``feature_importances_``.

        Raises
        ------
        ValueError
            If ``feature_importance_method`` is not a supported name, or if
            ``ordered_feature_names`` does not have one entry per fitted
            feature. (``ValueError`` is an ``Exception`` subclass, so callers
            catching ``Exception`` keep working.)
        """
        # Fail fast: reject an unknown method before paying for the clustering.
        if self.feature_importance_method not in self._IMPORTANCE_METHODS:
            raise ValueError(
                f"'{self.feature_importance_method}' is not available. "
                "Please choose from ['wcss_min', 'unsup2sup']"
            )

        super().fit(X=X, y=y, sample_weight=sample_weight)

        # n_features_in_ only exists after the parent fit, so this check
        # cannot be hoisted above it.
        if len(self.ordered_feature_names) != self.n_features_in_:
            raise ValueError(
                f"Model is fitted on {self.n_features_in_} but "
                f"ordered_feature_names = {len(self.ordered_feature_names)}"
            )

        if self.feature_importance_method == "wcss_min":
            self.feature_importances_ = self.get_feature_imp_wcss_min()
        else:  # "unsup2sup" - validated above
            self.feature_importances_ = self.get_feature_imp_unsup2sup(X)

        return self

    def get_feature_imp_wcss_min(self):
        """Rank features per cluster by absolute centroid coordinate.

        Returns
        -------
        dict
            ``{label: [(feature_name, abs_centroid_value), ...]}`` with each
            list sorted in descending order of ``abs_centroid_value``.
        """
        abs_centroids = np.abs(self.cluster_centers_)
        # argsort is ascending; reverse each row to get largest-first order.
        descending_idx = abs_centroids.argsort(axis=1)[:, ::-1]

        cluster_feature_weights = {}
        for label in range(self.n_clusters):
            feature_order = descending_idx[label]
            names = [self.ordered_feature_names[i] for i in feature_order]
            weights = abs_centroids[label][feature_order]
            cluster_feature_weights[label] = list(zip(names, weights))

        return cluster_feature_weights

    def get_feature_imp_unsup2sup(self, X):
        """Rank features per cluster with a one-vs-rest random forest.

        For every cluster a ``RandomForestClassifier`` is trained on ``X`` to
        predict "sample belongs to this cluster" (1) versus "does not" (0),
        using the labels produced by the fitted KMeans model. The forest's
        ``feature_importances_`` become the weights.

        Parameters
        ----------
        X : array-like
            The same data the model was fitted on (must line up row-for-row
            with ``self.labels_``).

        Returns
        -------
        dict
            ``{label: [(feature_name, importance), ...]}`` with each list
            sorted in descending order of importance.

        Raises
        ------
        ImportError
            If ``sklearn.ensemble.RandomForestClassifier`` cannot be imported.
        """
        try:
            from sklearn.ensemble import RandomForestClassifier
        except ImportError as err:
            # Exceptions have no `.message` attribute on Python 3; chain the
            # original error instead so its text is kept in the traceback.
            raise ImportError(
                "Please install scikit-learn. "
                "'unsup2sup' method requires using a classifier "
                "and depends on 'sklearn.ensemble.RandomForestClassifier'"
            ) from err

        feature_names = np.array(self.ordered_feature_names)

        cluster_feature_weights = {}
        for label in range(self.n_clusters):
            # 1 for members of this cluster, 0 for everything else.
            is_member = (self.labels_ == label).astype(int)
            # Reuse the estimator's random_state so a seeded model gives
            # reproducible importances; None keeps the unseeded behaviour.
            forest = RandomForestClassifier(random_state=self.random_state)
            forest.fit(X, is_member)

            importances = np.asarray(forest.feature_importances_)
            descending_idx = np.argsort(importances)[::-1]
            cluster_feature_weights[label] = list(
                zip(feature_names[descending_idx], importances[descending_idx])
            )
        return cluster_feature_weights

In [3]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load model files that come from a trusted source.
# NOTE(review): 'Weather Classifier' is presumably a pickled KMeansInterp, in
# which case that class must already be defined when this cell runs - confirm.
clf = _load_pickle('Weather Prediction')    # used by predict(): (MO, DD) -> Tavg, RH_avg, ss
km = _load_pickle('Weather Classifier')     # used by predict(): assigns the cluster id
scaler = _load_pickle('Weather Scaler')     # used by predict(): scales Tavg, RH_avg, ss
rr = _load_pickle('RR_classifier')          # used by predict(): predict_proba on (RH_avg, ss)

In [4]:
from datetime import date,timedelta
def date_sequence(start_yr,start_mo,start_dd,end_yr,end_mo,end_dd):
    """Build a frame with one row per calendar day in an inclusive date range.

    Parameters
    ----------
    start_yr, start_mo, start_dd : int
        Year, month and day of the first date.
    end_yr, end_mo, end_dd : int
        Year, month and day of the last date (included in the result).

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (datetime), ``YR``, ``MO`` and ``DD`` (integer year,
        month and day of ``Date``). The frame is empty when the end date is
        before the start date.

    Raises
    ------
    ValueError
        If either set of components is not a valid calendar date
        (raised by ``datetime.date``).
    """
    # Going through datetime.date keeps the validation of the components.
    start_date = date(start_yr, start_mo, start_dd)
    end_date = date(end_yr, end_mo, end_dd)

    # date_range includes both endpoints, so no manual day-by-day loop is needed.
    days = pd.date_range(start=start_date, end=end_date, freq='D')

    dataframe = pd.DataFrame({'Date': days})
    # The .dt accessor may return a narrower integer dtype depending on the
    # pandas version; cast so the columns stay int64 like a list of ints would.
    dataframe['YR'] = dataframe['Date'].dt.year.astype('int64')
    dataframe['MO'] = dataframe['Date'].dt.month.astype('int64')
    dataframe['DD'] = dataframe['Date'].dt.day.astype('int64')
    return dataframe

In [5]:
# Human-readable description for each cluster id produced by `km`.
_CLUSTER_DESCRIPTIONS = {
    0: 'Mildly cloudy with moderate probability of rain',
    1: 'Sunny and chill weather',
    2: 'Cloudy with a high probability of rain',
}
# Any cluster id not listed above falls back to this description.
_DEFAULT_CLUSTER_DESCRIPTION = 'Sunny and hot weather'


def predict(dataframe):
    """Predict weather variables, rain probability and a weather class per row.

    Relies on the module-level models ``clf``, ``rr``, ``scaler`` and ``km``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``MO`` and ``DD``. It is not modified; the
        result is built on a copy.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataframe`` with these columns added:

        * ``Tavg``, ``RH_avg``, ``ss`` - columns 0, 1 and 2 of
          ``clf.predict`` on ``MO``/``DD``.
        * ``Rain_probability`` - column 1 of ``rr.predict_proba`` on
          ``RH_avg``/``ss`` (presumably the "rain" class - confirm against
          how ``rr`` was trained).
        * ``Cluster`` - ``km.predict`` on the scaled ``Tavg``/``RH_avg``/``ss``
          plus the unscaled ``Rain_probability``.
        * ``Weather Condition`` - text description of ``Cluster``.
    """
    # Work on a copy so the caller's frame is left untouched and re-running
    # the cell on the same input gives the same result.
    pred_date = dataframe.copy()

    weather = clf.predict(pred_date[['MO', 'DD']])
    pred_date['Tavg'] = weather[:, 0]
    pred_date['RH_avg'] = weather[:, 1]
    pred_date['ss'] = weather[:, 2]

    rr_prob = rr.predict_proba(pred_date[['RH_avg', 'ss']])
    pred_date['Rain_probability'] = rr_prob[:, 1]

    scaled_columns = ['Tavg', 'RH_avg', 'ss']
    scaled_values = scaler.transform(pred_date[scaled_columns])
    # Share pred_date's index: assigning a Series aligns on index labels, so
    # a fresh RangeIndex here would turn Rain_probability into NaN whenever
    # the input frame does not have the default 0..n-1 index.
    scaled_df = pd.DataFrame(scaled_values, columns=scaled_columns, index=pred_date.index)
    scaled_df['Rain_probability'] = pred_date['Rain_probability']

    pred_date['Cluster'] = km.predict(scaled_df)
    pred_date['Weather Condition'] = [
        _CLUSTER_DESCRIPTIONS.get(cluster, _DEFAULT_CLUSTER_DESCRIPTION)
        for cluster in pred_date['Cluster']
    ]
    return pred_date

In [6]:
# Forecast every day from 2022-01-01 through 2022-03-31 (end date included).
predictions = predict(
    date_sequence(
        start_yr=2022, start_mo=1, start_dd=1,
        end_yr=2022, end_mo=3, end_dd=31,
    )
)
predictions

Unnamed: 0,Date,YR,MO,DD,Tavg,RH_avg,ss,Rain_probability,Cluster,Weather Condition
0,2022-01-01,2022,1,1,23.930085,86.495105,4.593940,0.784842,2,Cloudy with a high probability of rain
1,2022-01-02,2022,1,2,24.397636,84.050642,4.011629,0.758358,2,Cloudy with a high probability of rain
2,2022-01-03,2022,1,3,24.024850,85.937895,4.031969,0.803090,2,Cloudy with a high probability of rain
3,2022-01-04,2022,1,4,24.153166,85.200041,3.458619,0.816895,2,Cloudy with a high probability of rain
4,2022-01-05,2022,1,5,24.292385,83.900121,4.266524,0.738002,2,Cloudy with a high probability of rain
...,...,...,...,...,...,...,...,...,...,...
85,2022-03-27,2022,3,27,24.305113,83.308791,5.414700,0.636705,0,Mildly cloudy with moderate probability of rain
86,2022-03-28,2022,3,28,24.512241,80.172369,5.316653,0.536802,0,Mildly cloudy with moderate probability of rain
87,2022-03-29,2022,3,29,24.504988,82.710366,7.004885,0.483836,0,Mildly cloudy with moderate probability of rain
88,2022-03-30,2022,3,30,24.425060,82.051974,5.987552,0.546685,0,Mildly cloudy with moderate probability of rain


In [7]:
# Copy the predictions frame to the system clipboard so it can be pasted
# elsewhere. NOTE(review): presumably needs a clipboard backend, so this may
# fail on a headless kernel - confirm for the target environment.
predictions.to_clipboard()