In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the raw weather readings from a local CSV file.
# NOTE(review): this is an absolute path on one machine; adjust it (or move it
# to a config cell) before running elsewhere.
csv_path = "D:/USB-DS/E/minute_weather.csv"
data = pd.read_csv(csv_path)

In [3]:
# (rows, columns) of the raw table, shown to gauge how large the dataset is.
data.shape

(1587257, 13)

In [4]:
# Preview the first rows to see the available columns and typical values.
data.head()

Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [5]:
# The full table is large, so keep only every 10th row
# (rows whose rowID is divisible by 10).
is_tenth_row = data['rowID'].mod(10).eq(0)
sampled_df = data[is_tenth_row]
sampled_df.shape

(158726, 13)

In [6]:
# Summary statistics per numeric column; transposed so each column is one row.
sampled_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rowID,158726.0,793625.0,458203.937509,0.0,396812.5,793625.0,1190437.5,1587250.0
air_pressure,158726.0,916.830161,3.051717,905.0,914.8,916.7,918.7,929.5
air_temp,158726.0,61.851589,11.833569,31.64,52.7,62.24,70.88,99.5
avg_wind_direction,158680.0,162.1561,95.278201,0.0,62.0,182.0,217.0,359.0
avg_wind_speed,158680.0,2.775215,2.057624,0.0,1.3,2.2,3.8,31.9
max_wind_direction,158680.0,163.462144,92.452139,0.0,68.0,187.0,223.0,359.0
max_wind_speed,158680.0,3.400558,2.418802,0.1,1.6,2.7,4.6,36.0
min_wind_direction,158680.0,166.774017,97.441109,0.0,76.0,180.0,212.0,359.0
min_wind_speed,158680.0,2.134664,1.742113,0.0,0.8,1.6,3.0,31.6
rain_accumulation,158725.0,0.000318,0.011236,0.0,0.0,0.0,0.0,3.12


In [7]:
# Count the sampled rows where no rain accumulated (shape of that subset).
no_rain_accumulation = sampled_df['rain_accumulation'] == 0
sampled_df[no_rain_accumulation].shape

(157812, 13)

In [8]:
# Count the sampled rows whose rain duration is 0. Nothing is dropped here;
# this only displays the shape of that subset.
no_rain_duration = sampled_df['rain_duration'] == 0
sampled_df[no_rain_duration].shape

(157237, 13)

In [9]:
# Remove the two rain columns; the counts shown earlier indicate they are 0
# for nearly every sampled row.
# drop() returns a new frame instead of mutating in place, and
# errors='ignore' keeps this cell safe to re-run (the previous `del`
# statements raised KeyError on a second execution).
sampled_df = sampled_df.drop(
    columns=['rain_accumulation', 'rain_duration'],
    errors='ignore',
)

In [10]:
# Discard every row that still contains a missing value, recording the row
# count on each side so the loss can be reported.
# Note: sampled_df is reassigned, so re-running this cell alone reports 0 lost rows.
rows_before = len(sampled_df)
sampled_df = sampled_df.dropna()
rows_after = len(sampled_df)

In [11]:
# Number of rows removed by dropna().
rows_before - rows_after

46

In [13]:
# Columns remaining after the rain columns were removed.
sampled_df.columns

Index(['rowID', 'hpwren_timestamp', 'air_pressure', 'air_temp',
       'avg_wind_direction', 'avg_wind_speed', 'max_wind_direction',
       'max_wind_speed', 'min_wind_direction', 'min_wind_speed',
       'relative_humidity'],
      dtype='object')

In [14]:
# Columns used as clustering inputs.
features = [
    'air_pressure',
    'air_temp',
    'avg_wind_direction',
    'avg_wind_speed',
    'max_wind_direction',
    'max_wind_speed',
]

In [15]:
# Keep all rows, but only the columns chosen as clustering features.
select_df = sampled_df.loc[:, features]

In [16]:
# Confirm that only the selected feature columns are present.
select_df.columns

Index(['air_pressure', 'air_temp', 'avg_wind_direction', 'avg_wind_speed',
       'max_wind_direction', 'max_wind_speed'],
      dtype='object')

In [17]:
# Display the feature table (the rendering is truncated to the first and last rows).
select_df

Unnamed: 0,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed
0,912.3,64.76,97.0,1.2,106.0,1.6
10,912.3,62.24,144.0,1.2,167.0,1.8
20,912.2,63.32,100.0,2.0,122.0,2.5
30,912.2,62.60,91.0,2.0,103.0,2.4
40,912.2,64.04,81.0,2.6,88.0,2.9
...,...,...,...,...,...,...
1587210,915.9,75.56,330.0,1.0,341.0,1.3
1587220,915.9,75.56,330.0,1.1,341.0,1.4
1587230,915.9,75.56,344.0,1.4,352.0,1.7
1587240,915.9,75.20,359.0,1.3,9.0,1.6


In [18]:
# Standardize every feature column (zero mean, unit variance) so that
# features measured on larger scales do not dominate the k-means distances.
scaler = StandardScaler()
x = scaler.fit_transform(select_df)
x

array([[-1.48456281,  0.24544455, -0.68385323, -0.76555283, -0.62153592,
        -0.74440309],
       [-1.48456281,  0.03247142, -0.19055941, -0.76555283,  0.03826701,
        -0.66171726],
       [-1.51733167,  0.12374562, -0.65236639, -0.37675365, -0.44847286,
        -0.37231683],
       ...,
       [-0.30488381,  1.15818654,  1.90856325, -0.66835304,  2.0393087 ,
        -0.70306017],
       [-0.30488381,  1.12776181,  2.06599745, -0.71695294, -1.67073075,
        -0.74440309],
       [-0.30488381,  1.09733708, -1.63895404, -0.61975314, -1.55174989,
        -0.62037434]])

In [19]:
# Fit k-means on the standardized features.
# n_clusters=8 is scikit-learn's default (what the bare KMeans() call used; it
# is not a random value). It is spelled out here so K is visible and easy to
# change, e.g. KMeans(n_clusters=12). Note the keyword is `n_clusters`.
# random_state pins the centroid initialization so a fresh
# Restart & Run All reproduces the same clusters.
kmeans = KMeans(n_clusters=8, random_state=42)
model = kmeans.fit(x)

In [20]:

# Cluster centers found by the fitted model: one row per cluster, one column
# per feature, expressed in the standardized feature space the model was fit on.
centers = model.cluster_centers_
centers

array([[ 0.21426379,  0.25381579,  1.88309991, -0.65098176, -1.54462775,
        -0.57249249],
       [-1.11271395, -0.9359304 ,  0.4198832 ,  1.61299155,  0.51807729,
         1.57677893],
       [ 1.20106143, -0.23559081, -1.13874625,  1.98687498, -1.03929216,
         2.09739467],
       [-0.15641555, -0.97733251,  0.46471519, -0.3423991 ,  0.64504432,
        -0.34090961],
       [ 1.05895315, -0.47319186, -1.19842821, -0.32531376, -1.04918986,
        -0.31967905],
       [-0.22177709,  0.62949235,  0.39784721,  0.54464087,  0.50769991,
         0.49328954],
       [-0.28010047,  0.6617157 , -1.2274453 , -0.59975653, -1.07786846,
        -0.61605626],
       [ 0.05960061,  0.80171419,  0.73047496, -0.68028131,  0.9500913 ,
        -0.66450363]])