# Package Installation

In [None]:
# Install tslearn; the import cell below pulls TimeSeriesKMeans from it.
!pip install tslearn

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import requests
import json
from tslearn.clustering import TimeSeriesKMeans
from pandas.io.json import json_normalize
from datetime import datetime
from sklearn.impute import KNNImputer
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.tools as tls
import plotly.io as pio

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides numeric RuntimeWarnings (e.g. log of zero) and
# library deprecation notices; narrow the filter if those matter.
warnings.filterwarnings('ignore')


In [None]:
# Optional display tweaks: uncomment to print frames without truncation.
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', None)

# Select plotly as the pandas plotting backend and make the white plotly
# template the default.
pd.set_option("plotting.backend", "plotly")
pio.templates.default = "plotly_white"

# Data Import from Cloud

In [None]:
# Download the feed entries of ThingSpeak channel 2034104 starting at
# 2023-01-15 00:00:00 and load them into a DataFrame.
# NOTE(review): the API key is hard-coded in the URL; consider reading it from
# configuration or an environment variable before sharing this notebook.
feedback = requests.get(
    'https://api.thingspeak.com/channels/2034104/feeds.json?api_key=X5X174DFX7N0QZ69&start=2023-01-15%2000:00:00',
    timeout=30,  # requests has no default timeout; fail instead of hanging the kernel
)
# Surface HTTP errors here rather than as a confusing KeyError on 'feeds' below.
feedback.raise_for_status()
data = feedback.json()['feeds']
data_df = pd.DataFrame.from_dict(data)
# Replace the generic fieldN column names with the quantity each one holds.
data_df.rename(columns = {'field1':'CO','field2':'CO2', 'field3':'NH4','field4':'Toluene', 
                          'field5':'Alcohol','field6':'Humidity','field7':'Noise','field8':'Temperature'}, inplace = True)
data_df.head(5)

# Data Preprocessing

In [None]:
# Names of the sensor channels; later cells iterate over this list.
values = ["CO","CO2","NH4", "Toluene", "Alcohol", "Humidity", "Noise", "Temperature"]

# Cast each sensor channel to float, one column at a time.
for column in values:
    data_df[column] = data_df[column].astype('float')

# Parse the timestamp column into datetime values.
data_df['created_at'] = pd.to_datetime(data_df['created_at'])

In [None]:
# Print the frame summary (column dtypes, non-null counts) after the type conversion.
data_df.info()

### Null Value Handling

In [None]:
# Impute missing Humidity / Noise / Temperature readings with a KNN imputer
# (2 neighbours); the remaining columns are carried over unchanged.
imputed_columns = ['Humidity','Noise','Temperature']
carried_columns = ["created_at","entry_id","CO","CO2","NH4", "Toluene", "Alcohol"]

imputer = KNNImputer(n_neighbors=2)
clean_data = imputer.fit_transform(data_df[imputed_columns])

# The imputer output is wrapped in a new frame and joined back on the index.
clean_df = data_df[carried_columns].join(
    pd.DataFrame(clean_data, columns = imputed_columns)
)

In [None]:
# Print the frame summary (column dtypes, non-null counts) after imputation.
clean_df.info()

In [None]:
# Count the missing values that remain in each column.
clean_df.isnull().sum()

### Outlier Handling

In [None]:
# Draw one violin plot per sensor channel on a 2x4 grid.
plt.figure(figsize=(15, 6))
for position, value in enumerate(values, start=1):
    axis = plt.subplot(2, 4, position)
    axis.set_title(value)
    axis.violinplot(clean_df[value])

# Extra vertical spacing between the two rows of panels.
plt.subplots_adjust(hspace=1)


In [None]:
# Drop rows in which any sensor channel lies 2.5 or more standard deviations
# from that channel's mean.
#
# Scores are computed per column (axis=0). Scoring across a row would mix
# eight unrelated units, and with only eight values per row |z| is bounded by
# 7 / sqrt(8) ~= 2.47, so a 2.5 cut-off could never reject a complete row.
readings = clean_df[values]
data_df_z = pd.DataFrame(
    np.abs(stats.zscore(readings.to_numpy(dtype=float), axis=0, nan_policy='omit')),
    index=readings.index,
    columns=values,
)
# A NaN score beside a present reading means the channel has zero spread, so
# that reading cannot be an outlier. A NaN score caused by a missing reading
# still fails the test and the row is dropped.
within_limit = (data_df_z < 2.5) | (data_df_z.isna() & readings.notna())
# Apply the boolean mask directly so the selection follows clean_df's own index.
clean_df = clean_df[within_limit.all(axis=1)]

In [None]:
# Redraw the per-channel violin plots on a 2x4 grid from the filtered clean_df.
plt.figure(figsize=(15, 6))
for position, value in enumerate(values, start=1):
    axis = plt.subplot(2, 4, position)
    axis.set_title(value)
    axis.violinplot(clean_df[value])

# Extra vertical spacing between the two rows of panels.
plt.subplots_adjust(hspace=1)

### Correlation Testing

In [None]:
# Pairwise correlation between the sensor channels.
# NOTE(review): this reads data_df, i.e. the frame before KNN imputation and
# outlier filtering, not clean_df; confirm that the raw data is intended.
corr_matrix = data_df[values].corr()

# Render the matrix as an annotated heatmap.
plt.figure(figsize=(10, 10))
heatmap_axis = sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu")
heatmap_axis.set_title("Correlation Matrix")
plt.show()

### Skewness Correction

In [None]:
# Kernel density estimate of every sensor channel, one panel each on a 2x4 grid.
plt.figure(figsize=(15, 6))

for position, value in enumerate(values, start=1):
    axis = plt.subplot(2, 4, position)
    data = clean_df[value]
    sns.kdeplot(data, ax=axis)
    axis.set_title(value)
    axis.set_xlabel("Values")
    axis.set_ylabel("Density")
    # Switch the current axes back to the first panel after each channel.
    plt.subplot(2, 4, 1)

plt.subplots_adjust(hspace=1)
# Convert the matplotlib figure into a plotly figure; being the last
# expression, it is what the cell displays.
tls.mpl_to_plotly(plt.gcf())

In [None]:
# Work on a deep copy so clean_df keeps the untransformed readings.
clean_df_log = clean_df.copy()
# Replace every sensor channel with its natural logarithm.
# NOTE(review): np.log gives -inf for 0 and NaN for negative inputs; confirm
# that all readings are strictly positive.
clean_df_log[values] = clean_df[values].apply(np.log)

In [None]:
# Kernel density estimate of every log-transformed channel, one panel each on a 2x4 grid.
plt.figure(figsize=(15, 6))

for position, value in enumerate(values, start=1):
    axis = plt.subplot(2, 4, position)
    data = clean_df_log[value]
    sns.kdeplot(data, ax=axis)
    axis.set_title(value)
    axis.set_xlabel("Values")
    axis.set_ylabel("Density")
    # Switch the current axes back to the first panel after each channel.
    plt.subplot(2, 4, 1)

plt.subplots_adjust(hspace=1)
# Convert the matplotlib figure into a plotly figure; being the last
# expression, it is what the cell displays.
tls.mpl_to_plotly(plt.gcf())

In [None]:
# Snapshot the cleaned frames (raw scale and log scale) as deep copies.
final_df = clean_df.copy()
final_df_log = clean_df_log.copy()

# Write each snapshot to its CSV file in the working directory.
for frame, csv_path in ((final_df, 'cleaned_data.csv'), (final_df_log, 'cleaned_log_data.csv')):
    frame.to_csv(csv_path)