In [2]:
import pandas as pd

# Read data from TSV file using pandas read_csv()
smartthings_df = pd.read_csv('data/smartthings/smartthings.202210.tsv.gz', sep='\t')
smartthings_df


Unnamed: 0,loc,level,name,epoch,capability,attribute,value,unit
0,bathroom,1st,Bathroom (flower),1665346388,switch,switch,off,
1,bathroom,1st,Bathroom (flower),1665346402,switch,switch,on,
2,bathroom,1st,Bathroom (flower),1665347032,switch,switch,off,
3,bathroom,1st,Bathroom (flower),1665347091,switch,switch,on,
4,bathroom,1st,Bathroom (sink),1665347673,switch,switch,on,
...,...,...,...,...,...,...,...,...
10552,ground,ground,Kitchen (stairs),1667253741,relativeHumidityMeasurement,humidity,65,%
10553,ground,ground,Kitchen (button),1667256310,temperatureMeasurement,temperature,19,°C
10554,ground,ground,Kitchen (button),1667256612,temperatureMeasurement,temperature,19.1,°C
10555,kitchen,ground,Door (main),1667256875,signalStrength,lqi,202,


In [29]:
import glob
# Read the file paths from smartthings folder
folder_path = 'data/smartthings'
file_paths = glob.glob(os.path.join(folder_path, '*.tsv.gz'))


In [43]:
# Initialize an empty list to store dataframes
dfs = []
df=[]

# Read data from each file and append to the list
for i in range(0, len(file_paths)):
    df = pd.read_csv(file_paths[i], sep='\t')  # Replace with appropriate read function
    dfs.append(df)

# Concatenate dataframes vertically
combined_df = pd.concat(dfs, ignore_index=True)

# Now 'combined_df' contains data from all files
combined_df


Unnamed: 0,loc,level,name,epoch,capability,attribute,value,unit
0,bathroom,1st,Bathroom (sensor eye),1672527756,accelerationSensor,acceleration,active,
1,bathroom,1st,Bathroom (sensor eye),1672527786,accelerationSensor,acceleration,inactive,
2,bathroom,1st,Bathroom (sensor eye),1672527955,accelerationSensor,acceleration,active,
3,bathroom,1st,Bathroom (sensor eye),1672528016,accelerationSensor,acceleration,inactive,
4,bathroom,1st,Bathroom (sensor eye),1672528063,accelerationSensor,acceleration,active,
...,...,...,...,...,...,...,...,...
1182205,garden,ground,Garden air (sensor),1682891249,temperatureMeasurement,temperature,12.7,°C
1182206,bathroom,1st,Bathroom (sensor eye),1682891317,temperatureMeasurement,temperature,21,°C
1182207,attic,2nd,Attic (sensor),1682891786,temperatureMeasurement,temperature,21.4,°C
1182208,garden,ground,Garden air (sensor),1682891803,relativeHumidityMeasurement,humidity,72,%


In [44]:
# Check the duplicates
combined_df.duplicated().sum()

0

In [45]:
# confirm the primary key
duplicates_exist = combined_df[['name','epoch','attribute','value']].duplicated().any()
duplicates_exist

False

In [96]:
# transform the epoch to time , i.e. unix timestamp to datetime
import datetime

current_timestamp = combined_df['epoch']

def timealign(x):
    dt_obj = datetime.datetime.utcfromtimestamp(x)
    return dt_obj.strftime("%Y-%m-%d %H:%M")

time_list = []
for i in current_timestamp:
    time_list.append(timealign(i))

combined_df['time']= time_list
combined_df


  dt_obj = datetime.datetime.utcfromtimestamp(x)


Unnamed: 0,loc,level,name,epoch,capability,attribute,value,unit,time
0,bathroom,1st,Bathroom (sensor eye),1672527756,accelerationSensor,acceleration,active,,2022-12-31 23:02
1,bathroom,1st,Bathroom (sensor eye),1672527786,accelerationSensor,acceleration,inactive,,2022-12-31 23:03
2,bathroom,1st,Bathroom (sensor eye),1672527955,accelerationSensor,acceleration,active,,2022-12-31 23:05
3,bathroom,1st,Bathroom (sensor eye),1672528016,accelerationSensor,acceleration,inactive,,2022-12-31 23:06
4,bathroom,1st,Bathroom (sensor eye),1672528063,accelerationSensor,acceleration,active,,2022-12-31 23:07
...,...,...,...,...,...,...,...,...,...
1182205,garden,ground,Garden air (sensor),1682891249,temperatureMeasurement,temperature,12.7,°C,2023-04-30 21:47
1182206,bathroom,1st,Bathroom (sensor eye),1682891317,temperatureMeasurement,temperature,21,°C,2023-04-30 21:48
1182207,attic,2nd,Attic (sensor),1682891786,temperatureMeasurement,temperature,21.4,°C,2023-04-30 21:56
1182208,garden,ground,Garden air (sensor),1682891803,relativeHumidityMeasurement,humidity,72,%,2023-04-30 21:56


### several related terms
superkey: Any set of columns which has/will-have different values in each row.

candidate key: A minimal superkey; such a set of columns that when a column is removed then there are/will-be duplicate rows. A table might have several candidate keys.
- 'name','epoch','attribute','value'

primary key: A chosen candidate key. The column names of a table primary key are often shown underlined.

In [None]:
%pip install sqlalchemy_utils

In [9]:
import sqlalchemy as sa
import sqlalchemy.orm as orm
from sqlalchemy_utils import database_exists, create_database

In [10]:
engine = sa.create_engine("sqlite:///myhome.db",echo=False)

if not database_exists(engine.url):
    create_database(engine.url)

In [11]:
Base = orm.declarative_base()

class SmartThings(Base):
  __tablename__ = "smartthings"
  # these are notations
  name = sa.Column(sa.String(160),primary_key=True)
  time = sa.Column(sa.DateTime,primary_key=True)
  attribute= sa.Column(sa.String(160),primary_key=True)
  value = sa.Column(sa.String(160),primary_key=True)
  epoach = sa.Column(sa.Integer)
  loc = sa.Column(sa.String)
  level	= sa.Column(sa.String)
  capability = sa.Column(sa.String)
  unit = sa.Column(sa.String)
  

In [12]:
Base.metadata.create_all(engine)