### Configuração do Ambiente

In [1]:
from google.colab import files
files.upload()  # selecione o arquivo kaggle.json

!pip install kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


### Transformação do dataset de JSON para CSV

In [2]:
# Download the dataset archive with the Kaggle CLI and extract it (--unzip).
!kaggle datasets download -d rohithmahadevan/distance-calibration-measurement-in-toy-train --unzip

Dataset URL: https://www.kaggle.com/datasets/rohithmahadevan/distance-calibration-measurement-in-toy-train
License(s): unknown
Downloading distance-calibration-measurement-in-toy-train.zip to /content
  0% 0.00/11.1k [00:00<?, ?B/s]
100% 11.1k/11.1k [00:00<00:00, 39.7MB/s]


In [6]:
import json
import pandas as pd
from pandas import json_normalize

path = '/content/distance-measurement-101-default-rtdb-export exp data 2.json'

# Explicit encoding so the read does not depend on the platform default.
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)

sensor = data['sensor']

# json_normalize flattens the nested export into one wide row, with one
# dot-separated column per leaf value.
df = json_normalize(sensor)
print("df shape:", df.shape)

# Strip only a *leading* "us." from each column name. A blanket
# str.replace("us.", "") would also delete the text inside a record id that
# happens to end in "us" (e.g. "us.-Abcus.Distance" -> "-AbcDistance"), which
# removes the dot separating the id from the field name.
SENSOR_PREFIX = "us."
df.columns = [
    c[len(SENSOR_PREFIX):] if c.startswith(SENSOR_PREFIX) else c
    for c in df.columns
]
list(df.columns[:20])

df shape: (1, 1296)


['-MvCtiXks45XE6_NIkCM.Distance',
 '-MvCtiXks45XE6_NIkCM.Time',
 '-MvCtiXks45XE6_NIkCM.value',
 '-MvCtizSFs2_Eq6U0RzG.Distance',
 '-MvCtizSFs2_Eq6U0RzG.Time',
 '-MvCtizSFs2_Eq6U0RzG.value',
 '-MvCtjQtd0lt1nUSukR2.Distance',
 '-MvCtjQtd0lt1nUSukR2.Time',
 '-MvCtjQtd0lt1nUSukR2.value',
 '-MvCtjzrXWW4kcxoEP1N.Distance',
 '-MvCtjzrXWW4kcxoEP1N.Time',
 '-MvCtjzrXWW4kcxoEP1N.value',
 '-MvCtkRinw4MnTVpseoP.Distance',
 '-MvCtkRinw4MnTVpseoP.Time',
 '-MvCtkRinw4MnTVpseoP.value',
 '-MvCtktYyoY4vYtmbIr2.Distance',
 '-MvCtktYyoY4vYtmbIr2.Time',
 '-MvCtktYyoY4vYtmbIr2.value',
 '-MvCtlKinkow1wEwuTaO.Distance',
 '-MvCtlKinkow1wEwuTaO.Time']

In [7]:
import numpy as np
import pandas as pd

# Allow long cell values to be shown without truncation in rich displays.
pd.set_option("display.max_colwidth", 200)

cols = list(df.columns)
print("Shape:", df.shape)
print("Número de colunas:", len(cols))
print("\nExibindo as primeiras 40 colunas e seus valores (preview):")

# Value of the first (index label 0) row for each of the first 40 columns.
preview = {}
for name in cols[:40]:
    preview[name] = df.loc[0, name]

# A length is reported only for these container/string types.
SIZED_TYPES = (str, list, dict, tuple)
for name, cell in preview.items():
    if isinstance(cell, SIZED_TYPES):
        size = len(cell)
    else:
        size = None
    print(f" - {name} | type={type(cell).__name__} | len={size} | sample={str(cell)[:120]}")


Shape: (1, 1296)
Número de colunas: 1296

Exibindo as primeiras 40 colunas e seus valores (preview):
 - -MvCtiXks45XE6_NIkCM.Distance | type=float64 | len=None | sample=59.13
 - -MvCtiXks45XE6_NIkCM.Time | type=str | len=26 | sample=2022-02-06 12:49:28.505962
 - -MvCtiXks45XE6_NIkCM.value | type=int64 | len=None | sample=0
 - -MvCtizSFs2_Eq6U0RzG.Distance | type=float64 | len=None | sample=58.75
 - -MvCtizSFs2_Eq6U0RzG.Time | type=str | len=26 | sample=2022-02-06 12:49:30.327585
 - -MvCtizSFs2_Eq6U0RzG.value | type=int64 | len=None | sample=0
 - -MvCtjQtd0lt1nUSukR2.Distance | type=float64 | len=None | sample=59.5
 - -MvCtjQtd0lt1nUSukR2.Time | type=str | len=26 | sample=2022-02-06 12:49:32.171886
 - -MvCtjQtd0lt1nUSukR2.value | type=int64 | len=None | sample=0
 - -MvCtjzrXWW4kcxoEP1N.Distance | type=float64 | len=None | sample=31.89
 - -MvCtjzrXWW4kcxoEP1N.Time | type=str | len=26 | sample=2022-02-06 12:49:34.485819
 - -MvCtjzrXWW4kcxoEP1N.value | type=int64 | len=None | sample=1
 - -

In [8]:
import re

import pandas as pd

# Column names are matched as "<record id>.<field>", where the id contains no
# dot and the field is one of Distance / Time / value.
pattern = re.compile(r"^(?P<id>[^.]+)\.(?P<field>Distance|Time|value)$")

# Regroup the single wide row into {record id: {field: value}}.
row = df.iloc[0]
records = {}
for col, val in row.items():
    hit = pattern.match(col)
    if hit is None:
        continue  # column does not follow the "<id>.<field>" layout
    records.setdefault(hit.group("id"), {})[hit.group("field")] = val

# One dict per record; a field missing from the export becomes None.
data_list = [
    {
        "id": rec_id,
        "distance": fields.get("Distance"),
        "time": fields.get("Time"),
        "value": fields.get("value"),
    }
    for rec_id, fields in records.items()
]

# Coerce dtypes (unparseable entries become NaN / NaT) and order by timestamp.
long_df = (
    pd.DataFrame(data_list)
    .assign(
        distance=lambda d: pd.to_numeric(d["distance"], errors="coerce"),
        value=lambda d: pd.to_numeric(d["value"], errors="coerce"),
        time=lambda d: pd.to_datetime(d["time"], errors="coerce"),
    )
    .sort_values("time")
    .reset_index(drop=True)
)

print("Dataset reconstruído — shape:", long_df.shape)
display(long_df.head())

# Persist the long-format table without the positional index.
out = "/content/toytrain_dataset.csv"
long_df.to_csv(out, index=False)
print("Salvo em:", out)


Dataset reconstruído — shape: (432, 4)


Unnamed: 0,id,distance,time,value
0,-MvCtiXks45XE6_NIkCM,59.13,2022-02-06 12:49:28.505962,0
1,-MvCtizSFs2_Eq6U0RzG,58.75,2022-02-06 12:49:30.327585,0
2,-MvCtjQtd0lt1nUSukR2,59.5,2022-02-06 12:49:32.171886,0
3,-MvCtjzrXWW4kcxoEP1N,31.89,2022-02-06 12:49:34.485819,1
4,-MvCtkRinw4MnTVpseoP,59.12,2022-02-06 12:49:36.292045,0


Salvo em: /content/toytrain_dataset.csv


### Carregamento e preparação dos dados

In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Model inputs, in the column order handed to the classifier.
features = ['distance', 'distance_filt', 'dist_diff', 'dist_ddiff']

# Load the long-format CSV, order it by timestamp, derive the features and
# replace the raw 'value' column with an integer 'occupied' label.
df = (
    pd.read_csv('/content/toytrain_dataset.csv', parse_dates=['time'])
    .sort_values('time')
    .reset_index(drop=True)
    .assign(
        # Centered 5-sample rolling median: each value also depends on up to
        # two *later* samples.
        # NOTE(review): not reproducible on a live stream without delaying the
        # output by two samples -- confirm this matches the intended use.
        distance_filt=lambda d: d['distance'].rolling(5, center=True, min_periods=1).median(),
        # First and second differences of the raw distance; the leading NaNs
        # produced by diff() are replaced with 0.
        dist_diff=lambda d: d['distance'].diff().fillna(0),
        dist_ddiff=lambda d: d['dist_diff'].diff().fillna(0),
        occupied=lambda d: d['value'].astype(int),
    )
    .drop(columns=['value'])
)

X = df[features].fillna(0)
y = df['occupied']

# Shuffled 80/20 split that keeps the class ratio of y in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("===== Dataset =====", f"X shape total: {X.shape}", "", sep="\n")
print(
    "===== Train/Test Shapes =====",
    f"X_train: {X_train.shape}",
    f"X_test : {X_test.shape}",
    "",
    sep="\n",
)

===== Dataset =====
X shape total: (432, 4)

===== Train/Test Shapes =====
X_train: (345, 4)
X_test : (87, 4)



### Modelo RandomForest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Hyper-parameters gathered in one place; the seed is fixed so that the fit
# can be repeated.
rf_params = {
    "n_estimators": 300,
    "random_state": 42,
    "class_weight": "balanced",
}

clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Score the held-out split.
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("F1:", test_f1)
print("\nClassification Report:\n", test_report)
print("\nConfusion Matrix:\n", test_confusion)


Accuracy: 1.0
F1: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        72
           1       1.00      1.00      1.00        15

    accuracy                           1.00        87
   macro avg       1.00      1.00      1.00        87
weighted avg       1.00      1.00      1.00        87


Confusion Matrix:
 [[72  0]
 [ 0 15]]


### Importância das features

distance:       0.5672   ← MAIS IMPORTANTE

distance_filt:  0.0011   ← QUASE ZERO

dist_diff:      0.3321   ← MUITO IMPORTANTE

dist_ddiff:     0.0996   ← ALGUMA IMPORTÂNCIA

In [19]:
# Pair each feature name with the importance reported by the fitted forest
# and print one "name: value" line per feature, rounded to four decimals.
importances = clf.feature_importances_
importance_lines = [f"{feat}: {imp:.4f}" for feat, imp in zip(features, importances)]
for line in importance_lines:
    print(line)

distance: 0.5672
distance_filt: 0.0011
dist_diff: 0.3321
dist_ddiff: 0.0996


In [20]:
# Single-threshold sweep: predict "occupied" whenever the filtered distance is
# below a cut-off, and keep the cut-off with the highest F1.
# NOTE(review): the sweep uses the median-filtered column and scores on every
# row (train and test together) -- confirm this is the intended comparison.
dist_vals = df['distance_filt'].to_numpy()
labels = y.to_numpy()

# 200 evenly spaced candidate cut-offs spanning the observed range.
ths = np.linspace(dist_vals.min(), dist_vals.max(), 200)
scores = [f1_score(labels, (dist_vals < t).astype(int)) for t in ths]

# argmax returns the first maximum, i.e. the lowest cut-off among ties.
top = int(np.argmax(scores))
best = {'th': ths[top], 'f1': scores[top]}

print("Melhor threshold:", best['th'], "F1:", best['f1'])

Melhor threshold: 59.239045226130656 F1: 0.3023758099352052


### Exportação do modelo


In [21]:
import joblib
from google.colab import files

# The artifact path is defined once so that saving, downloading and the
# reload hint below cannot drift apart.
model_path = '/content/parking_model.pkl'

# Serialize the fitted classifier and hand the file to the browser.
joblib.dump(clf, model_path)
print("Modelo salvo em", model_path)
files.download(model_path)

# To restore the classifier later (only load files from a trusted source,
# since joblib/pickle files can execute code on load):
# clf = joblib.load(model_path)

Modelo salvo em /content/parking_model.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>